summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/stable/sysfs-driver-mlxreg-io36
-rw-r--r--Documentation/ABI/testing/ima_policy45
-rw-r--r--Documentation/ABI/testing/securityfs-secrets-coco51
-rw-r--r--Documentation/ABI/testing/sysfs-class-regulator81
-rw-r--r--Documentation/ABI/testing/sysfs-driver-xen-blkback4
-rw-r--r--Documentation/ABI/testing/sysfs-driver-xen-blkfront2
-rw-r--r--Documentation/ABI/testing/sysfs-platform-intel-ifs39
-rw-r--r--Documentation/RCU/Design/Data-Structures/Data-Structures.rst2
-rw-r--r--Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst2
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.rst36
-rw-r--r--Documentation/RCU/arrayRCU.rst4
-rw-r--r--Documentation/RCU/checklist.rst9
-rw-r--r--Documentation/RCU/rcu.rst13
-rw-r--r--Documentation/RCU/rculist_nulls.rst2
-rw-r--r--Documentation/RCU/stallwarn.rst20
-rw-r--r--Documentation/RCU/whatisRCU.rst18
-rw-r--r--Documentation/accounting/psi.rst9
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst2
-rw-r--r--Documentation/admin-guide/kernel-parameters.rst11
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt541
-rw-r--r--Documentation/admin-guide/media/vimc.dot14
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst15
-rw-r--r--Documentation/arm64/booting.rst10
-rw-r--r--Documentation/arm64/elf_hwcaps.rst33
-rw-r--r--Documentation/arm64/index.rst1
-rw-r--r--Documentation/arm64/sme.rst428
-rw-r--r--Documentation/arm64/sve.rst70
-rw-r--r--Documentation/cdrom/cdrom-standard.rst10
-rw-r--r--Documentation/cdrom/ide-cd.rst538
-rw-r--r--Documentation/cdrom/index.rst1
-rw-r--r--Documentation/core-api/index.rst2
-rw-r--r--Documentation/core-api/printk-index.rst137
-rw-r--r--Documentation/core-api/timekeeping.rst1
-rw-r--r--Documentation/core-api/watch_queue.rst (renamed from Documentation/watch_queue.rst)0
-rw-r--r--Documentation/dev-tools/ktap.rst18
-rw-r--r--Documentation/dev-tools/kunit/api/index.rst5
-rw-r--r--Documentation/dev-tools/kunit/api/resource.rst13
-rw-r--r--Documentation/dev-tools/kunit/architecture.rst2
-rw-r--r--Documentation/dev-tools/kunit/running_tips.rst3
-rw-r--r--Documentation/dev-tools/kunit/usage.rst19
-rw-r--r--Documentation/dev-tools/testing-overview.rst63
-rw-r--r--Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml1
-rw-r--r--Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt212
-rw-r--r--Documentation/devicetree/bindings/hwmon/adt7475.yaml22
-rw-r--r--Documentation/devicetree/bindings/hwmon/lm75.yaml1
-rw-r--r--Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml53
-rw-r--r--Documentation/devicetree/bindings/hwmon/national,lm90.yaml20
-rw-r--r--Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml57
-rw-r--r--Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml105
-rw-r--r--Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml2
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml8
-rw-r--r--Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.txt9
-rw-r--r--Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.yaml41
-rw-r--r--Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml9
-rw-r--r--Documentation/devicetree/bindings/media/mediatek,vcodec-encoder.yaml3
-rw-r--r--Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/microchip,xisc.yaml2
-rw-r--r--Documentation/devicetree/bindings/media/rockchip,vdec.yaml4
-rw-r--r--Documentation/devicetree/bindings/media/rockchip-vpu.yaml1
-rw-r--r--Documentation/devicetree/bindings/media/video-interfaces.yaml1
-rw-r--r--Documentation/devicetree/bindings/memory-controllers/fsl/fsl,ddr.yaml6
-rw-r--r--Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml384
-rw-r--r--Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml20
-rw-r--r--Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml31
-rw-r--r--Documentation/devicetree/bindings/mmc/marvell,dove-sdhci.yaml44
-rw-r--r--Documentation/devicetree/bindings/mmc/marvell,orion-sdio.yaml44
-rw-r--r--Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt173
-rw-r--r--Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml275
-rw-r--r--Documentation/devicetree/bindings/mmc/mmc-controller.yaml5
-rw-r--r--Documentation/devicetree/bindings/mmc/mtk-sd.yaml15
-rw-r--r--Documentation/devicetree/bindings/mmc/orion-sdio.txt16
-rw-r--r--Documentation/devicetree/bindings/mmc/sdhci-am654.yaml7
-rw-r--r--Documentation/devicetree/bindings/mmc/sdhci-dove.txt14
-rw-r--r--Documentation/devicetree/bindings/mmc/sdhci-msm.txt123
-rw-r--r--Documentation/devicetree/bindings/mmc/sdhci-msm.yaml194
-rw-r--r--Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml1
-rw-r--r--Documentation/devicetree/bindings/mtd/aspeed-smc.txt51
-rw-r--r--Documentation/devicetree/bindings/mtd/elm.txt16
-rw-r--r--Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml8
-rw-r--r--Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml55
-rw-r--r--Documentation/devicetree/bindings/mtd/renesas-nandc.yaml5
-rw-r--r--Documentation/devicetree/bindings/mtd/ti,elm.yaml72
-rw-r--r--Documentation/devicetree/bindings/perf/arm,cmn.yaml2
-rw-r--r--Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt6
-rw-r--r--Documentation/devicetree/bindings/regulator/mt6315-regulator.yaml2
-rw-r--r--Documentation/devicetree/bindings/regulator/mt6358-regulator.txt22
-rw-r--r--Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml11
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml262
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt4801-regulator.yaml21
-rw-r--r--Documentation/devicetree/bindings/regulator/richtek,rt5759-regulator.yaml90
-rw-r--r--Documentation/devicetree/bindings/regulator/siliconmitus,sm5703-regulator.yaml49
-rw-r--r--Documentation/devicetree/bindings/regulator/socionext,uniphier-regulator.yaml57
-rw-r--r--Documentation/devicetree/bindings/reserved-memory/phram.yaml47
-rw-r--r--Documentation/devicetree/bindings/spi/aspeed,ast2600-fmc.yaml82
-rw-r--r--Documentation/devicetree/bindings/spi/ingenic,spi.yaml3
-rw-r--r--Documentation/devicetree/bindings/spi/mediatek,spi-mt65xx.yaml4
-rw-r--r--Documentation/devicetree/bindings/spi/mediatek,spi-mtk-snfi.yaml88
-rw-r--r--Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml1
-rw-r--r--Documentation/devicetree/bindings/spi/renesas,rspi.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-lmh.yaml1
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-spmi-adc-tm5.yaml110
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-tsens.yaml5
-rw-r--r--Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml2
-rw-r--r--Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml63
-rw-r--r--Documentation/devicetree/bindings/trivial-devices.yaml4
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.yaml2
-rw-r--r--Documentation/doc-guide/contributing.rst5
-rw-r--r--Documentation/doc-guide/kernel-doc.rst2
-rw-r--r--Documentation/doc-guide/sphinx.rst5
-rw-r--r--Documentation/dontdiff1
-rw-r--r--Documentation/driver-api/driver-model/devres.rst2
-rw-r--r--Documentation/driver-api/gpio/driver.rst175
-rw-r--r--Documentation/driver-api/libata.rst11
-rw-r--r--Documentation/driver-api/media/cec-core.rst13
-rw-r--r--Documentation/driver-api/media/mc-core.rst13
-rw-r--r--Documentation/driver-api/media/v4l2-subdev.rst69
-rw-r--r--Documentation/driver-api/thermal/intel_dptf.rst2
-rw-r--r--Documentation/fault-injection/fault-injection.rst14
-rw-r--r--Documentation/features/debug/debug-vm-pgtable/arch-support.txt2
-rw-r--r--Documentation/features/time/context-tracking/arch-support.txt2
-rw-r--r--Documentation/features/time/virt-cpuacct/arch-support.txt2
-rw-r--r--Documentation/filesystems/caching/cachefiles.rst178
-rw-r--r--Documentation/filesystems/caching/netfs-api.rst4
-rw-r--r--Documentation/filesystems/fscrypt.rst2
-rw-r--r--Documentation/filesystems/fsverity.rst37
-rw-r--r--Documentation/filesystems/idmappings.rst5
-rw-r--r--Documentation/filesystems/locking.rst36
-rw-r--r--Documentation/filesystems/netfs_library.rst9
-rw-r--r--Documentation/filesystems/porting.rst2
-rw-r--r--Documentation/filesystems/proc.rst92
-rw-r--r--Documentation/filesystems/vfs.rst86
-rw-r--r--Documentation/filesystems/zonefs.rst52
-rw-r--r--Documentation/firmware-guide/acpi/enumeration.rst3
-rw-r--r--Documentation/hwmon/aquacomputer_d5next.rst7
-rw-r--r--Documentation/hwmon/asus_ec_sensors.rst25
-rw-r--r--Documentation/hwmon/dell-smm-hwmon.rst9
-rw-r--r--Documentation/hwmon/hwmon-kernel-api.rst18
-rw-r--r--Documentation/hwmon/index.rst2
-rw-r--r--Documentation/hwmon/lan966x.rst40
-rw-r--r--Documentation/hwmon/max16601.rst8
-rw-r--r--Documentation/hwmon/xdpe152c4.rst118
-rw-r--r--Documentation/ide/ChangeLog.ide-cd.1994-2004268
-rw-r--r--Documentation/ide/ChangeLog.ide-floppy.1996-200263
-rw-r--r--Documentation/ide/ChangeLog.ide-tape.1995-2002257
-rw-r--r--Documentation/ide/changelogs.rst17
-rw-r--r--Documentation/ide/ide-tape.rst68
-rw-r--r--Documentation/ide/ide.rst265
-rw-r--r--Documentation/ide/index.rst21
-rw-r--r--Documentation/ide/warm-plug-howto.rst18
-rw-r--r--Documentation/index.rst2
-rw-r--r--Documentation/input/devices/atarikbd.rst4
-rw-r--r--Documentation/input/devices/ntrig.rst2
-rw-r--r--Documentation/kbuild/reproducible-builds.rst8
-rw-r--r--Documentation/kernel-hacking/hacking.rst36
-rw-r--r--Documentation/kernel-hacking/locking.rst5
-rw-r--r--Documentation/power/energy-model.rst24
-rw-r--r--Documentation/process/3.Early-stage.rst9
-rw-r--r--Documentation/process/changes.rst8
-rw-r--r--Documentation/process/maintainer-tip.rst14
-rw-r--r--Documentation/process/submitting-patches.rst14
-rw-r--r--Documentation/scheduler/sched-stats.rst8
-rw-r--r--Documentation/security/IMA-templates.rst11
-rw-r--r--Documentation/security/index.rst1
-rw-r--r--Documentation/security/keys/trusted-encrypted.rst60
-rw-r--r--Documentation/security/landlock.rst17
-rw-r--r--Documentation/security/secrets/coco.rst103
-rw-r--r--Documentation/security/secrets/index.rst9
-rw-r--r--Documentation/sphinx/kerneldoc-preamble.sty14
-rw-r--r--Documentation/tools/rtla/common_appendix.rst3
-rw-r--r--Documentation/translations/ja_JP/SubmittingPatches36
-rw-r--r--Documentation/translations/ja_JP/howto.rst44
-rw-r--r--Documentation/translations/ja_JP/index.rst2
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst4
-rw-r--r--Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst291
-rw-r--r--Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst167
-rw-r--r--Documentation/translations/zh_CN/dev-tools/index.rst2
-rw-r--r--Documentation/translations/zh_CN/devicetree/usage-model.rst8
-rw-r--r--Documentation/translations/zh_CN/index.rst2
-rw-r--r--Documentation/translations/zh_CN/locking/index.rst42
-rw-r--r--Documentation/translations/zh_CN/locking/spinlocks.rst149
-rw-r--r--Documentation/translations/zh_CN/process/howto.rst2
-rw-r--r--Documentation/translations/zh_CN/scheduler/index.rst2
-rw-r--r--Documentation/translations/zh_CN/scheduler/sched-debug.rst51
-rw-r--r--Documentation/translations/zh_CN/scheduler/schedutil.rst165
-rw-r--r--Documentation/translations/zh_CN/vm/damon/design.rst7
-rw-r--r--Documentation/translations/zh_CN/vm/frontswap.rst196
-rw-r--r--Documentation/translations/zh_CN/vm/hmm.rst361
-rw-r--r--Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst436
-rw-r--r--Documentation/translations/zh_CN/vm/hwpoison.rst166
-rw-r--r--Documentation/translations/zh_CN/vm/index.rst29
-rw-r--r--Documentation/translations/zh_CN/vm/memory-model.rst135
-rw-r--r--Documentation/translations/zh_CN/vm/mmu_notifier.rst97
-rw-r--r--Documentation/translations/zh_CN/vm/numa.rst101
-rw-r--r--Documentation/translations/zh_CN/vm/overcommit-accounting.rst86
-rw-r--r--Documentation/translations/zh_CN/vm/page_frags.rst38
-rw-r--r--Documentation/translations/zh_CN/vm/page_owner.rst116
-rw-r--r--Documentation/translations/zh_CN/vm/page_table_check.rst56
-rw-r--r--Documentation/translations/zh_CN/vm/remap_file_pages.rst32
-rw-r--r--Documentation/translations/zh_CN/vm/split_page_table_lock.rst96
-rw-r--r--Documentation/translations/zh_CN/vm/z3fold.rst31
-rw-r--r--Documentation/translations/zh_CN/vm/zsmalloc.rst78
-rw-r--r--Documentation/userspace-api/ioctl/cdrom.rst6
-rw-r--r--Documentation/userspace-api/landlock.rst180
-rw-r--r--Documentation/userspace-api/media/drivers/uvcvideo.rst2
-rw-r--r--Documentation/userspace-api/media/mediactl/media-controller-model.rst6
-rw-r--r--Documentation/userspace-api/media/mediactl/media-types.rst17
-rw-r--r--Documentation/userspace-api/media/v4l/dev-decoder.rst9
-rw-r--r--Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst10
-rw-r--r--Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst22
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-reserved.rst19
-rw-r--r--Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst14
-rw-r--r--Documentation/userspace-api/media/v4l/vidioc-streamon.rst3
-rw-r--r--Documentation/userspace-api/seccomp_filter.rst10
-rw-r--r--Documentation/virt/coco/sev-guest.rst155
-rw-r--r--Documentation/virt/index.rst1
-rw-r--r--Documentation/virt/kvm/api.rst2
-rw-r--r--Documentation/vm/arch_pgtable_helpers.rst10
-rw-r--r--Documentation/vm/bootmem.rst5
-rw-r--r--Documentation/vm/index.rst40
-rw-r--r--Documentation/vm/oom.rst5
-rw-r--r--Documentation/vm/page_allocation.rst5
-rw-r--r--Documentation/vm/page_cache.rst5
-rw-r--r--Documentation/vm/page_reclaim.rst5
-rw-r--r--Documentation/vm/page_tables.rst5
-rw-r--r--Documentation/vm/physical_memory.rst5
-rw-r--r--Documentation/vm/process_addrs.rst5
-rw-r--r--Documentation/vm/shmfs.rst5
-rw-r--r--Documentation/vm/slab.rst5
-rw-r--r--Documentation/vm/slub.rst64
-rw-r--r--Documentation/vm/swap.rst5
-rw-r--r--Documentation/vm/vmalloc.rst5
-rw-r--r--Documentation/w1/slaves/w1_therm.rst9
-rw-r--r--Documentation/x86/cpuinfo.rst5
-rw-r--r--Documentation/x86/exception-tables.rst23
-rw-r--r--Documentation/x86/ifs.rst2
-rw-r--r--Documentation/x86/index.rst4
-rw-r--r--Documentation/x86/intel-iommu.rst115
-rw-r--r--Documentation/x86/iommu.rst151
-rw-r--r--Documentation/x86/tdx.rst218
-rw-r--r--Documentation/x86/x86_64/boot-options.rst23
-rw-r--r--Documentation/x86/zero-page.rst2
241 files changed, 8854 insertions, 3096 deletions
diff --git a/Documentation/ABI/stable/sysfs-driver-mlxreg-io b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
index 12c3f895cd2f..b312242d4f40 100644
--- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io
+++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
@@ -467,3 +467,39 @@ Description: These files provide the maximum powered required for line card
feeding and line card configuration Id.
The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/phy_reset
+Date: May 2022
+KernelVersion: 5.19
+Contact: Vadim Pasternak <vadimp@mellanox.com>
+Description: This file allows resetting PHY 88E1548 by setting the attribute
+ to 0 when the PHY exhibits abnormal behavior.
+ Expected behavior:
+ When 1 is written to phy_reset, all 88E1548 PHYs are released
+ from the reset state; when 0 is written, they are held in reset.
+
+ The files are read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/mac_reset
+Date: May 2022
+KernelVersion: 5.19
+Contact: Vadim Pasternak <vadimp@mellanox.com>
+Description: This file allows resetting ASIC MT52132 by setting the attribute
+ to 0 when the ASIC exhibits abnormal behavior.
+ Expected behavior:
+ When 1 is written to mac_reset, the ASIC MT52132 is released
+ from the reset state; when 0 is written, it is held in reset.
+
+ The files are read/write.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/qsfp_pwr_good
+Date: May 2022
+KernelVersion: 5.19
+Contact: Vadim Pasternak <vadimp@mellanox.com>
+Description: This file shows QSFP ports power status. The value is set to 0
+ when at least one QSFP port is plugged. The value is set to 1 when
+ no QSFP ports are plugged.
+ The possible values are:
+ 0 - Power good, 1 - Not power good.
+
+ The files are read only.
diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index 839fab811b18..db17fc8a0c9f 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -27,8 +27,9 @@ Description:
[fowner=] [fgroup=]]
lsm: [[subj_user=] [subj_role=] [subj_type=]
[obj_user=] [obj_role=] [obj_type=]]
- option: [[appraise_type=]] [template=] [permit_directio]
- [appraise_flag=] [appraise_algos=] [keyrings=]
+ option: [digest_type=] [template=] [permit_directio]
+ [appraise_type=] [appraise_flag=]
+ [appraise_algos=] [keyrings=]
base:
func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
[FIRMWARE_CHECK]
@@ -47,10 +48,21 @@ Description:
fgroup:= decimal value
lsm: are LSM specific
option:
- appraise_type:= [imasig] [imasig|modsig]
+ appraise_type:= [imasig] | [imasig|modsig] | [sigv3]
+ where 'imasig' is the original or the signature
+ format v2.
+ where 'modsig' is an appended signature,
+ where 'sigv3' is the signature format v3. (Currently
+ limited to fsverity digest based signatures
+ stored in security.ima xattr. Requires
+ specifying "digest_type=verity" first.)
+
appraise_flag:= [check_blacklist]
Currently, blacklist check is only for files signed with appended
signature.
+ digest_type:= verity
+ Require fs-verity's file digest instead of the
+ regular IMA file hash.
keyrings:= list of keyrings
(eg, .builtin_trusted_keys|.ima). Only valid
when action is "measure" and func is KEY_CHECK.
@@ -149,3 +161,30 @@ Description:
security.ima xattr of a file:
appraise func=SETXATTR_CHECK appraise_algos=sha256,sha384,sha512
+
+ Example of a 'measure' rule requiring fs-verity's digests
+ with indication of type of digest in the measurement list.
+
+ measure func=FILE_CHECK digest_type=verity \
+ template=ima-ngv2
+
+ Example of 'measure' and 'appraise' rules requiring fs-verity
+ signatures (format version 3) stored in security.ima xattr.
+
+ The 'measure' rule specifies the 'ima-sigv3' template option,
+ which includes the indication of type of digest and the file
+ signature in the measurement list.
+
+ measure func=BPRM_CHECK digest_type=verity \
+ template=ima-sigv3
+
+
+ The 'appraise' rule specifies the type and signature format
+ version (sigv3) required.
+
+ appraise func=BPRM_CHECK digest_type=verity \
+ appraise_type=sigv3
+
+ All of these policy rules could, for example, be constrained
+ either based on a filesystem's UUID (fsuuid) or based on LSM
+ labels.
diff --git a/Documentation/ABI/testing/securityfs-secrets-coco b/Documentation/ABI/testing/securityfs-secrets-coco
new file mode 100644
index 000000000000..f2b6909155f9
--- /dev/null
+++ b/Documentation/ABI/testing/securityfs-secrets-coco
@@ -0,0 +1,51 @@
+What: security/secrets/coco
+Date: February 2022
+Contact: Dov Murik <dovmurik@linux.ibm.com>
+Description:
+ Exposes confidential computing (coco) EFI secrets to
+ userspace via securityfs.
+
+ EFI can declare a memory area used by confidential computing
+ platforms (such as AMD SEV and SEV-ES) for secret injection by
+ the Guest Owner during VM's launch. The secrets are encrypted
+ by the Guest Owner and decrypted inside the trusted enclave,
+ and therefore are not readable by the untrusted host.
+
+ The efi_secret module exposes the secrets to userspace. Each
+ secret appears as a file under <securityfs>/secrets/coco,
+ where the filename is the GUID of the entry in the secrets
+ table. This module is loaded automatically by the EFI driver
+ if the EFI secret area is populated.
+
+ Two operations are supported for the files: read and unlink.
+ Reading the file returns the content of the secret entry.
+ Unlinking the file overwrites the secret data with zeroes and
+ removes the entry from the filesystem. A secret cannot be read
+ after it has been unlinked.
+
+ For example, listing the available secrets::
+
+ # modprobe efi_secret
+ # ls -l /sys/kernel/security/secrets/coco
+ -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b
+ -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6
+ -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2
+ -r--r----- 1 root root 0 Jun 28 11:54 e6f5a162-d67f-4750-a67c-5d065f2a9910
+
+ Reading the secret data by reading a file::
+
+ # cat /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910
+ the-content-of-the-secret-data
+
+ Wiping a secret by unlinking a file::
+
+ # rm /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910
+ # ls -l /sys/kernel/security/secrets/coco
+ -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b
+ -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6
+ -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2
+
+ Note: The binary format of the secrets table injected by the
+ Guest Owner is described in
+ drivers/virt/coco/efi_secret/efi_secret.c under "Structure of
+ the EFI secret area".
diff --git a/Documentation/ABI/testing/sysfs-class-regulator b/Documentation/ABI/testing/sysfs-class-regulator
index 8516f08806dd..475b9a372657 100644
--- a/Documentation/ABI/testing/sysfs-class-regulator
+++ b/Documentation/ABI/testing/sysfs-class-regulator
@@ -370,3 +370,84 @@ Description:
'unknown' means software cannot determine the state, or
the reported state is invalid.
+
+What: /sys/class/regulator/.../under_voltage
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ under_voltage. This indicates if the device reports an
+ under-voltage fault (1) or not (0).
+
+What: /sys/class/regulator/.../over_current
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_current. This indicates if the device reports an
+ over-current fault (1) or not (0).
+
+What: /sys/class/regulator/.../regulation_out
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ regulation_out. This indicates if the device reports an
+ out-of-regulation fault (1) or not (0).
+
+What: /sys/class/regulator/.../fail
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ fail. This indicates if the device reports an output failure
+ (1) or not (0).
+
+What: /sys/class/regulator/.../over_temp
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_temp. This indicates if the device reports an
+ over-temperature fault (1) or not (0).
+
+What: /sys/class/regulator/.../under_voltage_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ under_voltage_warn. This indicates if the device reports an
+ under-voltage warning (1) or not (0).
+
+What: /sys/class/regulator/.../over_current_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_current_warn. This indicates if the device reports an
+ over-current warning (1) or not (0).
+
+What: /sys/class/regulator/.../over_voltage_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_voltage_warn. This indicates if the device reports an
+ over-voltage warning (1) or not (0).
+
+What: /sys/class/regulator/.../over_temp_warn
+Date: April 2022
+KernelVersion: 5.18
+Contact: Zev Weiss <zev@bewilderbeest.net>
+Description:
+ Some regulator directories will contain a field called
+ over_temp_warn. This indicates if the device reports an
+ over-temperature warning (1) or not (0).
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
index a74dfe52dd76..7faf719af165 100644
--- a/Documentation/ABI/testing/sysfs-driver-xen-blkback
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -29,7 +29,7 @@ Description:
What: /sys/module/xen_blkback/parameters/buffer_squeeze_duration_ms
Date: December 2019
KernelVersion: 5.6
-Contact: SeongJae Park <sj@kernel.org>
+Contact: Maximilian Heyne <mheyne@amazon.de>
Description:
When memory pressure is reported to blkback this option
controls the duration in milliseconds that blkback will not
@@ -39,7 +39,7 @@ Description:
What: /sys/module/xen_blkback/parameters/feature_persistent
Date: September 2020
KernelVersion: 5.10
-Contact: SeongJae Park <sj@kernel.org>
+Contact: Maximilian Heyne <mheyne@amazon.de>
Description:
Whether to enable the persistent grants feature or not. Note
that this option only takes effect on newly created backends.
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
index 61fd173fabfe..7f646c58832e 100644
--- a/Documentation/ABI/testing/sysfs-driver-xen-blkfront
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
@@ -12,7 +12,7 @@ Description:
What: /sys/module/xen_blkfront/parameters/feature_persistent
Date: September 2020
KernelVersion: 5.10
-Contact: SeongJae Park <sj@kernel.org>
+Contact: Maximilian Heyne <mheyne@amazon.de>
Description:
Whether to enable the persistent grants feature or not. Note
that this option only takes effect on newly created frontends.
diff --git a/Documentation/ABI/testing/sysfs-platform-intel-ifs b/Documentation/ABI/testing/sysfs-platform-intel-ifs
new file mode 100644
index 000000000000..486d6d2ff8a0
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-intel-ifs
@@ -0,0 +1,39 @@
+What: /sys/devices/virtual/misc/intel_ifs_<N>/run_test
+Date: April 21 2022
+KernelVersion: 5.19
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Write <cpu#> to trigger IFS test for one online core.
+ Note that the test is per core. The cpu# can be
+ for any thread on the core. Running on one thread
+ completes the test for the core containing that thread.
+ Example: to test the core containing cpu5: echo 5 >
+ /sys/devices/virtual/misc/intel_ifs_<N>/run_test
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/status
+Date: April 21 2022
+KernelVersion: 5.19
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: The status of the last test. It can be one of "pass", "fail"
+ or "untested".
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/details
+Date: April 21 2022
+KernelVersion: 5.19
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Additional information regarding the last test. The details file reports
+ the hex value of the SCAN_STATUS MSR. Note that the error_code field
+ may contain driver defined software code not defined in the Intel SDM.
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/image_version
+Date: April 21 2022
+KernelVersion: 5.19
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Version (hexadecimal) of loaded IFS binary image. If no scan image
+ is loaded reports "none".
+
+What: /sys/devices/virtual/misc/intel_ifs_<N>/reload
+Date: April 21 2022
+KernelVersion: 5.19
+Contact: "Jithu Joseph" <jithu.joseph@intel.com>
+Description: Write "1" (or "y" or "Y") to reload the IFS image from
+ /lib/firmware/intel/ifs/ff-mm-ss.scan.
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
index f4efd6897b09..b34990c7c377 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
@@ -973,7 +973,7 @@ The ``->dynticks`` field counts the corresponding CPU's transitions to
and from either dyntick-idle or user mode, so that this counter has an
even value when the CPU is in dyntick-idle mode or user mode and an odd
value otherwise. The transitions to/from user mode need to be counted
-for user mode adaptive-ticks support (see timers/NO_HZ.txt).
+for user mode adaptive-ticks support (see Documentation/timers/no_hz.rst).
The ``->rcu_need_heavy_qs`` field is used to record the fact that the
RCU core code would really like to see a quiescent state from the
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
index 6f89cf1e567d..c9c957c85bac 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
@@ -406,7 +406,7 @@ In earlier implementations, the task requesting the expedited grace
period also drove it to completion. This straightforward approach had
the disadvantage of needing to account for POSIX signals sent to user
tasks, so more recent implemementations use the Linux kernel's
-`workqueues <https://www.kernel.org/doc/Documentation/core-api/workqueue.rst>`__.
+workqueues (see Documentation/core-api/workqueue.rst).
The requesting task still does counter snapshotting and funnel-lock
processing, but the task reaching the top of the funnel lock does a
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 45278e2974c0..04ed8bf27a0e 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -370,8 +370,8 @@ pointer fetched by rcu_dereference() may not be used outside of the
outermost RCU read-side critical section containing that
rcu_dereference(), unless protection of the corresponding data
element has been passed from RCU to some other synchronization
-mechanism, most commonly locking or `reference
-counting <https://www.kernel.org/doc/Documentation/RCU/rcuref.txt>`__.
+mechanism, most commonly locking or reference counting
+(see ../../rcuref.rst).
.. |high-quality implementation of C11 memory_order_consume [PDF]| replace:: high-quality implementation of C11 ``memory_order_consume`` [PDF]
.. _high-quality implementation of C11 memory_order_consume [PDF]: http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf
@@ -2654,6 +2654,38 @@ synchronize_rcu(), and rcu_barrier(), respectively. In
three APIs are therefore implemented by separate functions that check
for voluntary context switches.
+Tasks Rude RCU
+~~~~~~~~~~~~~~
+
+Some forms of tracing need to wait for all preemption-disabled regions
+of code running on any online CPU, including those executed when RCU is
+not watching. This means that synchronize_rcu() is insufficient, and
+Tasks Rude RCU must be used instead. This flavor of RCU does its work by
+forcing a workqueue to be scheduled on each online CPU, hence the "Rude"
+moniker. And this operation is considered to be quite rude by real-time
+workloads that don't want their ``nohz_full`` CPUs receiving IPIs and
+by battery-powered systems that don't want their idle CPUs to be awakened.
+
+The tasks-rude-RCU API is also reader-marking-free and thus quite compact,
+consisting of call_rcu_tasks_rude(), synchronize_rcu_tasks_rude(),
+and rcu_barrier_tasks_rude().
+
+Tasks Trace RCU
+~~~~~~~~~~~~~~~
+
+Some forms of tracing need to sleep in readers, but cannot tolerate
+SRCU's read-side overhead, which includes a full memory barrier in both
+srcu_read_lock() and srcu_read_unlock(). This need is handled by a
+Tasks Trace RCU that uses scheduler locking and IPIs to synchronize with
+readers. Real-time systems that cannot tolerate IPIs may build their
+kernels with ``CONFIG_TASKS_TRACE_RCU_READ_MB=y``, which avoids the IPIs at
+the expense of adding full memory barriers to the read-side primitives.
+
+The tasks-trace-RCU API is also reasonably compact,
+consisting of rcu_read_lock_trace(), rcu_read_unlock_trace(),
+rcu_read_lock_trace_held(), call_rcu_tasks_trace(),
+synchronize_rcu_tasks_trace(), and rcu_barrier_tasks_trace().
+
Possible Future Changes
-----------------------
diff --git a/Documentation/RCU/arrayRCU.rst b/Documentation/RCU/arrayRCU.rst
index 4051ea3871ef..a5f2ff8fc54c 100644
--- a/Documentation/RCU/arrayRCU.rst
+++ b/Documentation/RCU/arrayRCU.rst
@@ -33,8 +33,8 @@ Situation 1: Hash Tables
Hash tables are often implemented as an array, where each array entry
has a linked-list hash chain. Each hash chain can be protected by RCU
-as described in the listRCU.txt document. This approach also applies
-to other array-of-list situations, such as radix trees.
+as described in listRCU.rst. This approach also applies to other
+array-of-list situations, such as radix trees.
.. _static_arrays:
diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index f4545b7c9a63..42cc5d891bd2 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -140,8 +140,7 @@ over a rather long period of time, but improvements are always welcome!
prevents destructive compiler optimizations. However,
with a bit of devious creativity, it is possible to
mishandle the return value from rcu_dereference().
- Please see rcu_dereference.txt in this directory for
- more information.
+ Please see rcu_dereference.rst for more information.
The rcu_dereference() primitive is used by the
various "_rcu()" list-traversal primitives, such
@@ -151,7 +150,7 @@ over a rather long period of time, but improvements are always welcome!
primitives. This is particularly useful in code that
is common to readers and updaters. However, lockdep
will complain if you access rcu_dereference() outside
- of an RCU read-side critical section. See lockdep.txt
+ of an RCU read-side critical section. See lockdep.rst
to learn what to do about this.
Of course, neither rcu_dereference() nor the "_rcu()"
@@ -323,7 +322,7 @@ over a rather long period of time, but improvements are always welcome!
primitives when the update-side lock is held is that doing so
can be quite helpful in reducing code bloat when common code is
shared between readers and updaters. Additional primitives
- are provided for this case, as discussed in lockdep.txt.
+ are provided for this case, as discussed in lockdep.rst.
One exception to this rule is when data is only ever added to
the linked data structure, and is never removed during any
@@ -480,4 +479,4 @@ over a rather long period of time, but improvements are always welcome!
both rcu_barrier() and synchronize_rcu(), if necessary, using
something like workqueues to execute them concurrently.
- See rcubarrier.txt for more information.
+ See rcubarrier.rst for more information.
diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst
index 0e03c6ef3147..3cfe01ba9a49 100644
--- a/Documentation/RCU/rcu.rst
+++ b/Documentation/RCU/rcu.rst
@@ -10,9 +10,8 @@ A "grace period" must elapse between the two parts, and this grace period
must be long enough that any readers accessing the item being deleted have
since dropped their references. For example, an RCU-protected deletion
from a linked list would first remove the item from the list, wait for
-a grace period to elapse, then free the element. See the
-:ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` for more information on
-using RCU with linked lists.
+a grace period to elapse, then free the element. See listRCU.rst for more
+information on using RCU with linked lists.
Frequently Asked Questions
--------------------------
@@ -50,7 +49,7 @@ Frequently Asked Questions
- If I am running on a uniprocessor kernel, which can only do one
thing at a time, why should I wait for a grace period?
- See :ref:`Documentation/RCU/UP.rst <up_doc>` for more information.
+ See UP.rst for more information.
- How can I see where RCU is currently used in the Linux kernel?
@@ -64,13 +63,13 @@ Frequently Asked Questions
- What guidelines should I follow when writing code that uses RCU?
- See the checklist.txt file in this directory.
+ See checklist.rst.
- Why the name "RCU"?
"RCU" stands for "read-copy update".
- :ref:`Documentation/RCU/listRCU.rst <list_rcu_doc>` has more information on where
- this name came from, search for "read-copy update" to find it.
+ listRCU.rst has more information on where this name came from, search
+ for "read-copy update" to find it.
- I hear that RCU is patented? What is with that?
diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst
index a9fc774bc400..ca4692775ad4 100644
--- a/Documentation/RCU/rculist_nulls.rst
+++ b/Documentation/RCU/rculist_nulls.rst
@@ -8,7 +8,7 @@ This section describes how to use hlist_nulls to
protect read-mostly linked lists and
objects using SLAB_TYPESAFE_BY_RCU allocations.
-Please read the basics in Documentation/RCU/listRCU.rst
+Please read the basics in listRCU.rst.
Using 'nulls'
=============
diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index 78404625bad2..794837eb519b 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -162,6 +162,26 @@ CONFIG_RCU_CPU_STALL_TIMEOUT
Stall-warning messages may be enabled and disabled completely via
/sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
+CONFIG_RCU_EXP_CPU_STALL_TIMEOUT
+--------------------------------
+
+ Same as the CONFIG_RCU_CPU_STALL_TIMEOUT parameter but only for
+ the expedited grace period. This parameter defines the period
+ of time that RCU will wait from the beginning of an expedited
+ grace period until it issues an RCU CPU stall warning. This time
+ period is normally 20 milliseconds on Android devices. A zero
+ value causes the CONFIG_RCU_CPU_STALL_TIMEOUT value to be used,
+ after conversion to milliseconds.
+
+ This configuration parameter may be changed at runtime via the
+ /sys/module/rcupdate/parameters/rcu_exp_cpu_stall_timeout, however
+ this parameter is checked only at the beginning of a cycle. If you
+ are in a current stall cycle, setting it to a new value will change
+ the timeout for the -next- stall.
+
+ Stall-warning messages may be enabled and disabled completely via
+ /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
+
RCU_STALL_DELAY_DELTA
---------------------
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index c34d2212eaca..77ea260efd12 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -224,7 +224,7 @@ synchronize_rcu()
be delayed. This property results in system resilience in face
of denial-of-service attacks. Code using call_rcu() should limit
update rate in order to gain this same sort of resilience. See
- checklist.txt for some approaches to limiting the update rate.
+ checklist.rst for some approaches to limiting the update rate.
rcu_assign_pointer()
^^^^^^^^^^^^^^^^^^^^
@@ -318,7 +318,7 @@ rcu_dereference()
must prohibit. The rcu_dereference_protected() variant takes
a lockdep expression to indicate which locks must be acquired
by the caller. If the indicated protection is not provided,
- a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst
+ a lockdep splat is emitted. See Design/Requirements/Requirements.rst
and the API's code comments for more details and example usage.
.. [2] If the list_for_each_entry_rcu() instance might be used by
@@ -399,8 +399,7 @@ for specialized uses, but are relatively uncommon.
This section shows a simple use of the core RCU API to protect a
global pointer to a dynamically allocated structure. More-typical
-uses of RCU may be found in :ref:`listRCU.rst <list_rcu_doc>`,
-:ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst <NMI_rcu_doc>`.
+uses of RCU may be found in listRCU.rst, arrayRCU.rst, and NMI-RCU.rst.
::
struct foo {
@@ -482,10 +481,9 @@ So, to sum up:
RCU read-side critical sections that might be referencing that
data item.
-See checklist.txt for additional rules to follow when using RCU.
-And again, more-typical uses of RCU may be found in :ref:`listRCU.rst
-<list_rcu_doc>`, :ref:`arrayRCU.rst <array_rcu_doc>`, and :ref:`NMI-RCU.rst
-<NMI_rcu_doc>`.
+See checklist.rst for additional rules to follow when using RCU.
+And again, more-typical uses of RCU may be found in listRCU.rst,
+arrayRCU.rst, and NMI-RCU.rst.
.. _4_whatisRCU:
@@ -579,7 +577,7 @@ to avoid having to write your own callback::
kfree_rcu(old_fp, rcu);
-Again, see checklist.txt for additional rules governing the use of RCU.
+Again, see checklist.rst for additional rules governing the use of RCU.
.. _5_whatisRCU:
@@ -663,7 +661,7 @@ been able to write-acquire the lock otherwise. The smp_mb__after_spinlock()
promotes synchronize_rcu() to a full memory barrier in compliance with
the "Memory-Barrier Guarantees" listed in:
- Documentation/RCU/Design/Requirements/Requirements.rst
+ Design/Requirements/Requirements.rst
It is possible to nest rcu_read_lock(), since reader-writer locks may
be recursively acquired. Note also that rcu_read_lock() is immune
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
index 860fe651d645..5e40b3f437f9 100644
--- a/Documentation/accounting/psi.rst
+++ b/Documentation/accounting/psi.rst
@@ -37,11 +37,7 @@ Pressure interface
Pressure information for each resource is exported through the
respective file in /proc/pressure/ -- cpu, memory, and io.
-The format for CPU is as such::
-
- some avg10=0.00 avg60=0.00 avg300=0.00 total=0
-
-and for memory and IO::
+The format is as such::
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
@@ -58,6 +54,9 @@ situation from a state where some tasks are stalled but the CPU is
still doing productive work. As such, time spent in this subset of the
stall state is tracked separately and exported in the "full" averages.
+CPU full is undefined at the system level, but has been reported
+since 5.13, so it is set to zero for backward compatibility.
+
The ratios (in %) are tracked as recent trends over ten, sixty, and
three hundred second windows, which gives insight into short term events
as well as medium and long term trends. The total absolute stall time
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 69d7a6983f78..38aa01939e1e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1881,7 +1881,7 @@ IO Latency Interface Files
io.latency
This takes a similar format as the other controllers.
- "MAJOR:MINOR target=<target time in microseconds"
+ "MAJOR:MINOR target=<target time in microseconds>"
io.stat
If the controller is enabled you will see extra stats in io.stat in
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 01ba293a2d70..959f73a32712 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -99,6 +99,7 @@ parameter is applicable::
ALSA ALSA sound support is enabled.
APIC APIC support is enabled.
APM Advanced Power Management support is enabled.
+ APPARMOR AppArmor support is enabled.
ARM ARM architecture is enabled.
ARM64 ARM64 architecture is enabled.
AX25 Appropriate AX.25 support is enabled.
@@ -108,15 +109,15 @@ parameter is applicable::
DYNAMIC_DEBUG Build in debug messages and enable them at runtime
EDD BIOS Enhanced Disk Drive Services (EDD) is enabled
EFI EFI Partitioning (GPT) is enabled
- EIDE EIDE/ATAPI support is enabled.
EVM Extended Verification Module
FB The frame buffer device is enabled.
FTRACE Function tracing enabled.
GCOV GCOV profiling is enabled.
+ HIBERNATION HIBERNATION is enabled.
HW Appropriate hardware is enabled.
+ HYPER_V HYPERV support is enabled.
IA-64 IA-64 architecture is enabled.
IMA Integrity measurement architecture is enabled.
- IOSCHED More than one I/O scheduler is enabled.
IP_PNP IP DHCP, BOOTP, or RARP is enabled.
IPV6 IPv6 support is enabled.
ISAPNP ISA PnP code is enabled.
@@ -140,7 +141,6 @@ parameter is applicable::
NUMA NUMA support is enabled.
NFS Appropriate NFS support is enabled.
OF Devicetree is enabled.
- OSS OSS sound support is enabled.
PV_OPS A paravirtualized kernel is enabled.
PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
PARISC The PA-RISC architecture is enabled.
@@ -160,7 +160,6 @@ parameter is applicable::
the Documentation/scsi/ sub-directory.
SECURITY Different security models are enabled.
SELINUX SELinux support is enabled.
- APPARMOR AppArmor support is enabled.
SERIAL Serial support is enabled.
SH SuperH architecture is enabled.
SMP The kernel is an SMP kernel.
@@ -168,7 +167,6 @@ parameter is applicable::
SWSUSP Software suspend (hibernation) is enabled.
SUSPEND System suspend states are enabled.
TPM TPM drivers are enabled.
- TS Appropriate touchscreen support is enabled.
UMS USB Mass Storage support is enabled.
USB USB support is enabled.
USBHID USB Human Interface Device support is enabled.
@@ -177,7 +175,6 @@ parameter is applicable::
VGA The VGA console has been enabled.
VT Virtual terminal support is enabled.
WDT Watchdog support is enabled.
- XT IBM PC/XT MFM hard disk support is enabled.
X86-32 X86-32, aka i386 architecture is enabled.
X86-64 X86-64 architecture is enabled.
More X86-64 boot options can be found in
@@ -211,7 +208,7 @@ The number of kernel parameters is not limited, but the length of the
complete command line (parameters including spaces etc.) is limited to
a fixed number of characters. This limit depends on the architecture
and is between 256 and 4096 characters. It is defined in the file
-./include/asm/setup.h as COMMAND_LINE_SIZE.
+./include/uapi/asm-generic/setup.h as COMMAND_LINE_SIZE.
Finally, the [KMG] suffix is commonly described after a number of kernel
parameter values. These 'K', 'M', and 'G' letters represent the _binary_
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3f1cc5e317ed..a9066cfb85a0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -461,6 +461,12 @@
Format: <io>,<irq>,<mode>
See header of drivers/net/hamradio/baycom_ser_hdx.c.
+ bert_disable [ACPI]
+ Disable BERT OS support on buggy BIOSes.
+
+ bgrt_disable [ACPI][X86]
+ Disable BGRT to avoid flickering OEM logo.
+
blkdevparts= Manual partition parsing of block device(s) for
embedded devices based on command line input.
See Documentation/block/cmdline-partition.rst
@@ -476,12 +482,6 @@
See Documentation/admin-guide/bootconfig.rst
- bert_disable [ACPI]
- Disable BERT OS support on buggy BIOSes.
-
- bgrt_disable [ACPI][X86]
- Disable BGRT to avoid flickering OEM logo.
-
bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards)
bttv.radio= Most important insmod options are available as
kernel args too.
@@ -563,6 +563,25 @@
cio_ignore= [S390]
See Documentation/s390/common_io.rst for details.
+
+ clearcpuid=X[,X...] [X86]
+ Disable CPUID feature X for the kernel. See
+ arch/x86/include/asm/cpufeatures.h for the valid bit
+ numbers X. Note the Linux-specific bits are not necessarily
+ stable over kernel options, but the vendor-specific
+ ones should be.
+ X can also be a string as appearing in the flags: line
+ in /proc/cpuinfo which does not have the above
+ instability issue. However, not all features have names
+ in /proc/cpuinfo.
+ Note that using this option will taint your kernel.
+ Also note that user programs calling CPUID directly
+ or using the feature without checking anything
+ will still see it. This just prevents it from
+ being used by the kernel or shown in /proc/cpuinfo.
+ Also note the kernel might malfunction if you disable
+ some critical bits.
+
clk_ignore_unused
[CLK]
Prevents the clock framework from automatically gating
@@ -631,19 +650,6 @@
Defaults to zero when built as a module and to
10 seconds when built into the kernel.
- clearcpuid=BITNUM[,BITNUM...] [X86]
- Disable CPUID feature X for the kernel. See
- arch/x86/include/asm/cpufeatures.h for the valid bit
- numbers. Note the Linux specific bits are not necessarily
- stable over kernel options, but the vendor specific
- ones should be.
- Also note that user programs calling CPUID directly
- or using the feature without checking anything
- will still see it. This just prevents it from
- being used by the kernel or shown in /proc/cpuinfo.
- Also note the kernel might malfunction if you disable
- some critical bits.
-
cma=nn[MG]@[start[MG][-end[MG]]]
[KNL,CMA]
Sets the size of kernel global memory area for
@@ -765,6 +771,24 @@
0: default value, disable debugging
1: enable debugging at boot time
+ cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
+ Format:
+ <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
+
+ cpu0_hotplug [X86] Turn on CPU0 hotplug feature when
+ CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off.
+ Some features depend on CPU0. Known dependencies are:
+ 1. Resume from suspend/hibernate depends on CPU0.
+ Suspend/hibernate will fail if CPU0 is offline and you
+ need to online CPU0 before suspend/hibernate.
+ 2. PIC interrupts also depend on CPU0. CPU0 can't be
+ removed if a PIC interrupt is detected.
+ It's said poweroff/reboot may depend on CPU0 on some
+ machines although I haven't seen such issues so far
+ after CPU0 is offline on a few tested machines.
+ If the dependencies are under your control, you can
+ turn on cpu0_hotplug.
+
cpuidle.off=1 [CPU_IDLE]
disable the cpuidle sub-system
@@ -785,9 +809,13 @@
on every CPU online, such as boot, and resume from suspend.
Default: 10000
- cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
- Format:
- <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
+ crash_kexec_post_notifiers
+ Run kdump after running panic-notifiers and dumping
+ kmsg. This is only for the users who doubt kdump always
+ succeeds in any situation.
+ Note that this also increases risks of kdump failure,
+ because some panic notifiers can make the crashed
+ kernel more unstable.
crashkernel=size[KMG][@offset[KMG]]
[KNL] Using kexec, Linux can switch to a 'crash kernel'
@@ -808,7 +836,7 @@
Documentation/admin-guide/kdump/kdump.rst for an example.
crashkernel=size[KMG],high
- [KNL, X86-64] range could be above 4G. Allow kernel
+ [KNL, X86-64, ARM64] range could be above 4G. Allow kernel
to allocate physical memory region from top, so could
be above 4G if the system has more than 4G RAM installed.
Otherwise memory region will be allocated below 4G, if
@@ -821,14 +849,20 @@
that require some amount of low memory, e.g. swiotlb
requires at least 64M+32K low memory, also enough extra
low memory is needed to make sure DMA buffers for 32-bit
- devices won't run out. Kernel would try to allocate at
+ devices won't run out. Kernel would try to allocate
at least 256M below 4G automatically.
- This one let user to specify own low range under 4G
+ This one lets the user specify own low range under 4G
for second kernel instead.
0: to disable low allocation.
It will be ignored when crashkernel=X,high is not used
or memory reserved is below 4G.
+ [KNL, ARM64] range in low memory.
+ This one lets the user specify a low range in the
+ DMA zone for the crash dump kernel.
+ It will be ignored when crashkernel=X,high is not used
+ or memory reserved is located in the DMA zones.
+
cryptomgr.notests
[KNL] Disable crypto self-tests
@@ -950,6 +984,8 @@
dump out devices still on the deferred probe list after
retrying.
+ delayacct [KNL] Enable per-task delay accounting
+
dell_smm_hwmon.ignore_dmi=
[HW] Continue probing hardware even if DMI data
indicates that the driver is running on unsupported
@@ -1003,17 +1039,6 @@
disable= [IPV6]
See Documentation/networking/ipv6.rst.
- hardened_usercopy=
- [KNL] Under CONFIG_HARDENED_USERCOPY, whether
- hardening is enabled for this boot. Hardened
- usercopy checking is used to protect the kernel
- from reading or writing beyond known memory
- allocation boundaries as a proactive defense
- against bounds-checking flaws in the kernel's
- copy_to_user()/copy_from_user() interface.
- on Perform hardened usercopy checks (default).
- off Disable hardened usercopy checks.
-
disable_radix [PPC]
Disable RADIX MMU mode on POWER9
@@ -1282,7 +1307,7 @@
Append ",keep" to not disable it when the real console
takes over.
- Only one of vga, efi, serial, or usb debug port can
+ Only one of vga, serial, or usb debug port can
be used at a time.
Currently only ttyS0 and ttyS1 may be specified by
@@ -1297,7 +1322,7 @@
Interaction with the standard serial driver is not
very good.
- The VGA and EFI output is eventually overwritten by
+ The VGA output is eventually overwritten by
the real console.
The xen option can only be used in Xen domains.
@@ -1316,17 +1341,6 @@
force: enforce the use of EDAC to report H/W event.
default: on.
- ekgdboc= [X86,KGDB] Allow early kernel console debugging
- ekgdboc=kbd
-
- This is designed to be used in conjunction with
- the boot argument: earlyprintk=vga
-
- This parameter works in place of the kgdboc parameter
- but can only be used if the backing tty is available
- very early in the boot process. For early debugging
- via a serial port see kgdboc_earlycon instead.
-
edd= [EDD]
Format: {"off" | "on" | "skip[mbr]"}
@@ -1388,6 +1402,17 @@
eisa_irq_edge= [PARISC,HW]
See header of drivers/parisc/eisa.c.
+ ekgdboc= [X86,KGDB] Allow early kernel console debugging
+ Format: ekgdboc=kbd
+
+ This is designed to be used in conjunction with
+ the boot argument: earlyprintk=vga
+
+ This parameter works in place of the kgdboc parameter
+ but can only be used if the backing tty is available
+ very early in the boot process. For early debugging
+ via a serial port see kgdboc_earlycon instead.
+
elanfreq= [X86-32]
See comment before function elanfreq_setup() in
arch/x86/kernel/cpu/cpufreq/elanfreq.c.
@@ -1586,6 +1611,17 @@
Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
Default: 1024
+ hardened_usercopy=
+ [KNL] Under CONFIG_HARDENED_USERCOPY, whether
+ hardening is enabled for this boot. Hardened
+ usercopy checking is used to protect the kernel
+ from reading or writing beyond known memory
+ allocation boundaries as a proactive defense
+ against bounds-checking flaws in the kernel's
+ copy_to_user()/copy_from_user() interface.
+ on Perform hardened usercopy checks (default).
+ off Disable hardened usercopy checks.
+
hardlockup_all_cpu_backtrace=
[KNL] Should the hard-lockup detector generate
backtraces on all cpus.
@@ -1606,6 +1642,15 @@
corresponding firmware-first mode error processing
logic will be disabled.
+ hibernate= [HIBERNATION]
+ noresume Don't check if there's a hibernation image
+ present during boot.
+ nocompress Don't compress/decompress hibernation images.
+ no Disable hibernation and resume.
+ protect_image Turn on image protection during restoration
+ (that will set all pages holding image data
+ during restoration read-only).
+
highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact
size of <nn>. This works even on boxes that have no
highmem otherwise. This also works to reduce highmem
@@ -1628,16 +1673,6 @@
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
- hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation
- of gigantic hugepages. Or using node format, the size
- of a CMA area per node can be specified.
- Format: nn[KMGTPE] or (node format)
- <node>:nn[KMGTPE][,<node>:nn[KMGTPE]]
-
- Reserve a CMA area of given size and allocate gigantic
- hugepages using the CMA allocator. If enabled, the
- boot-time allocation of gigantic hugepages is skipped.
-
hugepages= [HW] Number of HugeTLB pages to allocate at boot.
If this follows hugepagesz (below), it specifies
the number of pages of hugepagesz to be allocated.
@@ -1659,6 +1694,16 @@
Documentation/admin-guide/mm/hugetlbpage.rst.
Format: size[KMG]
+ hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation
+ of gigantic hugepages. Or using node format, the size
+ of a CMA area per node can be specified.
+ Format: nn[KMGTPE] or (node format)
+ <node>:nn[KMGTPE][,<node>:nn[KMGTPE]]
+
+ Reserve a CMA area of given size and allocate gigantic
+ hugepages using the CMA allocator. If enabled, the
+ boot-time allocation of gigantic hugepages is skipped.
+
hugetlb_free_vmemmap=
[KNL] Requires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
enabled.
@@ -1758,26 +1803,6 @@
icn= [HW,ISDN]
Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
- ide-core.nodma= [HW] (E)IDE subsystem
- Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc
- .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr
- .cdrom .chs .ignore_cable are additional options
- See Documentation/ide/ide.rst.
-
- ide-generic.probe-mask= [HW] (E)IDE subsystem
- Format: <int>
- Probe mask for legacy ISA IDE ports. Depending on
- platform up to 6 ports are supported, enabled by
- setting corresponding bits in the mask to 1. The
- default value is 0x0, which has a special meaning.
- On systems that have PCI, it triggers scanning the
- PCI bus for the first and the second port, which
- are then probed. On systems without PCI the value
- of 0x0 enables probing the two first ports as if it
- was 0x3.
-
- ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
- Claim all unknown PCI IDE storage controllers.
idle= [X86]
Format: idle=poll, idle=halt, idle=nomwait
@@ -1903,7 +1928,8 @@
ima_template= [IMA]
Select one of defined IMA measurements template formats.
- Formats: { "ima" | "ima-ng" | "ima-sig" }
+ Formats: { "ima" | "ima-ng" | "ima-ngv2" | "ima-sig" |
+ "ima-sigv2" }
Default: "ima-ng"
ima_template_fmt=
@@ -2622,14 +2648,14 @@
when set.
Format: <int>
- libata.force= [LIBATA] Force configurations. The format is comma-
- separated list of "[ID:]VAL" where ID is
- PORT[.DEVICE]. PORT and DEVICE are decimal numbers
- matching port, link or device. Basically, it matches
- the ATA ID string printed on console by libata. If
- the whole ID part is omitted, the last PORT and DEVICE
- values are used. If ID hasn't been specified yet, the
- configuration applies to all ports, links and devices.
+ libata.force= [LIBATA] Force configurations. The format is a comma-
+ separated list of "[ID:]VAL" where ID is PORT[.DEVICE].
+ PORT and DEVICE are decimal numbers matching port, link
+ or device. Basically, it matches the ATA ID string
+ printed on console by libata. If the whole ID part is
+ omitted, the last PORT and DEVICE values are used. If
+ ID hasn't been specified yet, the configuration applies
+ to all ports, links and devices.
If only DEVICE is omitted, the parameter applies to
the port and all links and devices behind it. DEVICE
@@ -2639,7 +2665,7 @@
host link and device attached to it.
The VAL specifies the configuration to force. As long
- as there's no ambiguity shortcut notation is allowed.
+ as there is no ambiguity, shortcut notation is allowed.
For example, both 1.5 and 1.5G would work for 1.5Gbps.
The following configurations can be forced.
@@ -2652,27 +2678,64 @@
udma[/][16,25,33,44,66,100,133] notation is also
allowed.
+ * nohrst, nosrst, norst: suppress hard, soft and both
+ resets.
+
+ * rstonce: only attempt one reset during hot-unplug
+ link recovery.
+
+ * [no]dbdelay: Enable or disable the extra 200ms delay
+ before debouncing a link PHY and device presence
+ detection.
+
* [no]ncq: Turn on or off NCQ.
- * [no]ncqtrim: Turn off queued DSM TRIM.
+ * [no]ncqtrim: Enable or disable queued DSM TRIM.
+
+ * [no]ncqati: Enable or disable NCQ trim on ATI chipset.
+
+ * [no]trim: Enable or disable (unqueued) TRIM.
+
+ * trim_zero: Indicate that TRIM command zeroes data.
+
+ * max_trim_128m: Set 128M maximum trim size limit.
+
+ * [no]dma: Turn on or off DMA transfers.
- * nohrst, nosrst, norst: suppress hard, soft
- and both resets.
+ * atapi_dmadir: Enable ATAPI DMADIR bridge support.
- * rstonce: only attempt one reset during
- hot-unplug link recovery
+ * atapi_mod16_dma: Enable the use of ATAPI DMA for
+ commands that are not a multiple of 16 bytes.
- * dump_id: dump IDENTIFY data.
+ * [no]dmalog: Enable or disable the use of the
+ READ LOG DMA EXT command to access logs.
- * atapi_dmadir: Enable ATAPI DMADIR bridge support
+ * [no]iddevlog: Enable or disable access to the
+ identify device data log.
+
+ * [no]logdir: Enable or disable access to the general
+ purpose log directory.
+
+ * max_sec_128: Set transfer size limit to 128 sectors.
+
+ * max_sec_1024: Set or clear transfer size limit to
+ 1024 sectors.
+
+ * max_sec_lba48: Set or clear transfer size limit to
+ 65535 sectors.
+
+ * [no]lpm: Enable or disable link power management.
+
+ * [no]setxfer: Indicate if transfer speed mode setting
+ should be skipped.
+
+ * dump_id: Dump IDENTIFY data.
* disable: Disable this device.
If there are multiple matching configurations changing
the same attribute, the last one is used.
- memblock=debug [KNL] Enable memblock debug messages.
-
load_ramdisk= [RAM] [Deprecated]
lockd.nlm_grace_period=P [NFS] Assign grace period.
@@ -2814,7 +2877,7 @@
different yeeloong laptops.
Example: machtype=lemote-yeeloong-2f-7inch
- max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
+ max_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory greater
than or equal to this physical address is ignored.
maxcpus= [SMP] Maximum number of processors that an SMP kernel
@@ -2914,6 +2977,8 @@
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.
+ memblock=debug [KNL] Enable memblock debug messages.
+
memchunk=nn[KMG]
[KNL,SH] Allow user to override the default size for
per-device physically contiguous DMA buffers.
@@ -3057,7 +3122,7 @@
mga= [HW,DRM]
- min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this
+ min_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory below this
physical address is ignored.
mini2440= [ARM,HW,KNL]
@@ -3103,6 +3168,7 @@
mds=off [X86]
tsx_async_abort=off [X86]
kvm.nx_huge_pages=off [X86]
+ srbds=off [X86,INTEL]
no_entry_flush [PPC]
no_uaccess_flush [PPC]
@@ -3181,20 +3247,6 @@
mtdparts= [MTD]
See drivers/mtd/parsers/cmdlinepart.c
- multitce=off [PPC] This parameter disables the use of the pSeries
- firmware feature for updating multiple TCE entries
- at a time.
-
- onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration
-
- Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock]
-
- boundary - index of last SLC block on Flex-OneNAND.
- The remaining blocks are configured as MLC blocks.
- lock - Configure if Flex-OneNAND boundary should be locked.
- Once locked, the boundary cannot be changed.
- 1 indicates lock status, 0 indicates unlock status.
-
mtdset= [ARM]
ARM/S3C2412 JIVE boot control
@@ -3221,6 +3273,10 @@
Used for mtrr cleanup. It is spare mtrr entries number.
Set to 2 or more if your graphical card needs more.
+ multitce=off [PPC] This parameter disables the use of the pSeries
+ firmware feature for updating multiple TCE entries
+ at a time.
+
n2= [NET] SDL Inc. RISCom/N2 synchronous serial card
netdev= [NET] Network devices parameters
@@ -3230,6 +3286,11 @@
This usage is only documented in each driver source
file if at all.
+ netpoll.carrier_timeout=
+ [NET] Specifies amount of time (in seconds) that
+ netpoll should wait for a carrier. By default netpoll
+ waits 4 seconds.
+
nf_conntrack.acct=
[NETFILTER] Enable connection tracking flow accounting
0 to disable accounting
@@ -3380,11 +3441,6 @@
These settings can be accessed at runtime via
the nmi_watchdog and hardlockup_panic sysctls.
- netpoll.carrier_timeout=
- [NET] Specifies amount of time (in seconds) that
- netpoll should wait for a carrier. By default netpoll
- waits 4 seconds.
-
no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
emulation library even if a 387 maths coprocessor
is present.
@@ -3439,10 +3495,6 @@
nocache [ARM]
- noclflush [BUGS=X86] Don't use the CLFLUSH instruction
-
- delayacct [KNL] Enable per-task delay accounting
-
nodsp [SH] Disable hardware DSP at boot time.
noefi Disable EFI runtime services support.
@@ -3451,16 +3503,11 @@
noexec [IA-64]
- noexec [X86]
- On X86-32 available only on PAE configured kernels.
- noexec=on: enable non-executable mappings (default)
- noexec=off: disable non-executable mappings
-
- nosmap [X86,PPC]
+ nosmap [PPC]
Disable SMAP (Supervisor Mode Access Prevention)
even if it is supported by processor.
- nosmep [X86,PPC64s]
+ nosmep [PPC64s]
Disable SMEP (Supervisor Mode Execution Prevention)
even if it is supported by processor.
@@ -3660,8 +3707,6 @@
nosbagart [IA-64]
- nosep [BUGS=X86-32] Disables x86 SYSENTER/SYSEXIT support.
-
nosgx [X86-64,SGX] Disables Intel SGX kernel support.
nosmp [SMP] Tells an SMP kernel to act as a UP kernel,
@@ -3678,20 +3723,6 @@
nox2apic [X86-64,APIC] Do not enable x2APIC mode.
- cpu0_hotplug [X86] Turn on CPU0 hotplug feature when
- CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off.
- Some features depend on CPU0. Known dependencies are:
- 1. Resume from suspend/hibernate depends on CPU0.
- Suspend/hibernate will fail if CPU0 is offline and you
- need to online CPU0 before suspend/hibernate.
- 2. PIC interrupts also depend on CPU0. CPU0 can't be
- removed if a PIC interrupt is detected.
- It's said poweroff/reboot may depend on CPU0 on some
- machines although I haven't seen such issues so far
- after CPU0 is offline on a few tested machines.
- If the dependencies are under your control, you can
- turn on cpu0_hotplug.
-
nps_mtm_hs_ctr= [KNL,ARC]
This parameter sets the maximum duration, in
cycles, each HW thread of the CTOP can run
@@ -3744,6 +3775,16 @@
For example, to override I2C bus2:
omap_mux=i2c2_scl.i2c2_scl=0x100,i2c2_sda.i2c2_sda=0x100
+ onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration
+
+ Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock]
+
+ boundary - index of last SLC block on Flex-OneNAND.
+ The remaining blocks are configured as MLC blocks.
+ lock - Configure if Flex-OneNAND boundary should be locked.
+ Once locked, the boundary cannot be changed.
+ 1 indicates lock status, 0 indicates unlock status.
+
oops=panic Always panic on oopses. Default is to just kill the
process, but there is a small probability of
deadlocking the machine.
@@ -3814,14 +3855,6 @@
panic_on_warn panic() instead of WARN(). Useful to cause kdump
on a WARN().
- crash_kexec_post_notifiers
- Run kdump after running panic-notifiers and dumping
- kmsg. This only for the users who doubt kdump always
- succeeds in any situation.
- Note that this also increases risks of kdump failure,
- because some panic notifiers can make the crashed
- kernel more unstable.
-
parkbd.port= [HW] Parallel port number the keyboard adapter is
connected to, default is 0.
Format: <parport#>
@@ -4893,6 +4926,18 @@
rcupdate.rcu_cpu_stall_timeout= [KNL]
Set timeout for RCU CPU stall warning messages.
+ The value is in seconds and the maximum allowed
+ value is 300 seconds.
+
+ rcupdate.rcu_exp_cpu_stall_timeout= [KNL]
+ Set timeout for expedited RCU CPU stall warning
+ messages. The value is in milliseconds
+ and the maximum allowed value is 21000
+ milliseconds. Please note that this value is
+ adjusted to an arch timer tick resolution.
+ Setting this to zero causes the value from
+ rcupdate.rcu_cpu_stall_timeout to be used (after
+ conversion from seconds to milliseconds).
rcupdate.rcu_expedited= [KNL]
Use expedited grace-period primitives, for
@@ -4955,10 +5000,34 @@
number avoids disturbing real-time workloads,
but lengthens grace periods.
+ rcupdate.rcu_task_stall_info= [KNL]
+ Set initial timeout in jiffies for RCU task stall
+ informational messages, which give some indication
+ of the problem for those not patient enough to
+ wait for ten minutes. Informational messages are
+ only printed prior to the stall-warning message
+ for a given grace period. Disable with a value
+ less than or equal to zero. Defaults to ten
+ seconds. A change in value does not take effect
+ until the beginning of the next grace period.
+
+ rcupdate.rcu_task_stall_info_mult= [KNL]
+ Multiplier for time interval between successive
+ RCU task stall informational messages for a given
+ RCU tasks grace period. This value is clamped
+ to one through ten, inclusive. It defaults to
+ the value three, so that the first informational
+ message is printed 10 seconds into the grace
+ period, the second at 40 seconds, the third at
+ 160 seconds, and then the stall warning at 600
+ seconds would prevent a fourth at 640 seconds.
+
rcupdate.rcu_task_stall_timeout= [KNL]
- Set timeout in jiffies for RCU task stall warning
- messages. Disable with a value less than or equal
- to zero.
+ Set timeout in jiffies for RCU task stall
+ warning messages. Disable with a value less
+ than or equal to zero. Defaults to ten minutes.
+ A change in value does not take effect until
+ the beginning of the next grace period.
rcupdate.rcu_self_test= [KNL]
Run the RCU early boot self tests
@@ -5077,15 +5146,6 @@
Useful for devices that are detected asynchronously
(e.g. USB and MMC devices).
- hibernate= [HIBERNATION]
- noresume Don't check if there's a hibernation image
- present during boot.
- nocompress Don't compress/decompress hibernation images.
- no Disable hibernation and resume.
- protect_image Turn on image protection during restoration
- (that will set all pages holding image data
- during restoration read-only).
-
retain_initrd [RAM] Keep initrd memory after extraction
rfkill.default_state=
@@ -5308,6 +5368,8 @@
serialnumber [BUGS=X86-32]
+ sev=option[,option...] [X86-64] See Documentation/x86/x86_64/boot-options.rst
+
shapers= [NET]
Maximal number of shapers.
@@ -5377,6 +5439,17 @@
smart2= [HW]
Format: <io1>[,<io2>[,...,<io8>]]
+ smp.csd_lock_timeout= [KNL]
+ Specify the period of time in milliseconds
+ that smp_call_function() and friends will wait
+ for a CPU to release the CSD lock. This is
+ useful when diagnosing bugs involving CPUs
+ disabling interrupts for extended periods
+ of time. Defaults to 5,000 milliseconds, and
+ setting a value of zero disables this feature.
+ This feature may be more efficiently disabled
+ using the csdlock_debug= kernel parameter.
+
smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port
smsc-ircc2.ircc_sir= [HW] SIR base I/O port
@@ -5388,7 +5461,7 @@
1: Fast pin select (default)
2: ATC IRMode
- smt [KNL,S390] Set the maximum number of threads (logical
+ smt= [KNL,S390] Set the maximum number of threads (logical
CPUs) to use per physical CPU on systems capable of
symmetric multithreading (SMT). Will be capped to the
actual hardware limit.
@@ -5608,6 +5681,30 @@
off: Disable mitigation and remove
performance impact to RDRAND and RDSEED
+ srcutree.big_cpu_lim [KNL]
+ Specifies the number of CPUs constituting a
+ large system, such that srcu_struct structures
+ should immediately allocate an srcu_node array.
+ This kernel-boot parameter defaults to 128,
+ but takes effect only when the low-order four
+ bits of srcutree.convert_to_big are equal to 3
+ (decide at boot).
+
+ srcutree.convert_to_big [KNL]
+ Specifies under what conditions an SRCU tree
+ srcu_struct structure will be converted to big
+ form, that is, with an rcu_node tree:
+
+ 0: Never.
+ 1: At init_srcu_struct() time.
+ 2: When rcutorture decides to.
+ 3: Decide at boot time (default).
+ 0x1X: Above plus if high contention.
+
+ Either way, the srcu_node tree will be sized based
+ on the actual runtime number of CPUs (nr_cpu_ids)
+ instead of the compile-time CONFIG_NR_CPUS.
+
srcutree.counter_wrap_check [KNL]
Specifies how frequently to check for
grace-period sequence counter wrap for the
@@ -5625,6 +5722,14 @@
expediting. Set to zero to disable automatic
expediting.
+ srcutree.small_contention_lim [KNL]
+ Specifies the number of update-side contention
+ events per jiffy that will be tolerated before
+ initiating a conversion of an srcu_struct
+ structure to big form. Note that the value of
+ srcutree.convert_to_big must have the 0x10 bit
+ set for contention-based conversions to occur.
+
ssbd= [ARM64,HW]
Speculative Store Bypass Disable control
@@ -5743,8 +5848,9 @@
This parameter controls use of the Protected
Execution Facility on pSeries.
- swapaccount=[0|1]
- [KNL] Enable accounting of swap in memory resource
+ swapaccount= [KNL]
+ Format: [0|1]
+ Enable accounting of swap in memory resource
controller if no parameter or 1 is given or disable
it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
@@ -5790,7 +5896,8 @@
tdfx= [HW,DRM]
- test_suspend= [SUSPEND][,N]
+ test_suspend= [SUSPEND]
+ Format: { "mem" | "standby" | "freeze" }[,N]
Specify "mem" (for Suspend-to-RAM) or "standby" (for
standby suspend) or "freeze" (for suspend type freeze)
as the system sleep state during system startup with
@@ -5874,32 +5981,7 @@
This will guarantee that all the other pcrs
are saved.
- trace_buf_size=nn[KMG]
- [FTRACE] will set tracing buffer size on each cpu.
-
- trace_event=[event-list]
- [FTRACE] Set and start specified trace events in order
- to facilitate early boot debugging. The event-list is a
- comma-separated list of trace events to enable. See
- also Documentation/trace/events.rst
-
- trace_options=[option-list]
- [FTRACE] Enable or disable tracer options at boot.
- The option-list is a comma delimited list of options
- that can be enabled or disabled just as if you were
- to echo the option name into
-
- /sys/kernel/debug/tracing/trace_options
-
- For example, to enable stacktrace option (to dump the
- stack trace of each event), add to the command line:
-
- trace_options=stacktrace
-
- See also Documentation/trace/ftrace.rst "trace options"
- section.
-
- tp_printk[FTRACE]
+ tp_printk [FTRACE]
Have the tracepoints sent to printk as well as the
tracing ring buffer. This is useful for early boot up
where the system hangs or reboots and does not give the
@@ -5921,7 +6003,7 @@
frequency tracepoints such as irq or sched, can cause
the system to live lock.
- tp_printk_stop_on_boot[FTRACE]
+ tp_printk_stop_on_boot [FTRACE]
When tp_printk (above) is set, it can cause a lot of noise
on the console. It may be useful to only include the
printing of events during boot up, as user space may
@@ -5930,6 +6012,53 @@
This command line option will stop the printing of events
to console at the late_initcall_sync() time frame.
+ trace_buf_size=nn[KMG]
+ [FTRACE] will set tracing buffer size on each cpu.
+
+ trace_clock= [FTRACE] Set the clock used for tracing events
+ at boot up.
+ local - Use the per CPU time stamp counter
+ (converted into nanoseconds). Fast, but
+ depending on the architecture, may not be
+ in sync between CPUs.
+ global - Event time stamps are synchronized across
+ CPUs. May be slower than the local clock,
+ but better for some race conditions.
+ counter - Simple counting of events (1, 2, ..)
+ note, some counts may be skipped due to the
+ infrastructure grabbing the clock more than
+ once per event.
+ uptime - Use jiffies as the time stamp.
+ perf - Use the same clock that perf uses.
+ mono - Use ktime_get_mono_fast_ns() for time stamps.
+ mono_raw - Use ktime_get_raw_fast_ns() for time
+ stamps.
+ boot - Use ktime_get_boot_fast_ns() for time stamps.
+ Architectures may add more clocks. See
+ Documentation/trace/ftrace.rst for more details.
+
+ trace_event=[event-list]
+ [FTRACE] Set and start specified trace events in order
+ to facilitate early boot debugging. The event-list is a
+ comma-separated list of trace events to enable. See
+ also Documentation/trace/events.rst
+
+ trace_options=[option-list]
+ [FTRACE] Enable or disable tracer options at boot.
+ The option-list is a comma delimited list of options
+ that can be enabled or disabled just as if you were
+ to echo the option name into
+
+ /sys/kernel/debug/tracing/trace_options
+
+ For example, to enable stacktrace option (to dump the
+ stack trace of each event), add to the command line:
+
+ trace_options=stacktrace
+
+ See also Documentation/trace/ftrace.rst "trace options"
+ section.
+
traceoff_on_warning
[FTRACE] enable this option to disable tracing when a
warning is hit. This turns off "tracing_on". Tracing can
@@ -5958,11 +6087,22 @@
sources:
- "tpm"
- "tee"
+ - "caam"
If not specified then it defaults to iterating through
the trust source list starting with TPM and assigns the
first trust source as a backend which is initialized
successfully during iteration.
+ trusted.rng= [KEYS]
+ Format: <string>
+ The RNG used to generate key material for trusted keys.
+ Can be one of:
+ - "kernel"
+ - the same value as trusted.source: "tpm" or "tee"
+ - "default"
+ If not specified, "default" is used. In this case,
+ the RNG's choice is left to each individual trust source.
+
tsc= Disable clocksource stability checks for TSC.
Format: <string>
[x86] reliable: mark tsc clocksource as reliable, this
@@ -6270,7 +6410,7 @@
HIGHMEM regardless of setting
of CONFIG_HIGHPTE.
- vdso= [X86,SH]
+ vdso= [X86,SH,SPARC]
On X86_32, this is an alias for vdso32=. Otherwise:
vdso=1: enable VDSO (the default)
@@ -6296,11 +6436,12 @@
video= [FB] Frame buffer configuration
See Documentation/fb/modedb.rst.
- video.brightness_switch_enabled= [0,1]
+ video.brightness_switch_enabled= [ACPI]
+ Format: [0|1]
If set to 1, on receiving an ACPI notify event
generated by hotkey, video driver will adjust brightness
level and then send out the event to user space through
- the allocated input device; If set to 0, video driver
+ the allocated input device. If set to 0, video driver
will only send out the event without touching backlight
brightness level.
default: 1
diff --git a/Documentation/admin-guide/media/vimc.dot b/Documentation/admin-guide/media/vimc.dot
index 57863a13fa39..8e829c164626 100644
--- a/Documentation/admin-guide/media/vimc.dot
+++ b/Documentation/admin-guide/media/vimc.dot
@@ -9,14 +9,14 @@ digraph board {
n00000003:port0 -> n00000008:port0 [style=bold]
n00000003:port0 -> n0000000f [style=bold]
n00000005 [label="{{<port0> 0} | Debayer A\n/dev/v4l-subdev2 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
- n00000005:port1 -> n00000017:port0
+ n00000005:port1 -> n00000015:port0
n00000008 [label="{{<port0> 0} | Debayer B\n/dev/v4l-subdev3 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
- n00000008:port1 -> n00000017:port0 [style=dashed]
+ n00000008:port1 -> n00000015:port0 [style=dashed]
n0000000b [label="Raw Capture 0\n/dev/video0", shape=box, style=filled, fillcolor=yellow]
n0000000f [label="Raw Capture 1\n/dev/video1", shape=box, style=filled, fillcolor=yellow]
- n00000013 [label="RGB/YUV Input\n/dev/video2", shape=box, style=filled, fillcolor=yellow]
- n00000013 -> n00000017:port0 [style=dashed]
- n00000017 [label="{{<port0> 0} | Scaler\n/dev/v4l-subdev4 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
- n00000017:port1 -> n0000001a [style=bold]
- n0000001a [label="RGB/YUV Capture\n/dev/video3", shape=box, style=filled, fillcolor=yellow]
+ n00000013 [label="{{} | RGB/YUV Input\n/dev/v4l-subdev4 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000013:port0 -> n00000015:port0 [style=dashed]
+ n00000015 [label="{{<port0> 0} | Scaler\n/dev/v4l-subdev5 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000015:port1 -> n00000018 [style=bold]
+ n00000018 [label="RGB/YUV Capture\n/dev/video2", shape=box, style=filled, fillcolor=yellow]
}
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 1144ea3229a3..ddccd1077462 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -783,6 +783,13 @@ is useful to define the root cause of RCU stalls using a vmcore.
1 panic() after printing RCU stall messages.
= ============================================================
+max_rcu_stall_to_panic
+======================
+
+When ``panic_on_rcu_stall`` is set to 1, this value determines the
+number of times that RCU can stall before panic() is called.
+
+When ``panic_on_rcu_stall`` is set to 0, this value has no effect.
perf_cpu_time_max_percent
=========================
@@ -994,6 +1001,9 @@ This is a directory, with the following entries:
* ``boot_id``: a UUID generated the first time this is retrieved, and
unvarying after that;
+* ``uuid``: a UUID generated every time this is retrieved (this can
+ thus be used to generate UUIDs at will);
+
* ``entropy_avail``: the pool's entropy count, in bits;
* ``poolsize``: the entropy pool size, in bits;
@@ -1001,10 +1011,7 @@ This is a directory, with the following entries:
* ``urandom_min_reseed_secs``: obsolete (used to determine the minimum
number of seconds between urandom pool reseeding). This file is
writable for compatibility purposes, but writing to it has no effect
- on any RNG behavior.
-
-* ``uuid``: a UUID generated every time this is retrieved (this can
- thus be used to generate UUIDs at will);
+ on any RNG behavior;
* ``write_wakeup_threshold``: when the entropy count drops below this
(as a number of bits), processes waiting to write to ``/dev/random``
diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst
index 29884b261aa9..8aefa1001ae5 100644
--- a/Documentation/arm64/booting.rst
+++ b/Documentation/arm64/booting.rst
@@ -350,6 +350,16 @@ Before jumping into the kernel, the following conditions must be met:
- SMCR_EL2.FA64 (bit 31) must be initialised to 0b1.
+ For CPUs with the Memory Tagging Extension feature (FEAT_MTE2):
+
+ - If EL3 is present:
+
+ - SCR_EL3.ATA (bit 26) must be initialised to 0b1.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+ - HCR_EL2.ATA (bit 56) must be initialised to 0b1.
+
The requirements described above for CPU mode, caches, MMUs, architected
timers, coherency and system registers apply to all CPUs. All CPUs must
enter the kernel in the same exception level. Where the values documented
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index a8f30963e550..f8d818eaaff5 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -264,6 +264,39 @@ HWCAP2_MTE3
Functionality implied by ID_AA64PFR1_EL1.MTE == 0b0011, as described
by Documentation/arm64/memory-tagging-extension.rst.
+HWCAP2_SME
+
+ Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
+ by Documentation/arm64/sme.rst.
+
+HWCAP2_SME_I16I64
+
+ Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
+
+HWCAP2_SME_F64F64
+
+ Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
+
+HWCAP2_SME_I8I32
+
+ Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
+
+HWCAP2_SME_F16F32
+
+ Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
+
+HWCAP2_SME_B16F32
+
+ Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
+
+HWCAP2_SME_F32F32
+
+ Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
+
+HWCAP2_SME_FA64
+
+ Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
+
4. Unused AT_HWCAP bits
-----------------------
diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst
index 4f840bac083e..ae21f8118830 100644
--- a/Documentation/arm64/index.rst
+++ b/Documentation/arm64/index.rst
@@ -21,6 +21,7 @@ ARM64 Architecture
perf
pointer-authentication
silicon-errata
+ sme
sve
tagged-address-abi
tagged-pointers
diff --git a/Documentation/arm64/sme.rst b/Documentation/arm64/sme.rst
new file mode 100644
index 000000000000..8ba677b87e90
--- /dev/null
+++ b/Documentation/arm64/sme.rst
@@ -0,0 +1,428 @@
+===================================================
+Scalable Matrix Extension support for AArch64 Linux
+===================================================
+
+This document outlines briefly the interface provided to userspace by Linux in
+order to support use of the ARM Scalable Matrix Extension (SME).
+
+This is an outline of the most important features and issues only and not
+intended to be exhaustive. It should be read in conjunction with the SVE
+documentation in sve.rst which provides details on the Streaming SVE mode
+included in SME.
+
+This document does not aim to describe the SME architecture or programmer's
+model. To aid understanding, a minimal description of relevant programmer's
+model features for SME is included in Appendix A.
+
+
+1. General
+-----------
+
+* PSTATE.SM, PSTATE.ZA, the streaming mode vector length, the ZA
+ register state and TPIDR2_EL0 are tracked per thread.
+
+* The presence of SME is reported to userspace via HWCAP2_SME in the aux vector
+ AT_HWCAP2 entry. Presence of this flag implies the presence of the SME
+ instructions and registers, and the Linux-specific system interfaces
+ described in this document. SME is reported in /proc/cpuinfo as "sme".
+
+* Support for the execution of SME instructions in userspace can also be
+ detected by reading the CPU ID register ID_AA64PFR1_EL1 using an MRS
+ instruction, and checking that the value of the SME field is nonzero. [3]
+
+ It does not guarantee the presence of the system interfaces described in the
+ following sections: software that needs to verify that those interfaces are
+ present must check for HWCAP2_SME instead.
+
+* There are a number of optional SME features; the presence of each is
+ reported via AT_HWCAP2 through:
+
+ HWCAP2_SME_I16I64
+ HWCAP2_SME_F64F64
+ HWCAP2_SME_I8I32
+ HWCAP2_SME_F16F32
+ HWCAP2_SME_B16F32
+ HWCAP2_SME_F32F32
+ HWCAP2_SME_FA64
+
+ This list may be extended over time as the SME architecture evolves.
+
+ These extensions are also reported via the CPU ID register ID_AA64SMFR0_EL1,
+ which userspace can read using an MRS instruction. See elf_hwcaps.rst and
+ cpu-feature-registers.rst for details.
+
+* Debuggers should restrict themselves to interacting with the target via the
+ NT_ARM_SVE, NT_ARM_SSVE and NT_ARM_ZA regsets. The recommended way
+ of detecting support for these regsets is to connect to a target process
+ first and then attempt a
+
+ ptrace(PTRACE_GETREGSET, pid, NT_ARM_<regset>, &iov).
+
+* Whenever ZA register values are exchanged in memory between userspace and
+ the kernel, the register value is encoded in memory as a series of horizontal
+ vectors from 0 to VL/8-1 stored in the same endianness invariant format as is
+ used for SVE vectors.
+
+* On thread creation TPIDR2_EL0 is preserved unless CLONE_SETTLS is specified,
+ in which case it is set to 0.
+
+2. Vector lengths
+------------------
+
+SME defines a second vector length similar to the SVE vector length which
+controls the size of the streaming mode SVE vectors and the ZA matrix array.
+The ZA matrix is square with each side having as many bytes as a streaming
+mode SVE vector.
+
+
+3. Sharing of streaming and non-streaming mode SVE state
+---------------------------------------------------------
+
+It is implementation defined which if any parts of the SVE state are shared
+between streaming and non-streaming modes. When switching between modes
+via software interfaces such as ptrace if no register content is provided as
+part of switching no state will be assumed to be shared and everything will
+be zeroed.
+
+
+4. System call behaviour
+-------------------------
+
+* On syscall PSTATE.ZA is preserved, if PSTATE.ZA==1 then the contents of the
+ ZA matrix are preserved.
+
+* On syscall PSTATE.SM will be cleared and the SVE registers will be handled
+ as per the standard SVE ABI.
+
+* Neither the SVE registers nor ZA are used to pass arguments to or receive
+ results from any syscall.
+
+* On process creation (eg, clone()) the newly created process will have
+ PSTATE.SM cleared.
+
+* All other SME state of a thread, including the currently configured vector
+ length, the state of the PR_SME_VL_INHERIT flag, and the deferred vector
+ length (if any), is preserved across all syscalls, subject to the specific
+ exceptions for execve() described in section 6.
+
+
+5. Signal handling
+-------------------
+
+* Signal handlers are invoked with streaming mode and ZA disabled.
+
+* A new signal frame record za_context encodes the ZA register contents on
+ signal delivery. [1]
+
+* The signal frame record for ZA always contains basic metadata, in particular
+ the thread's vector length (in za_context.vl).
+
+* The ZA matrix may or may not be included in the record, depending on
+ the value of PSTATE.ZA. The registers are present if and only if:
+ za_context.head.size >= ZA_SIG_CONTEXT_SIZE(sve_vq_from_vl(za_context.vl))
+ in which case PSTATE.ZA == 1.
+
+* If matrix data is present, the remainder of the record has a vl-dependent
+ size and layout. Macros ZA_SIG_* are defined [1] to facilitate access to
+ them.
+
+* The matrix is stored as a series of horizontal vectors in the same format as
+ is used for SVE vectors.
+
+* If the ZA context is too big to fit in sigcontext.__reserved[], then extra
+ space is allocated on the stack, an extra_context record is written in
+ __reserved[] referencing this space. za_context is then written in the
+ extra space. Refer to [1] for further details about this mechanism.
+
+
+5. Signal return
+-----------------
+
+When returning from a signal handler:
+
+* If there is no za_context record in the signal frame, or if the record is
+ present but contains no register data as described in the previous section,
+ then ZA is disabled.
+
+* If za_context is present in the signal frame and contains matrix data then
+ PSTATE.ZA is set to 1 and ZA is populated with the specified data.
+
+* The vector length cannot be changed via signal return. If za_context.vl in
+ the signal frame does not match the current vector length, the signal return
+ attempt is treated as illegal, resulting in a forced SIGSEGV.
+
+
+6. prctl extensions
+--------------------
+
+Some new prctl() calls are added to allow programs to manage the SME vector
+length:
+
+prctl(PR_SME_SET_VL, unsigned long arg)
+
+ Sets the vector length of the calling thread and related flags, where
+ arg == vl | flags. Other threads of the calling process are unaffected.
+
+ vl is the desired vector length, where sve_vl_valid(vl) must be true.
+
+ flags:
+
+ PR_SME_VL_INHERIT
+
+ Inherit the current vector length across execve(). Otherwise, the
+ vector length is reset to the system default at execve(). (See
+ Section 9.)
+
+ PR_SME_SET_VL_ONEXEC
+
+ Defer the requested vector length change until the next execve()
+ performed by this thread.
+
+ The effect is equivalent to implicit execution of the following
+ call immediately after the next execve() (if any) by the thread:
+
+ prctl(PR_SME_SET_VL, arg & ~PR_SME_SET_VL_ONEXEC)
+
+ This allows launching of a new program with a different vector
+ length, while avoiding runtime side effects in the caller.
+
+ Without PR_SME_SET_VL_ONEXEC, the requested change takes effect
+ immediately.
+
+
+ Return value: a nonnegative value on success, or a negative value on error:
+ EINVAL: SME not supported, invalid vector length requested, or
+ invalid flags.
+
+
+ On success:
+
+ * Either the calling thread's vector length or the deferred vector length
+ to be applied at the next execve() by the thread (dependent on whether
+ PR_SME_SET_VL_ONEXEC is present in arg), is set to the largest value
+ supported by the system that is less than or equal to vl. If vl ==
+ SVE_VL_MAX, the value set will be the largest value supported by the
+ system.
+
+ * Any previously outstanding deferred vector length change in the calling
+ thread is cancelled.
+
+ * The returned value describes the resulting configuration, encoded as for
+ PR_SME_GET_VL. The vector length reported in this value is the new
+ current vector length for this thread if PR_SME_SET_VL_ONEXEC was not
+ present in arg; otherwise, the reported vector length is the deferred
+ vector length that will be applied at the next execve() by the calling
+ thread.
+
+ * Changing the vector length causes all of ZA, P0..P15, FFR and all bits of
+ Z0..Z31 except for Z0 bits [127:0] .. Z31 bits [127:0] to become
+ unspecified, including both streaming and non-streaming SVE state.
+ Calling PR_SME_SET_VL with vl equal to the thread's current vector
+ length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
+ does not constitute a change to the vector length for this purpose.
+
+ * Changing the vector length causes PSTATE.ZA and PSTATE.SM to be cleared.
+ Calling PR_SME_SET_VL with vl equal to the thread's current vector
+ length, or calling PR_SME_SET_VL with the PR_SME_SET_VL_ONEXEC flag,
+ does not constitute a change to the vector length for this purpose.
+
+
+prctl(PR_SME_GET_VL)
+
+ Gets the vector length of the calling thread.
+
+ The following flag may be OR-ed into the result:
+
+ PR_SME_VL_INHERIT
+
+ Vector length will be inherited across execve().
+
+ There is no way to determine whether there is an outstanding deferred
+ vector length change (which would only normally be the case between a
+ fork() or vfork() and the corresponding execve() in typical use).
+
+ To extract the vector length from the result, bitwise and it with
+ PR_SME_VL_LEN_MASK.
+
+ Return value: a nonnegative value on success, or a negative value on error:
+ EINVAL: SME not supported.
+
+
+7. ptrace extensions
+---------------------
+
+* A new regset NT_ARM_SSVE is defined for access to streaming mode SVE
+ state via PTRACE_GETREGSET and PTRACE_SETREGSET, this is documented in
+ sve.rst.
+
+* A new regset NT_ARM_ZA is defined for access to ZA state via
+ PTRACE_GETREGSET and PTRACE_SETREGSET.
+
+ Refer to [2] for definitions.
+
+The regset data starts with struct user_za_header, containing:
+
+ size
+
+ Size of the complete regset, in bytes.
+ This depends on vl and possibly on other things in the future.
+
+ If a call to PTRACE_GETREGSET requests less data than the value of
+ size, the caller can allocate a larger buffer and retry in order to
+ read the complete regset.
+
+ max_size
+
+ Maximum size in bytes that the regset can grow to for the target
+ thread. The regset won't grow bigger than this even if the target
+ thread changes its vector length etc.
+
+ vl
+
+ Target thread's current streaming vector length, in bytes.
+
+ max_vl
+
+ Maximum possible streaming vector length for the target thread.
+
+ flags
+
+ Zero or more of the following flags, which have the same
+ meaning and behaviour as the corresponding PR_SET_VL_* flags:
+
+ SME_PT_VL_INHERIT
+
+ SME_PT_VL_ONEXEC (SETREGSET only).
+
+* The effects of changing the vector length and/or flags are equivalent to
+ those documented for PR_SME_SET_VL.
+
+ The caller must make a further GETREGSET call if it needs to know what VL is
+ actually set by SETREGSET, unless it is known in advance that the requested
+ VL is supported.
+
+* The size and layout of the payload depends on the header fields. The
+ SME_PT_ZA_*() macros are provided to facilitate access to the data.
+
+* In either case, for SETREGSET it is permissible to omit the payload, in which
+ case the vector length and flags are changed and PSTATE.ZA is set to 0
+ (along with any consequences of those changes). If a payload is provided
+ then PSTATE.ZA will be set to 1.
+
+* For SETREGSET, if the requested VL is not supported, the effect will be the
+ same as if the payload were omitted, except that an EIO error is reported.
+ No attempt is made to translate the payload data to the correct layout
+ for the vector length actually set. It is up to the caller to translate the
+ payload layout for the actual VL and retry.
+
+* The effect of writing a partial, incomplete payload is unspecified.
+
+
+8. ELF coredump extensions
+---------------------------
+
+* NT_ARM_SSVE notes will be added to each coredump for
+ each thread of the dumped process. The contents will be equivalent to the
+ data that would have been read if a PTRACE_GETREGSET of the corresponding
+ type were executed for each thread when the coredump was generated.
+
+* A NT_ARM_ZA note will be added to each coredump for each thread of the
+ dumped process. The contents will be equivalent to the data that would have
+ been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread
+ when the coredump was generated.
+
+
+9. System runtime configuration
+--------------------------------
+
+* To mitigate the ABI impact of expansion of the signal frame, a policy
+ mechanism is provided for administrators, distro maintainers and developers
+ to set the default vector length for userspace processes:
+
+/proc/sys/abi/sme_default_vector_length
+
+ Writing the text representation of an integer to this file sets the system
+ default vector length to the specified value, unless the value is greater
+ than the maximum vector length supported by the system in which case the
+ default vector length is set to that maximum.
+
+ The result can be determined by reopening the file and reading its
+ contents.
+
+ At boot, the default vector length is initially set to 32 or the maximum
+ supported vector length, whichever is smaller and supported. This
+ determines the initial vector length of the init process (PID 1).
+
+ Reading this file returns the current system default vector length.
+
+* At every execve() call, the new vector length of the new process is set to
+ the system default vector length, unless
+
+ * PR_SME_VL_INHERIT (or equivalently SME_PT_VL_INHERIT) is set for the
+ calling thread, or
+
+ * a deferred vector length change is pending, established via the
+ PR_SME_SET_VL_ONEXEC flag (or SME_PT_VL_ONEXEC).
+
+* Modifying the system default vector length does not affect the vector length
+ of any existing process or thread that does not make an execve() call.
+
+
+Appendix A. SME programmer's model (informative)
+=================================================
+
+This section provides a minimal description of the additions made by SME to the
+ARMv8-A programmer's model that are relevant to this document.
+
+Note: This section is for information only and not intended to be complete or
+to replace any architectural specification.
+
+A.1. Registers
+---------------
+
+In A64 state, SME adds the following:
+
+* A new mode, streaming mode, in which a subset of the normal FPSIMD and SVE
+ features are available. When supported EL0 software may enter and leave
+ streaming mode at any time.
+
+ For best system performance it is strongly encouraged for software to enable
+ streaming mode only when it is actively being used.
+
+* A new vector length controlling the size of ZA and the Z registers when in
+ streaming mode, separately to the vector length used for SVE when not in
+ streaming mode. There is no requirement that either the currently selected
+ vector length or the set of vector lengths supported for the two modes in
+ a given system have any relationship. The streaming mode vector length
+ is referred to as SVL.
+
+* A new ZA matrix register. This is a square matrix of SVLxSVL bits. Most
+ operations on ZA require that streaming mode be enabled but ZA can be
+ enabled without streaming mode in order to load, save and retain data.
+
+ For best system performance it is strongly encouraged for software to enable
+ ZA only when it is actively being used.
+
+* Two new 1 bit fields in PSTATE which may be controlled via the SMSTART and
+ SMSTOP instructions or by access to the SVCR system register:
+
+ * PSTATE.ZA, if this is 1 then the ZA matrix is accessible and has valid
+ data while if it is 0 then ZA can not be accessed. When PSTATE.ZA is
+ changed from 0 to 1 all bits in ZA are cleared.
+
+ * PSTATE.SM, if this is 1 then the PE is in streaming mode. When the value
+ of PSTATE.SM is changed then it is implementation defined if the subset
+ of the floating point register bits valid in both modes may be retained.
+ Any other bits will be cleared.
+
+
+References
+==========
+
+[1] arch/arm64/include/uapi/asm/sigcontext.h
+ AArch64 Linux signal ABI definitions
+
+[2] arch/arm64/include/uapi/asm/ptrace.h
+ AArch64 Linux ptrace ABI definitions
+
+[3] Documentation/arm64/cpu-feature-registers.rst
diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst
index 9d9a4de5bc34..93c2c2990584 100644
--- a/Documentation/arm64/sve.rst
+++ b/Documentation/arm64/sve.rst
@@ -7,7 +7,9 @@ Author: Dave Martin <Dave.Martin@arm.com>
Date: 4 August 2017
This document outlines briefly the interface provided to userspace by Linux in
-order to support use of the ARM Scalable Vector Extension (SVE).
+order to support use of the ARM Scalable Vector Extension (SVE), including
+interactions with Streaming SVE mode added by the Scalable Matrix Extension
+(SME).
This is an outline of the most important features and issues only and not
intended to be exhaustive.
@@ -23,6 +25,10 @@ model features for SVE is included in Appendix A.
* SVE registers Z0..Z31, P0..P15 and FFR and the current vector length VL, are
tracked per-thread.
+* In streaming mode FFR is not accessible unless HWCAP2_SME_FA64 is present
+ in the system. When it is not supported and these interfaces are used to
+ access streaming mode, FFR is read and written as zero.
+
* The presence of SVE is reported to userspace via HWCAP_SVE in the aux vector
AT_HWCAP entry. Presence of this flag implies the presence of the SVE
instructions and registers, and the Linux-specific system interfaces
@@ -53,10 +59,19 @@ model features for SVE is included in Appendix A.
which userspace can read using an MRS instruction. See elf_hwcaps.txt and
cpu-feature-registers.txt for details.
+* On hardware that supports the SME extensions, HWCAP2_SME will also be
+ reported in the AT_HWCAP2 aux vector entry. Among other things SME adds
+ streaming mode which provides a subset of the SVE feature set using a
+ separate SME vector length and the same Z/V registers. See sme.rst
+ for more details.
+
* Debuggers should restrict themselves to interacting with the target via the
NT_ARM_SVE regset. The recommended way of detecting support for this regset
is to connect to a target process first and then attempt a
- ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov).
+ ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov). Note that when SME is
+ present and streaming SVE mode is in use the FPSIMD subset of registers
+ will be read via NT_ARM_SVE and NT_ARM_SVE writes will exit streaming mode
+ in the target.
* Whenever SVE scalable register values (Zn, Pn, FFR) are exchanged in memory
between userspace and the kernel, the register value is encoded in memory in
@@ -126,6 +141,11 @@ the SVE instruction set architecture.
are only present in fpsimd_context. For convenience, the content of V0..V31
is duplicated between sve_context and fpsimd_context.
+* The record contains a flag field which includes a flag SVE_SIG_FLAG_SM which
+ if set indicates that the thread is in streaming mode and the vector length
+ and register data (if present) describe the streaming SVE data and vector
+ length.
+
* The signal frame record for SVE always contains basic metadata, in particular
the thread's vector length (in sve_context.vl).
@@ -170,6 +190,11 @@ When returning from a signal handler:
the signal frame does not match the current vector length, the signal return
attempt is treated as illegal, resulting in a forced SIGSEGV.
+* It is permitted to enter or leave streaming mode by setting or clearing
+ the SVE_SIG_FLAG_SM flag but applications should take care to ensure that
+ when doing so sve_context.vl and any register data are appropriate for the
+ vector length in the new mode.
+
6. prctl extensions
--------------------
@@ -265,8 +290,14 @@ prctl(PR_SVE_GET_VL)
7. ptrace extensions
---------------------
-* A new regset NT_ARM_SVE is defined for use with PTRACE_GETREGSET and
- PTRACE_SETREGSET.
+* New regsets NT_ARM_SVE and NT_ARM_SSVE are defined for use with
+ PTRACE_GETREGSET and PTRACE_SETREGSET. NT_ARM_SSVE describes the
+ streaming mode SVE registers and NT_ARM_SVE describes the
+ non-streaming mode SVE registers.
+
+ In this description a register set is referred to as being "live" when
+ the target is in the appropriate streaming or non-streaming mode and is
+ using data beyond the subset shared with the FPSIMD Vn registers.
Refer to [2] for definitions.
@@ -297,7 +328,7 @@ The regset data starts with struct user_sve_header, containing:
flags
- either
+ at most one of
SVE_PT_REGS_FPSIMD
@@ -331,6 +362,10 @@ The regset data starts with struct user_sve_header, containing:
SVE_PT_VL_ONEXEC (SETREGSET only).
+ If neither FPSIMD nor SVE flags are provided then no register
+ payload is available; this is only possible when SME is implemented.
+
+
* The effects of changing the vector length and/or flags are equivalent to
those documented for PR_SVE_SET_VL.
@@ -346,6 +381,13 @@ The regset data starts with struct user_sve_header, containing:
case only the vector length and flags are changed (along with any
consequences of those changes).
+* In systems supporting SME when in streaming mode a GETREGSET for
+ NT_ARM_SVE will return only the user_sve_header with no register data,
+ similarly a GETREGSET for NT_ARM_SSVE will not return any register data
+ when not in streaming mode.
+
+* A GETREGSET for NT_ARM_SSVE will never return SVE_PT_REGS_FPSIMD.
+
* For SETREGSET, if an SVE_PT_REGS_SVE payload is present and the
requested VL is not supported, the effect will be the same as if the
payload were omitted, except that an EIO error is reported. No
@@ -355,17 +397,25 @@ The regset data starts with struct user_sve_header, containing:
unspecified. It is up to the caller to translate the payload layout
for the actual VL and retry.
+* Where SME is implemented it is not possible to GETREGSET the register
+ state for normal SVE when in streaming mode, nor the streaming mode
+ register state when in normal mode, regardless of the implementation defined
+ behaviour of the hardware for sharing data between the two modes.
+
+* Any SETREGSET of NT_ARM_SVE will exit streaming mode if the target was in
+ streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode
+ if the target was not in streaming mode.
+
* The effect of writing a partial, incomplete payload is unspecified.
8. ELF coredump extensions
---------------------------
-* A NT_ARM_SVE note will be added to each coredump for each thread of the
- dumped process. The contents will be equivalent to the data that would have
- been read if a PTRACE_GETREGSET of NT_ARM_SVE were executed for each thread
- when the coredump was generated.
-
+* NT_ARM_SVE and NT_ARM_SSVE notes will be added to each coredump for
+ each thread of the dumped process. The contents will be equivalent to the
+ data that would have been read if a PTRACE_GETREGSET of the corresponding
+ type were executed for each thread when the coredump was generated.
9. System runtime configuration
--------------------------------
diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst
index 52ea7b6b2fe8..7964fe134277 100644
--- a/Documentation/cdrom/cdrom-standard.rst
+++ b/Documentation/cdrom/cdrom-standard.rst
@@ -218,7 +218,6 @@ current *struct* is::
int (*tray_move)(struct cdrom_device_info *, int);
int (*lock_door)(struct cdrom_device_info *, int);
int (*select_speed)(struct cdrom_device_info *, int);
- int (*select_disc)(struct cdrom_device_info *, int);
int (*get_last_session) (struct cdrom_device_info *,
struct cdrom_multisession *);
int (*get_mcn)(struct cdrom_device_info *, struct cdrom_mcn *);
@@ -421,15 +420,6 @@ return value indicates an error.
::
- int select_disc(struct cdrom_device_info *cdi, int number)
-
-If the drive can store multiple discs (a juke-box) this function
-will perform disc selection. It should return the number of the
-selected disc on success, a negative value on error. Currently, only
-the ide-cd driver supports this functionality.
-
-::
-
int get_last_session(struct cdrom_device_info *cdi,
struct cdrom_multisession *ms_info)
diff --git a/Documentation/cdrom/ide-cd.rst b/Documentation/cdrom/ide-cd.rst
deleted file mode 100644
index bdccb74fc92d..000000000000
--- a/Documentation/cdrom/ide-cd.rst
+++ /dev/null
@@ -1,538 +0,0 @@
-IDE-CD driver documentation
-===========================
-
-:Originally by: scott snyder <snyder@fnald0.fnal.gov> (19 May 1996)
-:Carrying on the torch is: Erik Andersen <andersee@debian.org>
-:New maintainers (19 Oct 1998): Jens Axboe <axboe@image.dk>
-
-1. Introduction
----------------
-
-The ide-cd driver should work with all ATAPI ver 1.2 to ATAPI 2.6 compliant
-CDROM drives which attach to an IDE interface. Note that some CDROM vendors
-(including Mitsumi, Sony, Creative, Aztech, and Goldstar) have made
-both ATAPI-compliant drives and drives which use a proprietary
-interface. If your drive uses one of those proprietary interfaces,
-this driver will not work with it (but one of the other CDROM drivers
-probably will). This driver will not work with `ATAPI` drives which
-attach to the parallel port. In addition, there is at least one drive
-(CyCDROM CR520ie) which attaches to the IDE port but is not ATAPI;
-this driver will not work with drives like that either (but see the
-aztcd driver).
-
-This driver provides the following features:
-
- - Reading from data tracks, and mounting ISO 9660 filesystems.
-
- - Playing audio tracks. Most of the CDROM player programs floating
- around should work; I usually use Workman.
-
- - Multisession support.
-
- - On drives which support it, reading digital audio data directly
- from audio tracks. The program cdda2wav can be used for this.
- Note, however, that only some drives actually support this.
-
- - There is now support for CDROM changers which comply with the
- ATAPI 2.6 draft standard (such as the NEC CDR-251). This additional
- functionality includes a function call to query which slot is the
- currently selected slot, a function call to query which slots contain
- CDs, etc. A sample program which demonstrates this functionality is
- appended to the end of this file. The Sanyo 3-disc changer
- (which does not conform to the standard) is also now supported.
- Please note the driver refers to the first CD as slot # 0.
-
-
-2. Installation
----------------
-
-0. The ide-cd relies on the ide disk driver. See
- Documentation/ide/ide.rst for up-to-date information on the ide
- driver.
-
-1. Make sure that the ide and ide-cd drivers are compiled into the
- kernel you're using. When configuring the kernel, in the section
- entitled "Floppy, IDE, and other block devices", say either `Y`
- (which will compile the support directly into the kernel) or `M`
- (to compile support as a module which can be loaded and unloaded)
- to the options::
-
- ATA/ATAPI/MFM/RLL support
- Include IDE/ATAPI CDROM support
-
- Depending on what type of IDE interface you have, you may need to
- specify additional configuration options. See
- Documentation/ide/ide.rst.
-
-2. You should also ensure that the iso9660 filesystem is either
- compiled into the kernel or available as a loadable module. You
- can see if a filesystem is known to the kernel by catting
- /proc/filesystems.
-
-3. The CDROM drive should be connected to the host on an IDE
- interface. Each interface on a system is defined by an I/O port
- address and an IRQ number, the standard assignments being
- 0x1f0 and 14 for the primary interface and 0x170 and 15 for the
- secondary interface. Each interface can control up to two devices,
- where each device can be a hard drive, a CDROM drive, a floppy drive,
- or a tape drive. The two devices on an interface are called `master`
- and `slave`; this is usually selectable via a jumper on the drive.
-
- Linux names these devices as follows. The master and slave devices
- on the primary IDE interface are called `hda` and `hdb`,
- respectively. The drives on the secondary interface are called
- `hdc` and `hdd`. (Interfaces at other locations get other letters
- in the third position; see Documentation/ide/ide.rst.)
-
- If you want your CDROM drive to be found automatically by the
- driver, you should make sure your IDE interface uses either the
- primary or secondary addresses mentioned above. In addition, if
- the CDROM drive is the only device on the IDE interface, it should
- be jumpered as `master`. (If for some reason you cannot configure
- your system in this manner, you can probably still use the driver.
- You may have to pass extra configuration information to the kernel
- when you boot, however. See Documentation/ide/ide.rst for more
- information.)
-
-4. Boot the system. If the drive is recognized, you should see a
- message which looks like::
-
- hdb: NEC CD-ROM DRIVE:260, ATAPI CDROM drive
-
- If you do not see this, see section 5 below.
-
-5. You may want to create a symbolic link /dev/cdrom pointing to the
- actual device. You can do this with the command::
-
- ln -s /dev/hdX /dev/cdrom
-
- where X should be replaced by the letter indicating where your
- drive is installed.
-
-6. You should be able to see any error messages from the driver with
- the `dmesg` command.
-
-
-3. Basic usage
---------------
-
-An ISO 9660 CDROM can be mounted by putting the disc in the drive and
-typing (as root)::
-
- mount -t iso9660 /dev/cdrom /mnt/cdrom
-
-where it is assumed that /dev/cdrom is a link pointing to the actual
-device (as described in step 5 of the last section) and /mnt/cdrom is
-an empty directory. You should now be able to see the contents of the
-CDROM under the /mnt/cdrom directory. If you want to eject the CDROM,
-you must first dismount it with a command like::
-
- umount /mnt/cdrom
-
-Note that audio CDs cannot be mounted.
-
-Some distributions set up /etc/fstab to always try to mount a CDROM
-filesystem on bootup. It is not required to mount the CDROM in this
-manner, though, and it may be a nuisance if you change CDROMs often.
-You should feel free to remove the cdrom line from /etc/fstab and
-mount CDROMs manually if that suits you better.
-
-Multisession and photocd discs should work with no special handling.
-The hpcdtoppm package (ftp.gwdg.de:/pub/linux/hpcdtoppm/) may be
-useful for reading photocds.
-
-To play an audio CD, you should first unmount and remove any data
-CDROM. Any of the CDROM player programs should then work (workman,
-workbone, cdplayer, etc.).
-
-On a few drives, you can read digital audio directly using a program
-such as cdda2wav. The only types of drive which I've heard support
-this are Sony and Toshiba drives. You will get errors if you try to
-use this function on a drive which does not support it.
-
-For supported changers, you can use the `cdchange` program (appended to
-the end of this file) to switch between changer slots. Note that the
-drive should be unmounted before attempting this. The program takes
-two arguments: the CDROM device, and the slot number to which you wish
-to change. If the slot number is -1, the drive is unloaded.
-
-
-4. Common problems
-------------------
-
-This section discusses some common problems encountered when trying to
-use the driver, and some possible solutions. Note that if you are
-experiencing problems, you should probably also review
-Documentation/ide/ide.rst for current information about the underlying
-IDE support code. Some of these items apply only to earlier versions
-of the driver, but are mentioned here for completeness.
-
-In most cases, you should probably check with `dmesg` for any errors
-from the driver.
-
-a. Drive is not detected during booting.
-
- - Review the configuration instructions above and in
- Documentation/ide/ide.rst, and check how your hardware is
- configured.
-
- - If your drive is the only device on an IDE interface, it should
- be jumpered as master, if at all possible.
-
- - If your IDE interface is not at the standard addresses of 0x170
- or 0x1f0, you'll need to explicitly inform the driver using a
- lilo option. See Documentation/ide/ide.rst. (This feature was
- added around kernel version 1.3.30.)
-
- - If the autoprobing is not finding your drive, you can tell the
- driver to assume that one exists by using a lilo option of the
- form `hdX=cdrom`, where X is the drive letter corresponding to
- where your drive is installed. Note that if you do this and you
- see a boot message like::
-
- hdX: ATAPI cdrom (?)
-
- this does _not_ mean that the driver has successfully detected
- the drive; rather, it means that the driver has not detected a
- drive, but is assuming there's one there anyway because you told
- it so. If you actually try to do I/O to a drive defined at a
- nonexistent or nonresponding I/O address, you'll probably get
- errors with a status value of 0xff.
-
- - Some IDE adapters require a nonstandard initialization sequence
- before they'll function properly. (If this is the case, there
- will often be a separate MS-DOS driver just for the controller.)
- IDE interfaces on sound cards often fall into this category.
-
- Support for some interfaces needing extra initialization is
- provided in later 1.3.x kernels. You may need to turn on
- additional kernel configuration options to get them to work;
- see Documentation/ide/ide.rst.
-
- Even if support is not available for your interface, you may be
- able to get it to work with the following procedure. First boot
- MS-DOS and load the appropriate drivers. Then warm-boot linux
- (i.e., without powering off). If this works, it can be automated
- by running loadlin from the MS-DOS autoexec.
-
-
-b. Timeout/IRQ errors.
-
- - If you always get timeout errors, interrupts from the drive are
- probably not making it to the host.
-
- - IRQ problems may also be indicated by the message
- `IRQ probe failed (<n>)` while booting. If <n> is zero, that
- means that the system did not see an interrupt from the drive when
- it was expecting one (on any feasible IRQ). If <n> is negative,
- that means the system saw interrupts on multiple IRQ lines, when
- it was expecting to receive just one from the CDROM drive.
-
- - Double-check your hardware configuration to make sure that the IRQ
- number of your IDE interface matches what the driver expects.
- (The usual assignments are 14 for the primary (0x1f0) interface
- and 15 for the secondary (0x170) interface.) Also be sure that
- you don't have some other hardware which might be conflicting with
- the IRQ you're using. Also check the BIOS setup for your system;
- some have the ability to disable individual IRQ levels, and I've
- had one report of a system which was shipped with IRQ 15 disabled
- by default.
-
- - Note that many MS-DOS CDROM drivers will still function even if
- there are hardware problems with the interrupt setup; they
- apparently don't use interrupts.
-
- - If you own a Pioneer DR-A24X, you _will_ get nasty error messages
- on boot such as "irq timeout: status=0x50 { DriveReady SeekComplete }"
- The Pioneer DR-A24X CDROM drives are fairly popular these days.
- Unfortunately, these drives seem to become very confused when we perform
- the standard Linux ATA disk drive probe. If you own one of these drives,
- you can bypass the ATA probing which confuses these CDROM drives, by
- adding `append="hdX=noprobe hdX=cdrom"` to your lilo.conf file and running
- lilo (again where X is the drive letter corresponding to where your drive
- is installed.)
-
-c. System hangups.
-
- - If the system locks up when you try to access the CDROM, the most
- likely cause is that you have a buggy IDE adapter which doesn't
- properly handle simultaneous transactions on multiple interfaces.
- The most notorious of these is the CMD640B chip. This problem can
- be worked around by specifying the `serialize` option when
- booting. Recent kernels should be able to detect the need for
- this automatically in most cases, but the detection is not
- foolproof. See Documentation/ide/ide.rst for more information
- about the `serialize` option and the CMD640B.
-
- - Note that many MS-DOS CDROM drivers will work with such buggy
- hardware, apparently because they never attempt to overlap CDROM
- operations with other disk activity.
-
-
-d. Can't mount a CDROM.
-
- - If you get errors from mount, it may help to check `dmesg` to see
- if there are any more specific errors from the driver or from the
- filesystem.
-
- - Make sure there's a CDROM loaded in the drive, and that's it's an
- ISO 9660 disc. You can't mount an audio CD.
-
- - With the CDROM in the drive and unmounted, try something like::
-
- cat /dev/cdrom | od | more
-
- If you see a dump, then the drive and driver are probably working
- OK, and the problem is at the filesystem level (i.e., the CDROM is
- not ISO 9660 or has errors in the filesystem structure).
-
- - If you see `not a block device` errors, check that the definitions
- of the device special files are correct. They should be as
- follows::
-
- brw-rw---- 1 root disk 3, 0 Nov 11 18:48 /dev/hda
- brw-rw---- 1 root disk 3, 64 Nov 11 18:48 /dev/hdb
- brw-rw---- 1 root disk 22, 0 Nov 11 18:48 /dev/hdc
- brw-rw---- 1 root disk 22, 64 Nov 11 18:48 /dev/hdd
-
- Some early Slackware releases had these defined incorrectly. If
- these are wrong, you can remake them by running the script
- scripts/MAKEDEV.ide. (You may have to make it executable
- with chmod first.)
-
- If you have a /dev/cdrom symbolic link, check that it is pointing
- to the correct device file.
-
- If you hear people talking of the devices `hd1a` and `hd1b`, these
- were old names for what are now called hdc and hdd. Those names
- should be considered obsolete.
-
- - If mount is complaining that the iso9660 filesystem is not
- available, but you know it is (check /proc/filesystems), you
- probably need a newer version of mount. Early versions would not
- always give meaningful error messages.
-
-
-e. Directory listings are unpredictably truncated, and `dmesg` shows
- `buffer botch` error messages from the driver.
-
- - There was a bug in the version of the driver in 1.2.x kernels
- which could cause this. It was fixed in 1.3.0. If you can't
- upgrade, you can probably work around the problem by specifying a
- blocksize of 2048 when mounting. (Note that you won't be able to
- directly execute binaries off the CDROM in that case.)
-
- If you see this in kernels later than 1.3.0, please report it as a
- bug.
-
-
-f. Data corruption.
-
- - Random data corruption was occasionally observed with the Hitachi
- CDR-7730 CDROM. If you experience data corruption, using "hdx=slow"
- as a command line parameter may work around the problem, at the
- expense of low system performance.
-
-
-5. cdchange.c
--------------
-
-::
-
- /*
- * cdchange.c [-v] <device> [<slot>]
- *
- * This loads a CDROM from a specified slot in a changer, and displays
- * information about the changer status. The drive should be unmounted before
- * using this program.
- *
- * Changer information is displayed if either the -v flag is specified
- * or no slot was specified.
- *
- * Based on code originally from Gerhard Zuber <zuber@berlin.snafu.de>.
- * Changer status information, and rewrite for the new Uniform CDROM driver
- * interface by Erik Andersen <andersee@debian.org>.
- */
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <errno.h>
- #include <string.h>
- #include <unistd.h>
- #include <fcntl.h>
- #include <sys/ioctl.h>
- #include <linux/cdrom.h>
-
-
- int
- main (int argc, char **argv)
- {
- char *program;
- char *device;
- int fd; /* file descriptor for CD-ROM device */
- int status; /* return status for system calls */
- int verbose = 0;
- int slot=-1, x_slot;
- int total_slots_available;
-
- program = argv[0];
-
- ++argv;
- --argc;
-
- if (argc < 1 || argc > 3) {
- fprintf (stderr, "usage: %s [-v] <device> [<slot>]\n",
- program);
- fprintf (stderr, " Slots are numbered 1 -- n.\n");
- exit (1);
- }
-
- if (strcmp (argv[0], "-v") == 0) {
- verbose = 1;
- ++argv;
- --argc;
- }
-
- device = argv[0];
-
- if (argc == 2)
- slot = atoi (argv[1]) - 1;
-
- /* open device */
- fd = open(device, O_RDONLY | O_NONBLOCK);
- if (fd < 0) {
- fprintf (stderr, "%s: open failed for `%s`: %s\n",
- program, device, strerror (errno));
- exit (1);
- }
-
- /* Check CD player status */
- total_slots_available = ioctl (fd, CDROM_CHANGER_NSLOTS);
- if (total_slots_available <= 1 ) {
- fprintf (stderr, "%s: Device `%s` is not an ATAPI "
- "compliant CD changer.\n", program, device);
- exit (1);
- }
-
- if (slot >= 0) {
- if (slot >= total_slots_available) {
- fprintf (stderr, "Bad slot number. "
- "Should be 1 -- %d.\n",
- total_slots_available);
- exit (1);
- }
-
- /* load */
- slot=ioctl (fd, CDROM_SELECT_DISC, slot);
- if (slot<0) {
- fflush(stdout);
- perror ("CDROM_SELECT_DISC ");
- exit(1);
- }
- }
-
- if (slot < 0 || verbose) {
-
- status=ioctl (fd, CDROM_SELECT_DISC, CDSL_CURRENT);
- if (status<0) {
- fflush(stdout);
- perror (" CDROM_SELECT_DISC");
- exit(1);
- }
- slot=status;
-
- printf ("Current slot: %d\n", slot+1);
- printf ("Total slots available: %d\n",
- total_slots_available);
-
- printf ("Drive status: ");
- status = ioctl (fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
- if (status<0) {
- perror(" CDROM_DRIVE_STATUS");
- } else switch(status) {
- case CDS_DISC_OK:
- printf ("Ready.\n");
- break;
- case CDS_TRAY_OPEN:
- printf ("Tray Open.\n");
- break;
- case CDS_DRIVE_NOT_READY:
- printf ("Drive Not Ready.\n");
- break;
- default:
- printf ("This Should not happen!\n");
- break;
- }
-
- for (x_slot=0; x_slot<total_slots_available; x_slot++) {
- printf ("Slot %2d: ", x_slot+1);
- status = ioctl (fd, CDROM_DRIVE_STATUS, x_slot);
- if (status<0) {
- perror(" CDROM_DRIVE_STATUS");
- } else switch(status) {
- case CDS_DISC_OK:
- printf ("Disc present.");
- break;
- case CDS_NO_DISC:
- printf ("Empty slot.");
- break;
- case CDS_TRAY_OPEN:
- printf ("CD-ROM tray open.\n");
- break;
- case CDS_DRIVE_NOT_READY:
- printf ("CD-ROM drive not ready.\n");
- break;
- case CDS_NO_INFO:
- printf ("No Information available.");
- break;
- default:
- printf ("This Should not happen!\n");
- break;
- }
- if (slot == x_slot) {
- status = ioctl (fd, CDROM_DISC_STATUS);
- if (status<0) {
- perror(" CDROM_DISC_STATUS");
- }
- switch (status) {
- case CDS_AUDIO:
- printf ("\tAudio disc.\t");
- break;
- case CDS_DATA_1:
- case CDS_DATA_2:
- printf ("\tData disc type %d.\t", status-CDS_DATA_1+1);
- break;
- case CDS_XA_2_1:
- case CDS_XA_2_2:
- printf ("\tXA data disc type %d.\t", status-CDS_XA_2_1+1);
- break;
- default:
- printf ("\tUnknown disc type 0x%x!\t", status);
- break;
- }
- }
- status = ioctl (fd, CDROM_MEDIA_CHANGED, x_slot);
- if (status<0) {
- perror(" CDROM_MEDIA_CHANGED");
- }
- switch (status) {
- case 1:
- printf ("Changed.\n");
- break;
- default:
- printf ("\n");
- break;
- }
- }
- }
-
- /* close device */
- status = close (fd);
- if (status != 0) {
- fprintf (stderr, "%s: close failed for `%s`: %s\n",
- program, device, strerror (errno));
- exit (1);
- }
-
- exit (0);
- }
diff --git a/Documentation/cdrom/index.rst b/Documentation/cdrom/index.rst
index 338ad5f94e7c..e87a8785bc1a 100644
--- a/Documentation/cdrom/index.rst
+++ b/Documentation/cdrom/index.rst
@@ -8,7 +8,6 @@ cdrom
:maxdepth: 1
cdrom-standard
- ide-cd
packet-writing
.. only:: subproject and html
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 972d46a5ddf6..dedd4d853329 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -18,8 +18,10 @@ it.
kernel-api
workqueue
+ watch_queue
printk-basics
printk-formats
+ printk-index
symbol-namespaces
Data structures and low-level utilities
diff --git a/Documentation/core-api/printk-index.rst b/Documentation/core-api/printk-index.rst
new file mode 100644
index 000000000000..3062f37d119b
--- /dev/null
+++ b/Documentation/core-api/printk-index.rst
@@ -0,0 +1,137 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Printk Index
+============
+
+There are many ways to monitor the state of the system. One important
+source of information is the system log. It provides a lot of information,
+including more or less important warnings and error messages.
+
+There are monitoring tools that filter and take action based on messages
+logged.
+
+The kernel messages are evolving together with the code. As a result,
+particular kernel messages are not KABI and never will be!
+
+This is a huge challenge for maintaining system log monitors. It requires
+knowing what messages were updated in a particular kernel version and why.
+Finding these changes in the sources would require non-trivial parsers.
+Also it would require matching the sources with the binary kernel which
+is not always trivial. Various changes might be backported. Various kernel
+versions might be used on different monitored systems.
+
+This is where the printk index feature might become useful. It provides
+a dump of printk formats used all over the source code used for the kernel
+and modules on the running system. It is accessible at runtime via debugfs.
+
+The printk index helps to find changes in the message formats. Also it helps
+to track the strings back to the kernel sources and the related commit.
+
+
+User Interface
+==============
+
+The index of printk formats is split into separate files. The files are
+named according to the binaries where the printk formats are built-in. There
+is always "vmlinux" and optionally also modules, for example::
+
+ /sys/kernel/debug/printk/index/vmlinux
+ /sys/kernel/debug/printk/index/ext4
+ /sys/kernel/debug/printk/index/scsi_mod
+
+Note that only loaded modules are shown. Also printk formats from a module
+might appear in "vmlinux" when the module is built-in.
+
+The content is inspired by the dynamic debug interface and looks like::
+
+ $> head -1 /sys/kernel/debug/printk/index/vmlinux; shuf -n 5 vmlinux
+ # <level[,flags]> filename:line function "format"
+ <5> block/blk-settings.c:661 disk_stack_limits "%s: Warning: Device %s is misaligned\n"
+ <4> kernel/trace/trace.c:8296 trace_create_file "Could not create tracefs '%s' entry\n"
+ <6> arch/x86/kernel/hpet.c:144 _hpet_print_config "hpet: %s(%d):\n"
+ <6> init/do_mounts.c:605 prepare_namespace "Waiting for root device %s...\n"
+ <6> drivers/acpi/osl.c:1410 acpi_no_auto_serialize_setup "ACPI: auto-serialization disabled\n"
+
+, where the meaning is:
+
+ - :level: log level value: 0-7 for particular severity, -1 as default,
+ 'c' as continuous line without an explicit log level
+ - :flags: optional flags: currently only 'c' for KERN_CONT
+ - :filename\:line: source filename and line number of the related
+ printk() call. Note that there are many wrappers, for example,
+ pr_warn(), pr_warn_once(), dev_warn().
+ - :function: function name where the printk() call is used.
+ - :format: format string
+
+The extra information makes it a bit harder to find differences
+between various kernels. Especially the line number might change
+very often. On the other hand, it helps a lot to confirm that
+it is the same string or find the commit that is responsible
+for eventual changes.
+
+
+printk() Is Not a Stable KABI
+=============================
+
+Several developers are afraid that exporting all these implementation
+details into the user space will transform particular printk() calls
+into KABI.
+
+But it is exactly the opposite. printk() calls must _not_ be KABI.
+And the printk index helps user space tools to deal with this.
+
+
+Subsystem specific printk wrappers
+==================================
+
+The printk index is generated using extra metadata that are stored in
+a dedicated .elf section ".printk_index". It is achieved using macro
+wrappers doing __printk_index_emit() together with the real printk()
+call. The same technique is used also for the metadata used by
+the dynamic debug feature.
+
+The metadata are stored for a particular message only when it is printed
+using these special wrappers. It is implemented for the commonly
+used printk() calls, including, for example, pr_warn(), or pr_once().
+
+Additional changes are necessary for various subsystem specific wrappers
+that call the original printk() via a common helper function. These need
+their own wrappers adding __printk_index_emit().
+
+Only a few subsystem specific wrappers have been updated so far,
+for example, dev_printk(). As a result, the printk formats from
+some subsystems can be missing in the printk index.
+
+
+Subsystem specific prefix
+=========================
+
+The pr_fmt() macro allows defining a prefix that is printed
+before the string generated by the related printk() calls.
+
+Subsystem specific wrappers usually add even more complicated
+prefixes.
+
+These prefixes can be stored into the printk index metadata
+by an optional parameter of __printk_index_emit(). The debugfs
+interface might then show the printk formats including these prefixes.
+For example, drivers/acpi/osl.c contains::
+
+ #define pr_fmt(fmt) "ACPI: OSL: " fmt
+
+ static int __init acpi_no_auto_serialize_setup(char *str)
+ {
+ acpi_gbl_auto_serialize_methods = FALSE;
+ pr_info("Auto-serialization disabled\n");
+
+ return 1;
+ }
+
+This results in the following printk index entry::
+
+ <6> drivers/acpi/osl.c:1410 acpi_no_auto_serialize_setup "ACPI: OSL: Auto-serialization disabled\n"
+
+It helps to match messages from the real log with the printk index.
+Then the source file name, line number, and function name can
+be used to match the string with the source code.
diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
index 729e24864fe7..22ec68f24421 100644
--- a/Documentation/core-api/timekeeping.rst
+++ b/Documentation/core-api/timekeeping.rst
@@ -132,6 +132,7 @@ Some additional variants exist for more specialized cases:
.. c:function:: u64 ktime_get_mono_fast_ns( void )
u64 ktime_get_raw_fast_ns( void )
u64 ktime_get_boot_fast_ns( void )
+ u64 ktime_get_tai_fast_ns( void )
u64 ktime_get_real_fast_ns( void )
These variants are safe to call from any context, including from
diff --git a/Documentation/watch_queue.rst b/Documentation/core-api/watch_queue.rst
index 54f13ad5fc17..54f13ad5fc17 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/core-api/watch_queue.rst
diff --git a/Documentation/dev-tools/ktap.rst b/Documentation/dev-tools/ktap.rst
index 5ee735c6687f..d0a9565b0f44 100644
--- a/Documentation/dev-tools/ktap.rst
+++ b/Documentation/dev-tools/ktap.rst
@@ -115,34 +115,32 @@ The diagnostic data field is optional, and results which have neither a
directive nor any diagnostic data do not need to include the "#" field
separator.
-Example result lines include:
-
-.. code-block:: none
+Example result lines include::
ok 1 test_case_name
The test "test_case_name" passed.
-.. code-block:: none
+::
not ok 1 test_case_name
The test "test_case_name" failed.
-.. code-block:: none
+::
ok 1 test # SKIP necessary dependency unavailable
The test "test" was SKIPPED with the diagnostic message "necessary dependency
unavailable".
-.. code-block:: none
+::
not ok 1 test # TIMEOUT 30 seconds
The test "test" timed out, with diagnostic data "30 seconds".
-.. code-block:: none
+::
ok 5 check return code # rcode=0
@@ -202,7 +200,7 @@ allowed to be either indented or not indented.
An example of a test with two nested subtests:
-.. code-block:: none
+::
KTAP version 1
1..1
@@ -215,7 +213,7 @@ An example of a test with two nested subtests:
An example format with multiple levels of nested testing:
-.. code-block:: none
+::
KTAP version 1
1..2
@@ -250,7 +248,7 @@ nested version line, uses a line of the form
Example KTAP output
--------------------
-.. code-block:: none
+::
KTAP version 1
1..1
diff --git a/Documentation/dev-tools/kunit/api/index.rst b/Documentation/dev-tools/kunit/api/index.rst
index 3006cadcf44a..45ce04823f9f 100644
--- a/Documentation/dev-tools/kunit/api/index.rst
+++ b/Documentation/dev-tools/kunit/api/index.rst
@@ -6,6 +6,7 @@ API Reference
.. toctree::
test
+ resource
This section documents the KUnit kernel testing API. It is divided into the
following sections:
@@ -13,3 +14,7 @@ following sections:
Documentation/dev-tools/kunit/api/test.rst
- documents all of the standard testing API
+
+Documentation/dev-tools/kunit/api/resource.rst
+
+ - documents the KUnit resource API
diff --git a/Documentation/dev-tools/kunit/api/resource.rst b/Documentation/dev-tools/kunit/api/resource.rst
new file mode 100644
index 000000000000..0a94f831259e
--- /dev/null
+++ b/Documentation/dev-tools/kunit/api/resource.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Resource API
+============
+
+This file documents the KUnit resource API.
+
+Most users won't need to use this API directly, but power users can use it to
+store state on a per-test basis, register custom cleanup actions, and more.
+
+.. kernel-doc:: include/kunit/resource.h
+ :internal:
diff --git a/Documentation/dev-tools/kunit/architecture.rst b/Documentation/dev-tools/kunit/architecture.rst
index ff9c85a0bff2..cf9e6e3eeae4 100644
--- a/Documentation/dev-tools/kunit/architecture.rst
+++ b/Documentation/dev-tools/kunit/architecture.rst
@@ -125,7 +125,7 @@ All expectations/assertions are formatted as:
``void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch)``.
- ``kunit_try_catch_throw`` calls function:
- ``void complete_and_exit(struct completion *, long) __noreturn;``
+ ``void kthread_complete_and_exit(struct completion *, long) __noreturn;``
and terminates the special thread context.
- ``<op>`` denotes a check with options: ``TRUE`` (supplied property
diff --git a/Documentation/dev-tools/kunit/running_tips.rst b/Documentation/dev-tools/kunit/running_tips.rst
index 7b6d26a25959..c36f6760087d 100644
--- a/Documentation/dev-tools/kunit/running_tips.rst
+++ b/Documentation/dev-tools/kunit/running_tips.rst
@@ -114,6 +114,7 @@ Instead of enabling ``CONFIG_GCOV_KERNEL=y``, we can set these options:
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_INFO=y
+ CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_GCOV=y
@@ -122,7 +123,7 @@ Putting it together into a copy-pastable sequence of commands:
.. code-block:: bash
# Append coverage options to the current config
- $ echo -e "CONFIG_DEBUG_KERNEL=y\nCONFIG_DEBUG_INFO=y\nCONFIG_GCOV=y" >> .kunit/.kunitconfig
+ $ echo -e "CONFIG_DEBUG_KERNEL=y\nCONFIG_DEBUG_INFO=y\nCONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y\nCONFIG_GCOV=y" >> .kunit/.kunitconfig
$ ./tools/testing/kunit/kunit.py run
# Extract the coverage information from the build dir (.kunit/)
$ lcov -t "my_kunit_tests" -o coverage.info -c -d .kunit/
diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index 1c83e7d60a8a..d62a04255c2e 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -125,8 +125,8 @@ We need many test cases covering all the unit's behaviors. It is common to have
many similar tests. In order to reduce duplication in these closely related
tests, most unit testing frameworks (including KUnit) provide the concept of a
*test suite*. A test suite is a collection of test cases for a unit of code
-with a setup function that gets invoked before every test case and then a tear
-down function that gets invoked after every test case completes. For example:
+with optional setup and teardown functions that run before/after the whole
+suite and/or every test case. For example:
.. code-block:: c
@@ -141,16 +141,19 @@ down function that gets invoked after every test case completes. For example:
.name = "example",
.init = example_test_init,
.exit = example_test_exit,
+ .suite_init = example_suite_init,
+ .suite_exit = example_suite_exit,
.test_cases = example_test_cases,
};
kunit_test_suite(example_test_suite);
-In the above example, the test suite ``example_test_suite`` would run the test
-cases ``example_test_foo``, ``example_test_bar``, and ``example_test_baz``. Each
-would have ``example_test_init`` called immediately before it and
-``example_test_exit`` called immediately after it.
-``kunit_test_suite(example_test_suite)`` registers the test suite with the
-KUnit test framework.
+In the above example, the test suite ``example_test_suite`` would first run
+``example_suite_init``, then run the test cases ``example_test_foo``,
+``example_test_bar``, and ``example_test_baz``. Each would have
+``example_test_init`` called immediately before it and ``example_test_exit``
+called immediately after it. Finally, ``example_suite_exit`` would be called
+after everything else. ``kunit_test_suite(example_test_suite)`` registers the
+test suite with the KUnit test framework.
.. note::
A test case will only run if it is associated with a test suite.
diff --git a/Documentation/dev-tools/testing-overview.rst b/Documentation/dev-tools/testing-overview.rst
index 65feb81edb14..0aaf6ea53608 100644
--- a/Documentation/dev-tools/testing-overview.rst
+++ b/Documentation/dev-tools/testing-overview.rst
@@ -115,3 +115,66 @@ that none of these errors are occurring during the test.
Some of these tools integrate with KUnit or kselftest and will
automatically fail tests if an issue is detected.
+Static Analysis Tools
+=====================
+
+In addition to testing a running kernel, one can also analyze kernel source code
+directly (**at compile time**) using **static analysis** tools. The tools
+commonly used in the kernel allow one to inspect the whole source tree or just
+specific files within it. They make it easier to detect and fix problems during
+the development process.
+
+Sparse can help test the kernel by performing type-checking, lock checking,
+value range checking, in addition to reporting various errors and warnings while
+examining the code. See the Documentation/dev-tools/sparse.rst documentation
+page for details on how to use it.
+
+Smatch extends Sparse and provides additional checks for programming logic
+mistakes such as missing breaks in switch statements, unused return values on
+error checking, forgetting to set an error code in the return of an error path,
+etc. Smatch also has tests against more serious issues such as integer
+overflows, null pointer dereferences, and memory leaks. See the project page at
+http://smatch.sourceforge.net/.
+
+Coccinelle is another static analyzer at our disposal. Coccinelle is often used
+to aid refactoring and collateral evolution of source code, but it can also help
+to avoid certain bugs that occur in common code patterns. The types of tests
+available include API tests, tests for correct usage of kernel iterators, checks
+for the soundness of free operations, analysis of locking behavior, and further
+tests known to help keep consistent kernel usage. See the
+Documentation/dev-tools/coccinelle.rst documentation page for details.
+
+Beware, though, that static analysis tools suffer from **false positives**.
+Errors and warnings need to be evaluated carefully before attempting to fix them.
+
+When to use Sparse and Smatch
+-----------------------------
+
+Sparse does type checking, such as verifying that annotated variables do not
+cause endianness bugs, detecting places that use ``__user`` pointers improperly,
+and analyzing the compatibility of symbol initializers.
+
+Smatch does flow analysis and, if allowed to build the function database, it
+also does cross function analysis. Smatch tries to answer questions like where
+is this buffer allocated? How big is it? Can this index be controlled by the
+user? Is this variable larger than that variable?
+
+It's generally easier to write checks in Smatch than it is to write checks in
+Sparse. Nevertheless, there are some overlaps between Sparse and Smatch checks.
+
+Strong points of Smatch and Coccinelle
+--------------------------------------
+
+Coccinelle is probably the easiest for writing checks. It works before the
+pre-processor so it's easier to check for bugs in macros using Coccinelle.
+Coccinelle also creates patches for you, which no other tool does.
+
+For example, with Coccinelle you can do a mass conversion from
+``kmalloc(x * size, GFP_KERNEL)`` to ``kmalloc_array(x, size, GFP_KERNEL)``, and
+that's really useful. If you just created a Smatch warning and try to push the
+work of converting on to the maintainers they would be annoyed. You'd have to
+argue about each warning, whether it can really overflow or not.
+
+Coccinelle does no analysis of variable values, which is the strong point of
+Smatch. On the other hand, Coccinelle allows you to do simple things in a simple
+way.
diff --git a/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml b/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml
index c060c7914cae..c4e4a9eab658 100644
--- a/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml
+++ b/Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml
@@ -26,6 +26,7 @@ properties:
- items:
- enum:
- renesas,sata-r8a774b1 # RZ/G2N
+ - renesas,sata-r8a774e1 # RZ/G2H
- renesas,sata-r8a7795 # R-Car H3
- renesas,sata-r8a77965 # R-Car M3-N
- const: renesas,rcar-gen3-sata # generic R-Car Gen3 or RZ/G2
diff --git a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
deleted file mode 100644
index 58fc8a6cebc7..000000000000
--- a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-* Rockchip rk3399 DMC (Dynamic Memory Controller) device
-
-Required properties:
-- compatible: Must be "rockchip,rk3399-dmc".
-- devfreq-events: Node to get DDR loading, Refer to
- Documentation/devicetree/bindings/devfreq/event/
- rockchip-dfi.txt
-- clocks: Phandles for clock specified in "clock-names" property
-- clock-names : The name of clock used by the DFI, must be
- "pclk_ddr_mon";
-- operating-points-v2: Refer to Documentation/devicetree/bindings/opp/opp-v2.yaml
- for details.
-- center-supply: DMC supply node.
-- status: Marks the node enabled/disabled.
-- rockchip,pmu: Phandle to the syscon managing the "PMU general register
- files".
-
-Optional properties:
-- interrupts: The CPU interrupt number. The interrupt specifier
- format depends on the interrupt controller.
- It should be a DCF interrupt. When DDR DVFS finishes
- a DCF interrupt is triggered.
-- rockchip,pmu: Phandle to the syscon managing the "PMU general register
- files".
-
-Following properties relate to DDR timing:
-
-- rockchip,dram_speed_bin : Value reference include/dt-bindings/clock/rk3399-ddr.h,
- it selects the DDR3 cl-trp-trcd type. It must be
- set according to "Speed Bin" in DDR3 datasheet,
- DO NOT use a smaller "Speed Bin" than specified
- for the DDR3 being used.
-
-- rockchip,pd_idle : Configure the PD_IDLE value. Defines the
- power-down idle period in which memories are
- placed into power-down mode if bus is idle
- for PD_IDLE DFI clock cycles.
-
-- rockchip,sr_idle : Configure the SR_IDLE value. Defines the
- self-refresh idle period in which memories are
- placed into self-refresh mode if bus is idle
- for SR_IDLE * 1024 DFI clock cycles (DFI
- clocks freq is half of DRAM clock), default
- value is "0".
-
-- rockchip,sr_mc_gate_idle : Defines the memory self-refresh and controller
- clock gating idle period. Memories are placed
- into self-refresh mode and memory controller
- clock arg gating started if bus is idle for
- sr_mc_gate_idle*1024 DFI clock cycles.
-
-- rockchip,srpd_lite_idle : Defines the self-refresh power down idle
- period in which memories are placed into
- self-refresh power down mode if bus is idle
- for srpd_lite_idle * 1024 DFI clock cycles.
- This parameter is for LPDDR4 only.
-
-- rockchip,standby_idle : Defines the standby idle period in which
- memories are placed into self-refresh mode.
- The controller, pi, PHY and DRAM clock will
- be gated if bus is idle for standby_idle * DFI
- clock cycles.
-
-- rockchip,dram_dll_dis_freq : Defines the DDR3 DLL bypass frequency in MHz.
- When DDR frequency is less than DRAM_DLL_DISB_FREQ,
- DDR3 DLL will be bypassed. Note: if DLL was bypassed,
- the odt will also stop working.
-
-- rockchip,phy_dll_dis_freq : Defines the PHY dll bypass frequency in
- MHz (Mega Hz). When DDR frequency is less than
- DRAM_DLL_DISB_FREQ, PHY DLL will be bypassed.
- Note: PHY DLL and PHY ODT are independent.
-
-- rockchip,ddr3_odt_dis_freq : When the DRAM type is DDR3, this parameter defines
- the ODT disable frequency in MHz (Mega Hz).
- when the DDR frequency is less then ddr3_odt_dis_freq,
- the ODT on the DRAM side and controller side are
- both disabled.
-
-- rockchip,ddr3_drv : When the DRAM type is DDR3, this parameter defines
- the DRAM side driver strength in ohms. Default
- value is 40.
-
-- rockchip,ddr3_odt : When the DRAM type is DDR3, this parameter defines
- the DRAM side ODT strength in ohms. Default value
- is 120.
-
-- rockchip,phy_ddr3_ca_drv : When the DRAM type is DDR3, this parameter defines
- the phy side CA line (incluing command line,
- address line and clock line) driver strength.
- Default value is 40.
-
-- rockchip,phy_ddr3_dq_drv : When the DRAM type is DDR3, this parameter defines
- the PHY side DQ line (including DQS/DQ/DM line)
- driver strength. Default value is 40.
-
-- rockchip,phy_ddr3_odt : When the DRAM type is DDR3, this parameter defines
- the PHY side ODT strength. Default value is 240.
-
-- rockchip,lpddr3_odt_dis_freq : When the DRAM type is LPDDR3, this parameter defines
- then ODT disable frequency in MHz (Mega Hz).
- When DDR frequency is less then ddr3_odt_dis_freq,
- the ODT on the DRAM side and controller side are
- both disabled.
-
-- rockchip,lpddr3_drv : When the DRAM type is LPDDR3, this parameter defines
- the DRAM side driver strength in ohms. Default
- value is 34.
-
-- rockchip,lpddr3_odt : When the DRAM type is LPDDR3, this parameter defines
- the DRAM side ODT strength in ohms. Default value
- is 240.
-
-- rockchip,phy_lpddr3_ca_drv : When the DRAM type is LPDDR3, this parameter defines
- the PHY side CA line (including command line,
- address line and clock line) driver strength.
- Default value is 40.
-
-- rockchip,phy_lpddr3_dq_drv : When the DRAM type is LPDDR3, this parameter defines
- the PHY side DQ line (including DQS/DQ/DM line)
- driver strength. Default value is 40.
-
-- rockchip,phy_lpddr3_odt : When dram type is LPDDR3, this parameter define
- the phy side odt strength, default value is 240.
-
-- rockchip,lpddr4_odt_dis_freq : When the DRAM type is LPDDR4, this parameter
- defines the ODT disable frequency in
- MHz (Mega Hz). When the DDR frequency is less then
- ddr3_odt_dis_freq, the ODT on the DRAM side and
- controller side are both disabled.
-
-- rockchip,lpddr4_drv : When the DRAM type is LPDDR4, this parameter defines
- the DRAM side driver strength in ohms. Default
- value is 60.
-
-- rockchip,lpddr4_dq_odt : When the DRAM type is LPDDR4, this parameter defines
- the DRAM side ODT on DQS/DQ line strength in ohms.
- Default value is 40.
-
-- rockchip,lpddr4_ca_odt : When the DRAM type is LPDDR4, this parameter defines
- the DRAM side ODT on CA line strength in ohms.
- Default value is 40.
-
-- rockchip,phy_lpddr4_ca_drv : When the DRAM type is LPDDR4, this parameter defines
- the PHY side CA line (including command address
- line) driver strength. Default value is 40.
-
-- rockchip,phy_lpddr4_ck_cs_drv : When the DRAM type is LPDDR4, this parameter defines
- the PHY side clock line and CS line driver
- strength. Default value is 80.
-
-- rockchip,phy_lpddr4_dq_drv : When the DRAM type is LPDDR4, this parameter defines
- the PHY side DQ line (including DQS/DQ/DM line)
- driver strength. Default value is 80.
-
-- rockchip,phy_lpddr4_odt : When the DRAM type is LPDDR4, this parameter defines
- the PHY side ODT strength. Default value is 60.
-
-Example:
- dmc_opp_table: dmc_opp_table {
- compatible = "operating-points-v2";
-
- opp00 {
- opp-hz = /bits/ 64 <300000000>;
- opp-microvolt = <900000>;
- };
- opp01 {
- opp-hz = /bits/ 64 <666000000>;
- opp-microvolt = <900000>;
- };
- };
-
- dmc: dmc {
- compatible = "rockchip,rk3399-dmc";
- devfreq-events = <&dfi>;
- interrupts = <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cru SCLK_DDRC>;
- clock-names = "dmc_clk";
- operating-points-v2 = <&dmc_opp_table>;
- center-supply = <&ppvar_centerlogic>;
- upthreshold = <15>;
- downdifferential = <10>;
- rockchip,ddr3_speed_bin = <21>;
- rockchip,pd_idle = <0x40>;
- rockchip,sr_idle = <0x2>;
- rockchip,sr_mc_gate_idle = <0x3>;
- rockchip,srpd_lite_idle = <0x4>;
- rockchip,standby_idle = <0x2000>;
- rockchip,dram_dll_dis_freq = <300>;
- rockchip,phy_dll_dis_freq = <125>;
- rockchip,auto_pd_dis_freq = <666>;
- rockchip,ddr3_odt_dis_freq = <333>;
- rockchip,ddr3_drv = <40>;
- rockchip,ddr3_odt = <120>;
- rockchip,phy_ddr3_ca_drv = <40>;
- rockchip,phy_ddr3_dq_drv = <40>;
- rockchip,phy_ddr3_odt = <240>;
- rockchip,lpddr3_odt_dis_freq = <333>;
- rockchip,lpddr3_drv = <34>;
- rockchip,lpddr3_odt = <240>;
- rockchip,phy_lpddr3_ca_drv = <40>;
- rockchip,phy_lpddr3_dq_drv = <40>;
- rockchip,phy_lpddr3_odt = <240>;
- rockchip,lpddr4_odt_dis_freq = <333>;
- rockchip,lpddr4_drv = <60>;
- rockchip,lpddr4_dq_odt = <40>;
- rockchip,lpddr4_ca_odt = <40>;
- rockchip,phy_lpddr4_ca_drv = <40>;
- rockchip,phy_lpddr4_ck_cs_drv = <80>;
- rockchip,phy_lpddr4_dq_drv = <80>;
- rockchip,phy_lpddr4_odt = <60>;
- };
diff --git a/Documentation/devicetree/bindings/hwmon/adt7475.yaml b/Documentation/devicetree/bindings/hwmon/adt7475.yaml
index 7d9c083632b9..22beb37f1bf1 100644
--- a/Documentation/devicetree/bindings/hwmon/adt7475.yaml
+++ b/Documentation/devicetree/bindings/hwmon/adt7475.yaml
@@ -61,6 +61,26 @@ patternProperties:
$ref: /schemas/types.yaml#/definitions/uint32
enum: [0, 1]
+ "adi,pin(5|10)-function":
+ description: |
+ Configures the function for pin 5 on the adi,adt7473 and adi,adt7475, or
+ pin 10 on the adi,adt7476 and adi,adt7490.
+ $ref: /schemas/types.yaml#/definitions/string
+ enum:
+ - pwm2
+ - smbalert#
+
+ "adi,pin(9|14)-function":
+ description: |
+ Configures the function for pin 9 on the adi,adt7473 and adi,adt7475, or
+ pin 14 on the adi,adt7476 and adi,adt7490.
+ $ref: /schemas/types.yaml#/definitions/string
+ enum:
+ - tach4
+ - therm#
+ - smbalert#
+ - gpio
+
required:
- compatible
- reg
@@ -79,6 +99,8 @@ examples:
adi,bypass-attenuator-in0 = <1>;
adi,bypass-attenuator-in1 = <0>;
adi,pwm-active-state = <1 0 1>;
+ adi,pin10-function = "smbalert#";
+ adi,pin14-function = "tach4";
};
};
diff --git a/Documentation/devicetree/bindings/hwmon/lm75.yaml b/Documentation/devicetree/bindings/hwmon/lm75.yaml
index 72980d083c21..8226e3b5d028 100644
--- a/Documentation/devicetree/bindings/hwmon/lm75.yaml
+++ b/Documentation/devicetree/bindings/hwmon/lm75.yaml
@@ -14,6 +14,7 @@ properties:
compatible:
enum:
- adi,adt75
+ - atmel,at30ts74
- dallas,ds1775
- dallas,ds75
- dallas,ds7505
diff --git a/Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml b/Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml
new file mode 100644
index 000000000000..390dd6755ff5
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/microchip,lan966x.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/microchip,lan966x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Microchip LAN966x Hardware Monitor
+
+maintainers:
+ - Michael Walle <michael@walle.cc>
+
+description: |
+ Microchip LAN966x temperature monitor and fan controller
+
+properties:
+ compatible:
+ enum:
+ - microchip,lan9668-hwmon
+
+ reg:
+ items:
+ - description: PVT registers
+ - description: FAN registers
+
+ reg-names:
+ items:
+ - const: pvt
+ - const: fan
+
+ clocks:
+ maxItems: 1
+
+ '#thermal-sensor-cells':
+ const: 0
+
+required:
+ - compatible
+ - reg
+ - reg-names
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ hwmon: hwmon@e2010180 {
+ compatible = "microchip,lan9668-hwmon";
+ reg = <0xe2010180 0xc>,
+ <0xe20042a8 0xc>;
+ reg-names = "pvt", "fan";
+ clocks = <&sys_clk>;
+ #thermal-sensor-cells = <0>;
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/national,lm90.yaml b/Documentation/devicetree/bindings/hwmon/national,lm90.yaml
index 30db92977937..b04657849852 100644
--- a/Documentation/devicetree/bindings/hwmon/national,lm90.yaml
+++ b/Documentation/devicetree/bindings/hwmon/national,lm90.yaml
@@ -34,6 +34,7 @@ properties:
- nxp,sa56004
- onnn,nct1008
- ti,tmp451
+ - ti,tmp461
- winbond,w83l771
@@ -52,10 +53,29 @@ properties:
vcc-supply:
description: phandle to the regulator that provides the +VCC supply
+ ti,extended-range-enable:
+ description: Set to enable extended range temperature.
+ type: boolean
+
required:
- compatible
- reg
+allOf:
+ - if:
+ not:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - adi,adt7461
+ - adi,adt7461a
+ - ti,tmp451
+ - ti,tmp461
+ then:
+ properties:
+ ti,extended-range-enable: false
+
additionalProperties: false
examples:
diff --git a/Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml b/Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml
new file mode 100644
index 000000000000..358b262431fc
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/nuvoton,nct6775.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+
+$id: http://devicetree.org/schemas/hwmon/nuvoton,nct6775.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Nuvoton NCT6775 and compatible Super I/O chips
+
+maintainers:
+ - Zev Weiss <zev@bewilderbeest.net>
+
+properties:
+ compatible:
+ enum:
+ - nuvoton,nct6106
+ - nuvoton,nct6116
+ - nuvoton,nct6775
+ - nuvoton,nct6776
+ - nuvoton,nct6779
+ - nuvoton,nct6791
+ - nuvoton,nct6792
+ - nuvoton,nct6793
+ - nuvoton,nct6795
+ - nuvoton,nct6796
+ - nuvoton,nct6797
+ - nuvoton,nct6798
+
+ reg:
+ maxItems: 1
+
+ nuvoton,tsi-channel-mask:
+ description:
+ Bitmask indicating which TSI temperature sensor channels are
+ active. LSB is TSI0, bit 1 is TSI1, etc.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ maximum: 0xff
+ default: 0
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ superio@4d {
+ compatible = "nuvoton,nct6779";
+ reg = <0x4d>;
+ nuvoton,tsi-channel-mask = <0x03>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml
new file mode 100644
index 000000000000..fe0ac08faa1a
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ti,tmp401.yaml
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/hwmon/ti,tmp401.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: TMP401, TMP411 and TMP43x temperature sensor
+
+maintainers:
+ - Guenter Roeck <linux@roeck-us.net>
+
+description: |
+ ±1°C Remote and Local temperature sensor
+
+ Datasheets:
+ https://www.ti.com/lit/ds/symlink/tmp401.pdf
+ https://www.ti.com/lit/ds/symlink/tmp411.pdf
+ https://www.ti.com/lit/ds/symlink/tmp431.pdf
+ https://www.ti.com/lit/ds/symlink/tmp435.pdf
+
+properties:
+ compatible:
+ enum:
+ - ti,tmp401
+ - ti,tmp411
+ - ti,tmp431
+ - ti,tmp432
+ - ti,tmp435
+
+ reg:
+ maxItems: 1
+
+ ti,extended-range-enable:
+ description:
+ When set, this sensor measures over extended temperature range.
+ type: boolean
+
+ ti,n-factor:
+ description:
+ value to be used for converting remote channel measurements to
+ temperature.
+ $ref: /schemas/types.yaml#/definitions/int32
+ items:
+ minimum: -128
+ maximum: 127
+
+ ti,beta-compensation:
+ description:
+ value to select beta correction range.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 15
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - ti,tmp401
+ then:
+ properties:
+ ti,n-factor: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - ti,tmp401
+ - ti,tmp411
+ then:
+ properties:
+ ti,beta-compensation: false
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "ti,tmp401";
+ reg = <0x4c>;
+ };
+ };
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@4c {
+ compatible = "ti,tmp431";
+ reg = <0x4c>;
+ ti,extended-range-enable;
+ ti,n-factor = <0x3b>;
+ ti,beta-compensation = <0x7>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml
index b1770640f94b..03ebd2665d07 100644
--- a/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml
+++ b/Documentation/devicetree/bindings/input/mediatek,mt6779-keypad.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Mediatek's Keypad Controller device tree bindings
maintainers:
- - Fengping Yu <fengping.yu@mediatek.com>
+ - Mattijs Korpershoek <mkorpershoek@baylibre.com>
allOf:
- $ref: "/schemas/input/matrix-keymap.yaml#"
diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml
index b7197f78e158..3912a89162f0 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml
+++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml
@@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: ARM Generic Interrupt Controller, version 3
maintainers:
- - Marc Zyngier <marc.zyngier@arm.com>
+ - Marc Zyngier <maz@kernel.org>
description: |
AArch64 SMP cores are often associated with a GICv3, providing Private
@@ -78,7 +78,11 @@ properties:
- GIC Hypervisor interface (GICH)
- GIC Virtual CPU interface (GICV)
- GICC, GICH and GICV are optional.
+ GICC, GICH and GICV are optional, but must be described if the CPUs
+ support them. Examples of such CPUs are ARM's implementations of the
+ ARMv8.0 architecture such as Cortex-A32, A34, A35, A53, A57, A72 and
+ A73 (this list is not exhaustive).
+
minItems: 2
maxItems: 4096 # Should be enough?
diff --git a/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.txt b/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.txt
deleted file mode 100644
index c4701f1eaaf6..000000000000
--- a/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-Dongwoon Anatech DW9807 voice coil lens driver
-
-DW9807 is a 10-bit DAC with current sink capability. It is intended for
-controlling voice coil lenses.
-
-Mandatory properties:
-
-- compatible: "dongwoon,dw9807-vcm"
-- reg: I2C slave address
diff --git a/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.yaml b/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.yaml
new file mode 100644
index 000000000000..aae246ca3fcf
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/i2c/dongwoon,dw9807-vcm.yaml
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (C) 2018, 2021 Intel Corporation
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/media/i2c/dongwoon,dw9807-vcm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Dongwoon Anatech DW9807 voice coil lens driver
+
+maintainers:
+ - Sakari Ailus <sakari.ailus@linux.intel.com>
+
+description: |
+ DW9807 is a 10-bit DAC with current sink capability. It is intended for
+ controlling voice coil lenses.
+
+properties:
+ compatible:
+ const: dongwoon,dw9807-vcm
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ lens@e {
+ compatible = "dongwoon,dw9807-vcm";
+ reg = <0x0e>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml b/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml
index afcf70947f7e..26d1807d0bb6 100644
--- a/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml
+++ b/Documentation/devicetree/bindings/media/i2c/sony,imx412.yaml
@@ -32,6 +32,15 @@ properties:
description: Clock frequency 6MHz, 12MHz, 18MHz, 24MHz or 27MHz
maxItems: 1
+ dovdd-supply:
+ description: Interface power supply.
+
+ avdd-supply:
+ description: Analog power supply.
+
+ dvdd-supply:
+ description: Digital power supply.
+
reset-gpios:
description: Reference to the GPIO connected to the XCLR pin, if any.
maxItems: 1
diff --git a/Documentation/devicetree/bindings/media/mediatek,vcodec-encoder.yaml b/Documentation/devicetree/bindings/media/mediatek,vcodec-encoder.yaml
index deb5b657a2d5..d36fcca04cbc 100644
--- a/Documentation/devicetree/bindings/media/mediatek,vcodec-encoder.yaml
+++ b/Documentation/devicetree/bindings/media/mediatek,vcodec-encoder.yaml
@@ -63,6 +63,9 @@ properties:
description:
Describes point to scp.
+ power-domains:
+ maxItems: 1
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml b/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml
index c73bf2352aca..440646e44c0d 100644
--- a/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml
+++ b/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml
@@ -47,7 +47,9 @@ description: |
properties:
compatible:
- const: mediatek,mt8192-vcodec-dec
+ enum:
+ - mediatek,mt8192-vcodec-dec
+ - mediatek,mt8186-vcodec-dec
reg:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/media/microchip,xisc.yaml b/Documentation/devicetree/bindings/media/microchip,xisc.yaml
index 086e1430af4f..3be8f64c3e21 100644
--- a/Documentation/devicetree/bindings/media/microchip,xisc.yaml
+++ b/Documentation/devicetree/bindings/media/microchip,xisc.yaml
@@ -67,7 +67,7 @@ properties:
remote-endpoint: true
bus-width:
- enum: [8, 9, 10, 11, 12]
+ enum: [8, 9, 10, 11, 12, 14]
default: 12
hsync-active:
diff --git a/Documentation/devicetree/bindings/media/rockchip,vdec.yaml b/Documentation/devicetree/bindings/media/rockchip,vdec.yaml
index 089f11d21b25..3bcfb8e12333 100644
--- a/Documentation/devicetree/bindings/media/rockchip,vdec.yaml
+++ b/Documentation/devicetree/bindings/media/rockchip,vdec.yaml
@@ -18,7 +18,9 @@ properties:
oneOf:
- const: rockchip,rk3399-vdec
- items:
- - const: rockchip,rk3228-vdec
+ - enum:
+ - rockchip,rk3228-vdec
+ - rockchip,rk3328-vdec
- const: rockchip,rk3399-vdec
reg:
diff --git a/Documentation/devicetree/bindings/media/rockchip-vpu.yaml b/Documentation/devicetree/bindings/media/rockchip-vpu.yaml
index bacb60a34989..6cc4d3e5a61d 100644
--- a/Documentation/devicetree/bindings/media/rockchip-vpu.yaml
+++ b/Documentation/devicetree/bindings/media/rockchip-vpu.yaml
@@ -23,6 +23,7 @@ properties:
- rockchip,rk3328-vpu
- rockchip,rk3399-vpu
- rockchip,px30-vpu
+ - rockchip,rk3568-vpu
- items:
- const: rockchip,rk3188-vpu
- const: rockchip,rk3066-vpu
diff --git a/Documentation/devicetree/bindings/media/video-interfaces.yaml b/Documentation/devicetree/bindings/media/video-interfaces.yaml
index 4391dce2caee..68c3b9871cf3 100644
--- a/Documentation/devicetree/bindings/media/video-interfaces.yaml
+++ b/Documentation/devicetree/bindings/media/video-interfaces.yaml
@@ -93,6 +93,7 @@ properties:
- 4 # MIPI CSI-2 D-PHY
- 5 # Parallel
- 6 # BT.656
+ - 7 # DPI
description:
Data bus type.
diff --git a/Documentation/devicetree/bindings/memory-controllers/fsl/fsl,ddr.yaml b/Documentation/devicetree/bindings/memory-controllers/fsl/fsl,ddr.yaml
index af5147f9da72..84f778a99546 100644
--- a/Documentation/devicetree/bindings/memory-controllers/fsl/fsl,ddr.yaml
+++ b/Documentation/devicetree/bindings/memory-controllers/fsl/fsl,ddr.yaml
@@ -25,12 +25,6 @@ properties:
- const: fsl,qoriq-memory-controller
- enum:
- fsl,bsc9132-memory-controller
- - fsl,8540-memory-controller
- - fsl,8541-memory-controller
- - fsl,8544-memory-controller
- - fsl,8548-memory-controller
- - fsl,8555-memory-controller
- - fsl,8568-memory-controller
- fsl,mpc8536-memory-controller
- fsl,mpc8540-memory-controller
- fsl,mpc8541-memory-controller
diff --git a/Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml b/Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml
new file mode 100644
index 000000000000..fb4920397d08
--- /dev/null
+++ b/Documentation/devicetree/bindings/memory-controllers/rockchip,rk3399-dmc.yaml
@@ -0,0 +1,384 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# %YAML 1.2
+---
+$id: http://devicetree.org/schemas/memory-controllers/rockchip,rk3399-dmc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Rockchip rk3399 DMC (Dynamic Memory Controller) device
+
+maintainers:
+ - Brian Norris <briannorris@chromium.org>
+
+properties:
+ compatible:
+ enum:
+ - rockchip,rk3399-dmc
+
+ devfreq-events:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Node to get DDR loading. Refer to
+ Documentation/devicetree/bindings/devfreq/event/rockchip-dfi.txt.
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ items:
+ - const: dmc_clk
+
+ operating-points-v2: true
+
+ center-supply:
+ description:
+ DMC regulator supply.
+
+ rockchip,pmu:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ Phandle to the syscon managing the "PMU general register files".
+
+ interrupts:
+ maxItems: 1
+ description:
+ The CPU interrupt number. It should be a DCF interrupt. When DDR DVFS
+ finishes, a DCF interrupt is triggered.
+
+ rockchip,ddr3_speed_bin:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ For values, reference include/dt-bindings/clock/rk3399-ddr.h. Selects the
+ DDR3 cl-trp-trcd type. It must be set according to "Speed Bin" in DDR3
+ datasheet; DO NOT use a smaller "Speed Bin" than specified for the DDR3
+ being used.
+
+ rockchip,pd_idle:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Configure the PD_IDLE value. Defines the power-down idle period in which
+ memories are placed into power-down mode if bus is idle for PD_IDLE DFI
+ clock cycles.
+ See also rockchip,pd-idle-ns.
+
+ rockchip,sr_idle:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Configure the SR_IDLE value. Defines the self-refresh idle period in
+ which memories are placed into self-refresh mode if bus is idle for
+ SR_IDLE * 1024 DFI clock cycles (DFI clocks freq is half of DRAM clock).
+ See also rockchip,sr-idle-ns.
+ default: 0
+
+ rockchip,sr_mc_gate_idle:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Defines the memory self-refresh and controller clock gating idle period.
+ Memories are placed into self-refresh mode and memory controller clock
+ arg gating started if bus is idle for sr_mc_gate_idle*1024 DFI clock
+ cycles.
+ See also rockchip,sr-mc-gate-idle-ns.
+
+ rockchip,srpd_lite_idle:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Defines the self-refresh power down idle period in which memories are
+ placed into self-refresh power down mode if bus is idle for
+ srpd_lite_idle * 1024 DFI clock cycles. This parameter is for LPDDR4
+ only.
+ See also rockchip,srpd-lite-idle-ns.
+
+ rockchip,standby_idle:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Defines the standby idle period in which memories are placed into
+ self-refresh mode. The controller, pi, PHY and DRAM clock will be gated
+ if bus is idle for standby_idle * DFI clock cycles.
+ See also rockchip,standby-idle-ns.
+
+ rockchip,dram_dll_dis_freq:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ Defines the DDR3 DLL bypass frequency in MHz. When DDR frequency is less
+ than DRAM_DLL_DISB_FREQ, DDR3 DLL will be bypassed.
+ Note: if DLL was bypassed, the odt will also stop working.
+
+ rockchip,phy_dll_dis_freq:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ Defines the PHY dll bypass frequency in MHz (Mega Hz). When DDR frequency
+ is less than DRAM_DLL_DISB_FREQ, PHY DLL will be bypassed.
+ Note: PHY DLL and PHY ODT are independent.
+
+ rockchip,auto_pd_dis_freq:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Defines the auto PD disable frequency in MHz.
+
+ rockchip,ddr3_odt_dis_freq:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 1000000 # In case anyone thought this was MHz.
+ description:
+ When the DRAM type is DDR3, this parameter defines the ODT disable
+ frequency in Hz. When the DDR frequency is less than ddr3_odt_dis_freq,
+ the ODT on the DRAM side and controller side are both disabled.
+
+ rockchip,ddr3_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is DDR3, this parameter defines the DRAM side drive
+ strength in ohms.
+ default: 40
+
+ rockchip,ddr3_odt:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is DDR3, this parameter defines the DRAM side ODT
+ strength in ohms.
+ default: 120
+
+ rockchip,phy_ddr3_ca_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is DDR3, this parameter defines the phy side CA line
+ (including command line, address line and clock line) drive strength.
+ default: 40
+
+ rockchip,phy_ddr3_dq_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is DDR3, this parameter defines the PHY side DQ line
+ (including DQS/DQ/DM line) drive strength.
+ default: 40
+
+ rockchip,phy_ddr3_odt:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is DDR3, this parameter defines the PHY side ODT
+ strength.
+ default: 240
+
+ rockchip,lpddr3_odt_dis_freq:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 1000000 # In case anyone thought this was MHz.
+ description:
+ When the DRAM type is LPDDR3, this parameter defines the ODT disable
+ frequency in Hz. When DDR frequency is less than ddr3_odt_dis_freq, the
+ ODT on the DRAM side and controller side are both disabled.
+
+ rockchip,lpddr3_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR3, this parameter defines the DRAM side drive
+ strength in ohms.
+ default: 34
+
+ rockchip,lpddr3_odt:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR3, this parameter defines the DRAM side ODT
+ strength in ohms.
+ default: 240
+
+ rockchip,phy_lpddr3_ca_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR3, this parameter defines the PHY side CA line
+ (including command line, address line and clock line) drive strength.
+ default: 40
+
+ rockchip,phy_lpddr3_dq_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR3, this parameter defines the PHY side DQ line
+ (including DQS/DQ/DM line) drive strength.
+ default: 40
+
+ rockchip,phy_lpddr3_odt:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When dram type is LPDDR3, this parameter defines the phy side odt
+ strength, default value is 240.
+
+ rockchip,lpddr4_odt_dis_freq:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 1000000 # In case anyone thought this was MHz.
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the ODT disable
+ frequency in Hz. When the DDR frequency is less than ddr3_odt_dis_freq,
+ the ODT on the DRAM side and controller side are both disabled.
+
+ rockchip,lpddr4_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the DRAM side drive
+ strength in ohms.
+ default: 60
+
+ rockchip,lpddr4_dq_odt:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the DRAM side ODT on
+ DQS/DQ line strength in ohms.
+ default: 40
+
+ rockchip,lpddr4_ca_odt:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the DRAM side ODT on
+ CA line strength in ohms.
+ default: 40
+
+ rockchip,phy_lpddr4_ca_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the PHY side CA line
+ (including command address line) drive strength.
+ default: 40
+
+ rockchip,phy_lpddr4_ck_cs_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the PHY side clock
+ line and CS line drive strength.
+ default: 80
+
+ rockchip,phy_lpddr4_dq_drv:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the PHY side DQ line
+ (including DQS/DQ/DM line) drive strength.
+ default: 80
+
+ rockchip,phy_lpddr4_odt:
+ deprecated: true
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ When the DRAM type is LPDDR4, this parameter defines the PHY side ODT
+ strength.
+ default: 60
+
+ rockchip,pd-idle-ns:
+ description:
+ Configure the PD_IDLE value in nanoseconds. Defines the power-down idle
+ period in which memories are placed into power-down mode if bus is idle
+ for PD_IDLE nanoseconds.
+
+ rockchip,sr-idle-ns:
+ description:
+ Configure the SR_IDLE value in nanoseconds. Defines the self-refresh idle
+ period in which memories are placed into self-refresh mode if bus is idle
+ for SR_IDLE nanoseconds.
+ default: 0
+
+ rockchip,sr-mc-gate-idle-ns:
+ description:
+ Defines the memory self-refresh and controller clock gating idle period in nanoseconds.
+ Memories are placed into self-refresh mode and memory controller clock
+ arg gating started if bus is idle for sr_mc_gate_idle nanoseconds.
+
+ rockchip,srpd-lite-idle-ns:
+ description:
+ Defines the self-refresh power down idle period in which memories are
+ placed into self-refresh power down mode if bus is idle for
+ srpd_lite_idle nanoseconds. This parameter is for LPDDR4 only.
+
+ rockchip,standby-idle-ns:
+ description:
+ Defines the standby idle period in which memories are placed into
+ self-refresh mode. The controller, pi, PHY and DRAM clock will be gated
+ if bus is idle for standby_idle nanoseconds.
+
+ rockchip,pd-idle-dis-freq-hz:
+ description:
+ Defines the power-down idle disable frequency in Hz. When the DDR
+ frequency is greater than pd-idle-dis-freq, power-down idle is disabled.
+ See also rockchip,pd-idle-ns.
+
+ rockchip,sr-idle-dis-freq-hz:
+ description:
+ Defines the self-refresh idle disable frequency in Hz. When the DDR
+ frequency is greater than sr-idle-dis-freq, self-refresh idle is
+ disabled. See also rockchip,sr-idle-ns.
+
+ rockchip,sr-mc-gate-idle-dis-freq-hz:
+ description:
+ Defines the self-refresh and memory-controller clock gating disable
+ frequency in Hz. When the DDR frequency is greater than
+ sr-mc-gate-idle-dis-freq, the clock will not be gated when idle. See also
+ rockchip,sr-mc-gate-idle-ns.
+
+ rockchip,srpd-lite-idle-dis-freq-hz:
+ description:
+ Defines the self-refresh power down idle disable frequency in Hz. When
+ the DDR frequency is greater than srpd-lite-idle-dis-freq, memory will
+ not be placed into self-refresh power down mode when idle. See also
+ rockchip,srpd-lite-idle-ns.
+
+ rockchip,standby-idle-dis-freq-hz:
+ description:
+ Defines the standby idle disable frequency in Hz. When the DDR frequency
+ is greater than standby-idle-dis-freq, standby idle is disabled. See also
+ rockchip,standby-idle-ns.
+
+required:
+ - compatible
+ - devfreq-events
+ - clocks
+ - clock-names
+ - operating-points-v2
+ - center-supply
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/clock/rk3399-cru.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ memory-controller {
+ compatible = "rockchip,rk3399-dmc";
+ devfreq-events = <&dfi>;
+ rockchip,pmu = <&pmu>;
+ interrupts = <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cru SCLK_DDRC>;
+ clock-names = "dmc_clk";
+ operating-points-v2 = <&dmc_opp_table>;
+ center-supply = <&ppvar_centerlogic>;
+ rockchip,pd-idle-ns = <160>;
+ rockchip,sr-idle-ns = <10240>;
+ rockchip,sr-mc-gate-idle-ns = <40960>;
+ rockchip,srpd-lite-idle-ns = <61440>;
+ rockchip,standby-idle-ns = <81920>;
+ rockchip,ddr3_odt_dis_freq = <333000000>;
+ rockchip,lpddr3_odt_dis_freq = <333000000>;
+ rockchip,lpddr4_odt_dis_freq = <333000000>;
+ rockchip,pd-idle-dis-freq-hz = <1000000000>;
+ rockchip,sr-idle-dis-freq-hz = <1000000000>;
+ rockchip,sr-mc-gate-idle-dis-freq-hz = <1000000000>;
+ rockchip,srpd-lite-idle-dis-freq-hz = <0>;
+ rockchip,standby-idle-dis-freq-hz = <928000000>;
+ };
diff --git a/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml b/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml
index dccd5ad96981..b672202fff4e 100644
--- a/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml
+++ b/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml
@@ -31,7 +31,7 @@ properties:
- const: brcm,sdhci-brcmstb
reg:
- minItems: 2
+ maxItems: 2
reg-names:
items:
@@ -65,15 +65,15 @@ unevaluatedProperties: false
examples:
- |
mmc@84b0000 {
- sd-uhs-sdr50;
- sd-uhs-ddr50;
- sd-uhs-sdr104;
- sdhci,auto-cmd12;
compatible = "brcm,bcm7216-sdhci",
"brcm,bcm7445-sdhci",
"brcm,sdhci-brcmstb";
reg = <0x84b0000 0x260>, <0x84b0300 0x200>;
reg-names = "host", "cfg";
+ sd-uhs-sdr50;
+ sd-uhs-ddr50;
+ sd-uhs-sdr104;
+ sdhci,auto-cmd12;
interrupts = <0x0 0x26 0x4>;
interrupt-names = "sdio0_0";
clocks = <&scmi_clk 245>;
@@ -81,6 +81,11 @@ examples:
};
mmc@84b1000 {
+ compatible = "brcm,bcm7216-sdhci",
+ "brcm,bcm7445-sdhci",
+ "brcm,sdhci-brcmstb";
+ reg = <0x84b1000 0x260>, <0x84b1300 0x200>;
+ reg-names = "host", "cfg";
mmc-ddr-1_8v;
mmc-hs200-1_8v;
mmc-hs400-1_8v;
@@ -88,11 +93,6 @@ examples:
supports-cqe;
non-removable;
bus-width = <0x8>;
- compatible = "brcm,bcm7216-sdhci",
- "brcm,bcm7445-sdhci",
- "brcm,sdhci-brcmstb";
- reg = <0x84b1000 0x260>, <0x84b1300 0x200>;
- reg-names = "host", "cfg";
interrupts = <0x0 0x27 0x4>;
interrupt-names = "sdio1_0";
clocks = <&scmi_clk 245>;
diff --git a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml
index 7dbbcae9485c..29339d0196ec 100644
--- a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml
+++ b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml
@@ -34,22 +34,47 @@ properties:
- fsl,imx6ull-usdhc
- fsl,imx7d-usdhc
- fsl,imx7ulp-usdhc
+ - fsl,imx8mm-usdhc
- fsl,imxrt1050-usdhc
- nxp,s32g2-usdhc
- items:
- enum:
+ - fsl,imx8mq-usdhc
+ - const: fsl,imx7d-usdhc
+ - items:
+ - enum:
+ - fsl,imx8mn-usdhc
+ - fsl,imx8mp-usdhc
+ - fsl,imx93-usdhc
+ - fsl,imx8ulp-usdhc
+ - const: fsl,imx8mm-usdhc
+ - items:
+ - enum:
+ - fsl,imx8qm-usdhc
+ - const: fsl,imx8qxp-usdhc
+ - items:
+ - enum:
+ - fsl,imx8dxl-usdhc
- fsl,imx8mm-usdhc
- fsl,imx8mn-usdhc
- fsl,imx8mp-usdhc
- - fsl,imx8mq-usdhc
- fsl,imx8qm-usdhc
- fsl,imx8qxp-usdhc
- const: fsl,imx7d-usdhc
+ deprecated: true
- items:
- enum:
- - fsl,imx93-usdhc
- - fsl,imx8ulp-usdhc
+ - fsl,imx8mn-usdhc
+ - fsl,imx8mp-usdhc
- const: fsl,imx8mm-usdhc
+ - const: fsl,imx7d-usdhc
+ deprecated: true
+ - items:
+ - enum:
+ - fsl,imx8qm-usdhc
+ - const: fsl,imx8qxp-usdhc
+ - const: fsl,imx7d-usdhc
+ deprecated: true
reg:
maxItems: 1
diff --git a/Documentation/devicetree/bindings/mmc/marvell,dove-sdhci.yaml b/Documentation/devicetree/bindings/mmc/marvell,dove-sdhci.yaml
new file mode 100644
index 000000000000..7c9c652ad59c
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/marvell,dove-sdhci.yaml
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mmc/marvell,dove-sdhci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell sdhci-dove controller
+
+maintainers:
+ - Adrian Hunter <adrian.hunter@intel.com>
+ - Ulf Hansson <ulf.hansson@linaro.org>
+
+allOf:
+ - $ref: mmc-controller.yaml#
+
+properties:
+ compatible:
+ const: marvell,dove-sdhci
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ minItems: 1
+ maxItems: 2
+
+ clocks:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ sdio0: mmc@92000 {
+ compatible = "marvell,dove-sdhci";
+ reg = <0x92000 0x100>;
+ interrupts = <35>;
+ clocks = <&gate_clk 9>;
+ };
diff --git a/Documentation/devicetree/bindings/mmc/marvell,orion-sdio.yaml b/Documentation/devicetree/bindings/mmc/marvell,orion-sdio.yaml
new file mode 100644
index 000000000000..8a97ded15aed
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/marvell,orion-sdio.yaml
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mmc/marvell,orion-sdio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell orion-sdio controller
+
+maintainers:
+ - Nicolas Pitre <nico@fluxnic.net>
+ - Ulf Hansson <ulf.hansson@linaro.org>
+
+allOf:
+ - $ref: mmc-controller.yaml#
+
+properties:
+ compatible:
+ const: marvell,orion-sdio
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ mmc@d00d4000 {
+ compatible = "marvell,orion-sdio";
+ reg = <0xd00d4000 0x200>;
+ interrupts = <54>;
+ clocks = <&gateclk 17>;
+ };
diff --git a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
deleted file mode 100644
index c51a62d751dc..000000000000
--- a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
+++ /dev/null
@@ -1,173 +0,0 @@
-Marvell Xenon SDHCI Controller device tree bindings
-This file documents differences between the core mmc properties
-described by mmc.txt and the properties used by the Xenon implementation.
-
-Multiple SDHCs might be put into a single Xenon IP, to save size and cost.
-Each SDHC is independent and owns independent resources, such as register sets,
-clock and PHY.
-Each SDHC should have an independent device tree node.
-
-Required Properties:
-- compatible: should be one of the following
- - "marvell,armada-3700-sdhci": For controllers on Armada-3700 SoC.
- Must provide a second register area and marvell,pad-type.
- - "marvell,armada-ap806-sdhci": For controllers on Armada AP806.
- - "marvell,armada-ap807-sdhci": For controllers on Armada AP807.
- - "marvell,armada-cp110-sdhci": For controllers on Armada CP110.
-
-- clocks:
- Array of clocks required for SDHC.
- Require at least input clock for Xenon IP core. For Armada AP806 and
- CP110, the AXI clock is also mandatory.
-
-- clock-names:
- Array of names corresponding to clocks property.
- The input clock for Xenon IP core should be named as "core".
- The input clock for the AXI bus must be named as "axi".
-
-- reg:
- * For "marvell,armada-3700-sdhci", two register areas.
- The first one for Xenon IP register. The second one for the Armada 3700 SoC
- PHY PAD Voltage Control register.
- Please follow the examples with compatible "marvell,armada-3700-sdhci"
- in below.
- Please also check property marvell,pad-type in below.
-
- * For other compatible strings, one register area for Xenon IP.
-
-Optional Properties:
-- marvell,xenon-sdhc-id:
- Indicate the corresponding bit index of current SDHC in
- SDHC System Operation Control Register Bit[7:0].
- Set/clear the corresponding bit to enable/disable current SDHC.
- If Xenon IP contains only one SDHC, this property is optional.
-
-- marvell,xenon-phy-type:
- Xenon support multiple types of PHYs.
- To select eMMC 5.1 PHY, set:
- marvell,xenon-phy-type = "emmc 5.1 phy"
- eMMC 5.1 PHY is the default choice if this property is not provided.
- To select eMMC 5.0 PHY, set:
- marvell,xenon-phy-type = "emmc 5.0 phy"
-
- All those types of PHYs can support eMMC, SD and SDIO.
- Please note that this property only presents the type of PHY.
- It doesn't stand for the entire SDHC type or property.
- For example, "emmc 5.1 phy" doesn't mean that this Xenon SDHC only
- supports eMMC 5.1.
-
-- marvell,xenon-phy-znr:
- Set PHY ZNR value.
- Only available for eMMC PHY.
- Valid range = [0:0x1F].
- ZNR is set as 0xF by default if this property is not provided.
-
-- marvell,xenon-phy-zpr:
- Set PHY ZPR value.
- Only available for eMMC PHY.
- Valid range = [0:0x1F].
- ZPR is set as 0xF by default if this property is not provided.
-
-- marvell,xenon-phy-nr-success-tun:
- Set the number of required consecutive successful sampling points
- used to identify a valid sampling window, in tuning process.
- Valid range = [1:7].
- Set as 0x4 by default if this property is not provided.
-
-- marvell,xenon-phy-tun-step-divider:
- Set the divider for calculating TUN_STEP.
- Set as 64 by default if this property is not provided.
-
-- marvell,xenon-phy-slow-mode:
- If this property is selected, transfers will bypass PHY.
- Only available when bus frequency lower than 55MHz in SDR mode.
- Disabled by default. Please only try this property if timing issues
- always occur with PHY enabled in eMMC HS SDR, SD SDR12, SD SDR25,
- SD Default Speed and HS mode and eMMC legacy speed mode.
-
-- marvell,xenon-tun-count:
- Xenon SDHC SoC usually doesn't provide re-tuning counter in
- Capabilities Register 3 Bit[11:8].
- This property provides the re-tuning counter.
- If this property is not set, default re-tuning counter will
- be set as 0x9 in driver.
-
-- marvell,pad-type:
- Type of Armada 3700 SoC PHY PAD Voltage Controller register.
- Only valid when "marvell,armada-3700-sdhci" is selected.
- Two types: "sd" and "fixed-1-8v".
- If "sd" is selected, SoC PHY PAD is set as 3.3V at the beginning and is
- switched to 1.8V when later in higher speed mode.
- If "fixed-1-8v" is selected, SoC PHY PAD is fixed 1.8V, such as for eMMC.
- Please follow the examples with compatible "marvell,armada-3700-sdhci"
- in below.
-
-Example:
-- For eMMC:
-
- sdhci@aa0000 {
- compatible = "marvell,armada-ap806-sdhci";
- reg = <0xaa0000 0x1000>;
- interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>
- clocks = <&emmc_clk>,<&axi_clk>;
- clock-names = "core", "axi";
- bus-width = <4>;
- marvell,xenon-phy-slow-mode;
- marvell,xenon-tun-count = <11>;
- non-removable;
- no-sd;
- no-sdio;
-
- /* Vmmc and Vqmmc are both fixed */
- };
-
-- For SD/SDIO:
-
- sdhci@ab0000 {
- compatible = "marvell,armada-cp110-sdhci";
- reg = <0xab0000 0x1000>;
- interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>
- vqmmc-supply = <&sd_vqmmc_regulator>;
- vmmc-supply = <&sd_vmmc_regulator>;
- clocks = <&sdclk>, <&axi_clk>;
- clock-names = "core", "axi";
- bus-width = <4>;
- marvell,xenon-tun-count = <9>;
- };
-
-- For eMMC with compatible "marvell,armada-3700-sdhci":
-
- sdhci@aa0000 {
- compatible = "marvell,armada-3700-sdhci";
- reg = <0xaa0000 0x1000>,
- <phy_addr 0x4>;
- interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>
- clocks = <&emmcclk>;
- clock-names = "core";
- bus-width = <8>;
- mmc-ddr-1_8v;
- mmc-hs400-1_8v;
- non-removable;
- no-sd;
- no-sdio;
-
- /* Vmmc and Vqmmc are both fixed */
-
- marvell,pad-type = "fixed-1-8v";
- };
-
-- For SD/SDIO with compatible "marvell,armada-3700-sdhci":
-
- sdhci@ab0000 {
- compatible = "marvell,armada-3700-sdhci";
- reg = <0xab0000 0x1000>,
- <phy_addr 0x4>;
- interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>
- vqmmc-supply = <&sd_regulator>;
- /* Vmmc is fixed */
- clocks = <&sdclk>;
- clock-names = "core";
- bus-width = <4>;
-
- marvell,pad-type = "sd";
- };
diff --git a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
new file mode 100644
index 000000000000..c79639e9027e
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml
@@ -0,0 +1,275 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mmc/marvell,xenon-sdhci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell Xenon SDHCI Controller
+
+description: |
+ This file documents differences between the core MMC properties described by
+ mmc-controller.yaml and the properties used by the Xenon implementation.
+
+ Multiple SDHCs might be put into a single Xenon IP, to save size and cost.
+ Each SDHC is independent and owns independent resources, such as register
+ sets, clock and PHY.
+
+ Each SDHC should have an independent device tree node.
+
+maintainers:
+ - Ulf Hansson <ulf.hansson@linaro.org>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - marvell,armada-cp110-sdhci
+ - marvell,armada-ap806-sdhci
+
+ - items:
+ - const: marvell,armada-ap807-sdhci
+ - const: marvell,armada-ap806-sdhci
+
+ - items:
+ - const: marvell,armada-3700-sdhci
+ - const: marvell,sdhci-xenon
+
+ reg:
+ minItems: 1
+ maxItems: 2
+ description: |
+ For "marvell,armada-3700-sdhci", two register areas. The first one
+ for Xenon IP register. The second one for the Armada 3700 SoC PHY PAD
+ Voltage Control register. Please follow the examples with compatible
+ "marvell,armada-3700-sdhci" in below.
+ Please also check property marvell,pad-type in below.
+
+ For other compatible strings, one register area for Xenon IP.
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ minItems: 1
+ items:
+ - const: core
+ - const: axi
+
+ marvell,xenon-sdhc-id:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 7
+ description: |
+ Indicate the corresponding bit index of current SDHC in SDHC System
+ Operation Control Register Bit[7:0]. Set/clear the corresponding bit to
+ enable/disable current SDHC.
+
+ marvell,xenon-phy-type:
+ $ref: /schemas/types.yaml#/definitions/string
+ enum:
+ - "emmc 5.1 phy"
+ - "emmc 5.0 phy"
+ description: |
+ Xenon supports multiple types of PHYs. To select eMMC 5.1 PHY, set:
+ marvell,xenon-phy-type = "emmc 5.1 phy" eMMC 5.1 PHY is the default
+ choice if this property is not provided. To select eMMC 5.0 PHY, set:
+ marvell,xenon-phy-type = "emmc 5.0 phy"
+
+ All those types of PHYs can support eMMC, SD and SDIO. Please note that
+ this property only presents the type of PHY. It doesn't stand for the
+ entire SDHC type or property. For example, "emmc 5.1 phy" doesn't mean
+ that this Xenon SDHC only supports eMMC 5.1.
+
+ marvell,xenon-phy-znr:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 0x1f
+ default: 0xf
+ description: |
+ Set PHY ZNR value.
+ Only available for eMMC PHY.
+
+ marvell,xenon-phy-zpr:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 0
+ maximum: 0x1f
+ default: 0xf
+ description: |
+ Set PHY ZPR value.
+ Only available for eMMC PHY.
+
+ marvell,xenon-phy-nr-success-tun:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ minimum: 1
+ maximum: 7
+ default: 0x4
+ description: |
+ Set the number of required consecutive successful sampling points
+ used to identify a valid sampling window, in tuning process.
+
+ marvell,xenon-phy-tun-step-divider:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ default: 64
+ description: |
+ Set the divider for calculating TUN_STEP.
+
+ marvell,xenon-phy-slow-mode:
+ type: boolean
+ description: |
+ If this property is selected, transfers will bypass PHY.
+ Only available when bus frequency is lower than 55MHz in SDR mode.
+ Disabled by default. Please only try this property if timing issues
+ always occur with PHY enabled in eMMC HS SDR, SD SDR12, SD SDR25,
+ SD Default Speed and HS mode and eMMC legacy speed mode.
+
+ marvell,xenon-tun-count:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ default: 0x9
+ description: |
+ Xenon SDHC SoC usually doesn't provide re-tuning counter in
+ Capabilities Register 3 Bit[11:8].
+ This property provides the re-tuning counter.
+
+allOf:
+ - $ref: mmc-controller.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: marvell,armada-3700-sdhci
+
+ then:
+ properties:
+ reg:
+ items:
+ - description: Xenon IP registers
+ - description: Armada 3700 SoC PHY PAD Voltage Control register
+ minItems: 2
+
+ marvell,pad-type:
+ $ref: /schemas/types.yaml#/definitions/string
+ enum:
+ - sd
+ - fixed-1-8v
+ description: |
+ Type of Armada 3700 SoC PHY PAD Voltage Controller register.
+ If "sd" is selected, SoC PHY PAD is set as 3.3V at the beginning
+ and is switched to 1.8V when later in higher speed mode.
+ If "fixed-1-8v" is selected, SoC PHY PAD is fixed 1.8V, such as for
+ eMMC.
+ Please follow the examples with compatible
+ "marvell,armada-3700-sdhci" in below.
+
+ required:
+ - marvell,pad-type
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - marvell,armada-cp110-sdhci
+ - marvell,armada-ap807-sdhci
+ - marvell,armada-ap806-sdhci
+
+ then:
+ properties:
+ clocks:
+ minItems: 2
+
+ clock-names:
+ items:
+ - const: core
+ - const: axi
+
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ // For eMMC
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ mmc@aa0000 {
+ compatible = "marvell,armada-ap807-sdhci", "marvell,armada-ap806-sdhci";
+ reg = <0xaa0000 0x1000>;
+ interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&emmc_clk 0>, <&axi_clk 0>;
+ clock-names = "core", "axi";
+ bus-width = <4>;
+ marvell,xenon-phy-slow-mode;
+ marvell,xenon-tun-count = <11>;
+ non-removable;
+ no-sd;
+ no-sdio;
+
+ /* Vmmc and Vqmmc are both fixed */
+ };
+
+ - |
+ // For SD/SDIO
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ mmc@ab0000 {
+ compatible = "marvell,armada-cp110-sdhci";
+ reg = <0xab0000 0x1000>;
+ interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
+ vqmmc-supply = <&sd_vqmmc_regulator>;
+ vmmc-supply = <&sd_vmmc_regulator>;
+ clocks = <&sdclk 0>, <&axi_clk 0>;
+ clock-names = "core", "axi";
+ bus-width = <4>;
+ marvell,xenon-tun-count = <9>;
+ };
+
+ - |
+ // For eMMC with compatible "marvell,armada-3700-sdhci":
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ mmc@aa0000 {
+ compatible = "marvell,armada-3700-sdhci", "marvell,sdhci-xenon";
+ reg = <0xaa0000 0x1000>,
+ <0x17808 0x4>;
+ interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&emmcclk 0>;
+ clock-names = "core";
+ bus-width = <8>;
+ mmc-ddr-1_8v;
+ mmc-hs400-1_8v;
+ non-removable;
+ no-sd;
+ no-sdio;
+
+ /* Vmmc and Vqmmc are both fixed */
+
+ marvell,pad-type = "fixed-1-8v";
+ };
+
+ - |
+ // For SD/SDIO with compatible "marvell,armada-3700-sdhci":
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ mmc@ab0000 {
+ compatible = "marvell,armada-3700-sdhci", "marvell,sdhci-xenon";
+ reg = <0xab0000 0x1000>,
+ <0x17808 0x4>;
+ interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
+ vqmmc-supply = <&sd_regulator>;
+ /* Vmmc is fixed */
+ clocks = <&sdclk 0>;
+ clock-names = "core";
+ bus-width = <4>;
+
+ marvell,pad-type = "sd";
+ };
diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml
index 513f3c8758aa..ff5ce89e5111 100644
--- a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml
+++ b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml
@@ -298,7 +298,10 @@ properties:
vqmmc-supply:
description:
- Supply for the bus IO line power
+ Supply for the bus IO line power, such as a level shifter.
+ If the level shifter is controlled by a GPIO line, this shall
+ be modeled as a "regulator-fixed" with a GPIO line for
+ switching the level shifter on/off.
mmc-pwrseq:
$ref: /schemas/types.yaml#/definitions/phandle
diff --git a/Documentation/devicetree/bindings/mmc/mtk-sd.yaml b/Documentation/devicetree/bindings/mmc/mtk-sd.yaml
index 297ada03e3de..2a2e9fa8c188 100644
--- a/Documentation/devicetree/bindings/mmc/mtk-sd.yaml
+++ b/Documentation/devicetree/bindings/mmc/mtk-sd.yaml
@@ -40,7 +40,10 @@ properties:
- const: mediatek,mt8183-mmc
reg:
- maxItems: 1
+ minItems: 1
+ items:
+ - description: base register (required).
+ - description: top base register (required for MT8183).
clocks:
description:
@@ -168,6 +171,16 @@ required:
- vmmc-supply
- vqmmc-supply
+if:
+ properties:
+ compatible:
+ contains:
+ const: mediatek,mt8183-mmc
+then:
+ properties:
+ reg:
+ minItems: 2
+
unevaluatedProperties: false
examples:
diff --git a/Documentation/devicetree/bindings/mmc/orion-sdio.txt b/Documentation/devicetree/bindings/mmc/orion-sdio.txt
deleted file mode 100644
index 10f0818a34c5..000000000000
--- a/Documentation/devicetree/bindings/mmc/orion-sdio.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-* Marvell orion-sdio controller
-
-This file documents differences between the core properties in mmc.txt
-and the properties used by the orion-sdio driver.
-
-- compatible: Should be "marvell,orion-sdio"
-- clocks: reference to the clock of the SDIO interface
-
-Example:
-
- mvsdio@d00d4000 {
- compatible = "marvell,orion-sdio";
- reg = <0xd00d4000 0x200>;
- interrupts = <54>;
- clocks = <&gateclk 17>;
- };
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml
index 0566493c4def..0ab07759b472 100644
--- a/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml
+++ b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml
@@ -186,6 +186,13 @@ properties:
description: Clock Delay Buffer Select
$ref: "/schemas/types.yaml#/definitions/uint32"
+ ti,fails-without-test-cd:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ When present, indicates that the CD line is not connected
+ and the controller is required to be forced into Test mode
+ to set the TESTCD bit.
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-dove.txt b/Documentation/devicetree/bindings/mmc/sdhci-dove.txt
deleted file mode 100644
index ae9aab9abcd7..000000000000
--- a/Documentation/devicetree/bindings/mmc/sdhci-dove.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-* Marvell sdhci-dove controller
-
-This file documents differences between the core properties in mmc.txt
-and the properties used by the sdhci-pxav2 and sdhci-pxav3 drivers.
-
-- compatible: Should be "marvell,dove-sdhci".
-
-Example:
-
-sdio0: sdio@92000 {
- compatible = "marvell,dove-sdhci";
- reg = <0x92000 0x100>;
- interrupts = <35>;
-};
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-msm.txt b/Documentation/devicetree/bindings/mmc/sdhci-msm.txt
deleted file mode 100644
index 6216ed777343..000000000000
--- a/Documentation/devicetree/bindings/mmc/sdhci-msm.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-* Qualcomm SDHCI controller (sdhci-msm)
-
-This file documents differences between the core properties in mmc.txt
-and the properties used by the sdhci-msm driver.
-
-Required properties:
-- compatible: Should contain a SoC-specific string and a IP version string:
- version strings:
- "qcom,sdhci-msm-v4" for sdcc versions less than 5.0
- "qcom,sdhci-msm-v5" for sdcc version 5.0
- For SDCC version 5.0.0, MCI registers are removed from SDCC
- interface and some registers are moved to HC. New compatible
- string is added to support this change - "qcom,sdhci-msm-v5".
- full compatible strings with SoC and version:
- "qcom,apq8084-sdhci", "qcom,sdhci-msm-v4"
- "qcom,msm8226-sdhci", "qcom,sdhci-msm-v4"
- "qcom,msm8953-sdhci", "qcom,sdhci-msm-v4"
- "qcom,msm8974-sdhci", "qcom,sdhci-msm-v4"
- "qcom,msm8916-sdhci", "qcom,sdhci-msm-v4"
- "qcom,msm8992-sdhci", "qcom,sdhci-msm-v4"
- "qcom,msm8994-sdhci", "qcom,sdhci-msm-v4"
- "qcom,msm8996-sdhci", "qcom,sdhci-msm-v4"
- "qcom,qcs404-sdhci", "qcom,sdhci-msm-v5"
- "qcom,sc7180-sdhci", "qcom,sdhci-msm-v5";
- "qcom,sc7280-sdhci", "qcom,sdhci-msm-v5";
- "qcom,sdm845-sdhci", "qcom,sdhci-msm-v5"
- "qcom,sdx55-sdhci", "qcom,sdhci-msm-v5";
- "qcom,sm8250-sdhci", "qcom,sdhci-msm-v5"
- NOTE that some old device tree files may be floating around that only
- have the string "qcom,sdhci-msm-v4" without the SoC compatible string
- but doing that should be considered a deprecated practice.
-
-- reg: Base address and length of the register in the following order:
- - Host controller register map (required)
- - SD Core register map (required for controllers earlier than msm-v5)
- - CQE register map (Optional, CQE support is present on SDHC instance meant
- for eMMC and version v4.2 and above)
- - Inline Crypto Engine register map (optional)
-- reg-names: When CQE register map is supplied, below reg-names are required
- - "hc" for Host controller register map
- - "core" for SD core register map
- - "cqhci" for CQE register map
- - "ice" for Inline Crypto Engine register map (optional)
-- interrupts: Should contain an interrupt-specifiers for the interrupts:
- - Host controller interrupt (required)
-- pinctrl-names: Should contain only one value - "default".
-- pinctrl-0: Should specify pin control groups used for this controller.
-- clocks: A list of phandle + clock-specifier pairs for the clocks listed in clock-names.
-- clock-names: Should contain the following:
- "iface" - Main peripheral bus clock (PCLK/HCLK - AHB Bus clock) (required)
- "core" - SDC MMC clock (MCLK) (required)
- "bus" - SDCC bus voter clock (optional)
- "xo" - TCXO clock (optional)
- "cal" - reference clock for RCLK delay calibration (optional)
- "sleep" - sleep clock for RCLK delay calibration (optional)
- "ice" - clock for Inline Crypto Engine (optional)
-
-- qcom,ddr-config: Certain chipsets and platforms require particular settings
- for the DDR_CONFIG register. Use this field to specify the register
- value as per the Hardware Programming Guide.
-
-- qcom,dll-config: Chipset and Platform specific value. Use this field to
- specify the DLL_CONFIG register value as per Hardware Programming Guide.
-
-Optional Properties:
-* Following bus parameters are required for interconnect bandwidth scaling:
-- interconnects: Pairs of phandles and interconnect provider specifier
- to denote the edge source and destination ports of
- the interconnect path.
-
-- interconnect-names: For sdhc, we have two main paths.
- 1. Data path : sdhc to ddr
- 2. Config path : cpu to sdhc
- For Data interconnect path the name supposed to be
- is "sdhc-ddr" and for config interconnect path it is
- "cpu-sdhc".
- Please refer to Documentation/devicetree/bindings/
- interconnect/ for more details.
-
-Example:
-
- sdhc_1: sdhci@f9824900 {
- compatible = "qcom,msm8974-sdhci", "qcom,sdhci-msm-v4";
- reg = <0xf9824900 0x11c>, <0xf9824000 0x800>;
- interrupts = <0 123 0>;
- bus-width = <8>;
- non-removable;
-
- vmmc-supply = <&pm8941_l20>;
- vqmmc-supply = <&pm8941_s3>;
-
- pinctrl-names = "default";
- pinctrl-0 = <&sdc1_clk &sdc1_cmd &sdc1_data>;
-
- clocks = <&gcc GCC_SDCC1_APPS_CLK>, <&gcc GCC_SDCC1_AHB_CLK>;
- clock-names = "core", "iface";
- interconnects = <&qnoc MASTER_SDCC_ID &qnoc SLAVE_DDR_ID>,
- <&qnoc MASTER_CPU_ID &qnoc SLAVE_SDCC_ID>;
- interconnect-names = "sdhc-ddr","cpu-sdhc";
-
- qcom,dll-config = <0x000f642c>;
- qcom,ddr-config = <0x80040868>;
- };
-
- sdhc_2: sdhci@f98a4900 {
- compatible = "qcom,msm8974-sdhci", "qcom,sdhci-msm-v4";
- reg = <0xf98a4900 0x11c>, <0xf98a4000 0x800>;
- interrupts = <0 125 0>;
- bus-width = <4>;
- cd-gpios = <&msmgpio 62 0x1>;
-
- vmmc-supply = <&pm8941_l21>;
- vqmmc-supply = <&pm8941_l13>;
-
- pinctrl-names = "default";
- pinctrl-0 = <&sdc2_clk &sdc2_cmd &sdc2_data>;
-
- clocks = <&gcc GCC_SDCC2_APPS_CLK>, <&gcc GCC_SDCC2_AHB_CLK>;
- clock-names = "core", "iface";
-
- qcom,dll-config = <0x0007642c>;
- qcom,ddr-config = <0x80040868>;
- };
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml b/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml
new file mode 100644
index 000000000000..e4236334e748
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/sdhci-msm.yaml
@@ -0,0 +1,194 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/mmc/sdhci-msm.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: Qualcomm SDHCI controller (sdhci-msm)
+
+maintainers:
+ - Bhupesh Sharma <bhupesh.sharma@linaro.org>
+
+description:
+ Secure Digital Host Controller Interface (SDHCI) present on
+ Qualcomm SOCs supports SD/MMC/SDIO devices.
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - qcom,apq8084-sdhci
+ - qcom,msm8226-sdhci
+ - qcom,msm8953-sdhci
+ - qcom,msm8974-sdhci
+ - qcom,msm8916-sdhci
+ - qcom,msm8992-sdhci
+ - qcom,msm8994-sdhci
+ - qcom,msm8996-sdhci
+ - qcom,qcs404-sdhci
+ - qcom,sc7180-sdhci
+ - qcom,sc7280-sdhci
+ - qcom,sdm630-sdhci
+ - qcom,sdm845-sdhci
+ - qcom,sdx55-sdhci
+ - qcom,sdx65-sdhci
+ - qcom,sm6125-sdhci
+ - qcom,sm6350-sdhci
+ - qcom,sm8150-sdhci
+ - qcom,sm8250-sdhci
+ - enum:
+ - qcom,sdhci-msm-v4 # for sdcc versions less than 5.0
+ - qcom,sdhci-msm-v5 # for sdcc version 5.0
+ - items:
+ - const: qcom,sdhci-msm-v4 # Deprecated (only for backward compatibility)
+ # for sdcc versions less than 5.0
+
+ reg:
+ minItems: 1
+ items:
+ - description: Host controller register map
+ - description: SD Core register map
+ - description: CQE register map
+ - description: Inline Crypto Engine register map
+
+ clocks:
+ minItems: 3
+ items:
+ - description: Main peripheral bus clock, PCLK/HCLK - AHB Bus clock
+ - description: SDC MMC clock, MCLK
+ - description: TCXO clock
+ - description: clock for Inline Crypto Engine
+ - description: SDCC bus voter clock
+ - description: reference clock for RCLK delay calibration
+ - description: sleep clock for RCLK delay calibration
+
+ clock-names:
+ minItems: 2
+ items:
+ - const: iface
+ - const: core
+ - const: xo
+ - const: ice
+ - const: bus
+ - const: cal
+ - const: sleep
+
+ interrupts:
+ maxItems: 2
+
+ interrupt-names:
+ items:
+ - const: hc_irq
+ - const: pwr_irq
+
+ pinctrl-names:
+ minItems: 1
+ items:
+ - const: default
+ - const: sleep
+
+ pinctrl-0:
+ description:
+ Should specify pin control groups used for this controller.
+
+ qcom,ddr-config:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: platform specific settings for DDR_CONFIG reg.
+
+ qcom,dll-config:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: platform specific settings for DLL_CONFIG reg.
+
+ iommus:
+ minItems: 1
+ maxItems: 8
+ description: |
+ phandle to apps_smmu node with sid mask.
+
+ interconnects:
+ items:
+ - description: data path, sdhc to ddr
+ - description: config path, cpu to sdhc
+
+ interconnect-names:
+ items:
+ - const: sdhc-ddr
+ - const: cpu-sdhc
+
+ power-domains:
+ description: A phandle to sdhci power domain node
+ maxItems: 1
+
+patternProperties:
+ '^opp-table(-[a-z0-9]+)?$':
+ if:
+ properties:
+ compatible:
+ const: operating-points-v2
+ then:
+ patternProperties:
+ '^opp-?[0-9]+$':
+ required:
+ - required-opps
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - interrupts
+
+additionalProperties: true
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/qcom,gcc-sm8250.h>
+ #include <dt-bindings/clock/qcom,rpmh.h>
+ #include <dt-bindings/power/qcom-rpmpd.h>
+
+ sdhc_2: sdhci@8804000 {
+ compatible = "qcom,sm8250-sdhci", "qcom,sdhci-msm-v5";
+ reg = <0 0x08804000 0 0x1000>;
+
+ interrupts = <GIC_SPI 204 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 222 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "hc_irq", "pwr_irq";
+
+ clocks = <&gcc GCC_SDCC2_AHB_CLK>,
+ <&gcc GCC_SDCC2_APPS_CLK>,
+ <&rpmhcc RPMH_CXO_CLK>;
+ clock-names = "iface", "core", "xo";
+ iommus = <&apps_smmu 0x4a0 0x0>;
+ qcom,dll-config = <0x0007642c>;
+ qcom,ddr-config = <0x80040868>;
+ power-domains = <&rpmhpd SM8250_CX>;
+
+ operating-points-v2 = <&sdhc2_opp_table>;
+
+ sdhc2_opp_table: opp-table {
+ compatible = "operating-points-v2";
+
+ opp-19200000 {
+ opp-hz = /bits/ 64 <19200000>;
+ required-opps = <&rpmhpd_opp_min_svs>;
+ };
+
+ opp-50000000 {
+ opp-hz = /bits/ 64 <50000000>;
+ required-opps = <&rpmhpd_opp_low_svs>;
+ };
+
+ opp-100000000 {
+ opp-hz = /bits/ 64 <100000000>;
+ required-opps = <&rpmhpd_opp_svs>;
+ };
+
+ opp-202000000 {
+ opp-hz = /bits/ 64 <202000000>;
+ required-opps = <&rpmhpd_opp_svs_l1>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml b/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml
index f300ced4cdf3..71f8e726d641 100644
--- a/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml
+++ b/Documentation/devicetree/bindings/mmc/snps,dwcmshc-sdhci.yaml
@@ -17,6 +17,7 @@ properties:
compatible:
enum:
- rockchip,rk3568-dwcmshc
+ - rockchip,rk3588-dwcmshc
- snps,dwcmshc-sdhci
reg:
diff --git a/Documentation/devicetree/bindings/mtd/aspeed-smc.txt b/Documentation/devicetree/bindings/mtd/aspeed-smc.txt
deleted file mode 100644
index 49f6528ef547..000000000000
--- a/Documentation/devicetree/bindings/mtd/aspeed-smc.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-* Aspeed Firmware Memory controller
-* Aspeed SPI Flash Memory Controller
-
-The Firmware Memory Controller in the Aspeed AST2500 SoC supports
-three chip selects, two of which are always of SPI type and the third
-can be SPI or NOR type flash. These bindings only describe SPI.
-
-The two SPI flash memory controllers in the AST2500 each support two
-chip selects.
-
-Required properties:
- - compatible : Should be one of
- "aspeed,ast2400-fmc" for the AST2400 Firmware Memory Controller
- "aspeed,ast2400-spi" for the AST2400 SPI Flash memory Controller
- "aspeed,ast2500-fmc" for the AST2500 Firmware Memory Controller
- "aspeed,ast2500-spi" for the AST2500 SPI flash memory controllers
-
- - reg : the first contains the control register location and length,
- the second contains the memory window mapping address and length
- - #address-cells : must be 1 corresponding to chip select child binding
- - #size-cells : must be 0 corresponding to chip select child binding
-
-Optional properties:
- - interrupts : Should contain the interrupt for the dma device if an
- FMC
-
-The child nodes are the SPI flash modules which must have a compatible
-property as specified in bindings/mtd/jedec,spi-nor.txt
-
-Optionally, the child node can contain properties for SPI mode (may be
-ignored):
- - spi-max-frequency - max frequency of spi bus
-
-
-Example:
-fmc: fmc@1e620000 {
- compatible = "aspeed,ast2500-fmc";
- reg = < 0x1e620000 0x94
- 0x20000000 0x02000000 >;
- #address-cells = <1>;
- #size-cells = <0>;
- interrupts = <19>;
- flash@0 {
- reg = < 0 >;
- compatible = "jedec,spi-nor";
- /* spi-max-frequency = <>; */
- /* m25p,fast-read; */
- #address-cells = <1>;
- #size-cells = <1>;
- };
-};
diff --git a/Documentation/devicetree/bindings/mtd/elm.txt b/Documentation/devicetree/bindings/mtd/elm.txt
deleted file mode 100644
index 59ddc61c1076..000000000000
--- a/Documentation/devicetree/bindings/mtd/elm.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-Error location module
-
-Required properties:
-- compatible: Must be "ti,am3352-elm"
-- reg: physical base address and size of the registers map.
-- interrupts: Interrupt number for the elm.
-
-Optional properties:
-- ti,hwmods: Name of the hwmod associated to the elm
-
-Example:
-elm: elm@0 {
- compatible = "ti,am3352-elm";
- reg = <0x48080000 0x2000>;
- interrupts = <4>;
-};
diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml
index 4abfb4cfc157..7149784a36ac 100644
--- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml
+++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml
@@ -50,10 +50,6 @@ properties:
minItems: 1
maxItems: 2
- spi-max-frequency: true
- spi-rx-bus-width: true
- spi-tx-bus-width: true
-
m25p,fast-read:
type: boolean
description:
@@ -74,8 +70,6 @@ properties:
be used on such systems, to denote the absence of a reliable reset
mechanism.
- label: true
-
partitions:
type: object
@@ -99,8 +93,6 @@ examples:
#size-cells = <0>;
flash@0 {
- #address-cells = <1>;
- #size-cells = <1>;
compatible = "spansion,m25p80", "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <40000000>;
diff --git a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml
index ea4cace6a955..ad3ccd250802 100644
--- a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml
+++ b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml
@@ -19,7 +19,11 @@ maintainers:
properties:
compatible:
- const: fixed-partitions
+ oneOf:
+ - const: fixed-partitions
+ - items:
+ - const: sercomm,sc-partitions
+ - const: fixed-partitions
"#address-cells": true
@@ -27,7 +31,24 @@ properties:
patternProperties:
"@[0-9a-f]+$":
- $ref: "partition.yaml#"
+ allOf:
+ - $ref: "partition.yaml#"
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: sercomm,sc-partitions
+ then:
+ properties:
+ sercomm,scpart-id:
+ description: Partition id in Sercomm partition map. Mtd
+ parser uses this id to find a record in the partition map
+ containing offset and size of the current partition. The
+ values from the partition map override the partition offset and
+ size defined in reg property of the dts. Frequently these
+ values are the same, but may differ if device has bad
+ eraseblocks on a flash.
+ $ref: /schemas/types.yaml#/definitions/uint32
required:
- "#address-cells"
@@ -52,6 +73,7 @@ examples:
reg = <0x0100000 0x200000>;
};
};
+
- |
partitions {
compatible = "fixed-partitions";
@@ -64,6 +86,7 @@ examples:
reg = <0x00000000 0x1 0x00000000>;
};
};
+
- |
partitions {
compatible = "fixed-partitions";
@@ -82,6 +105,7 @@ examples:
reg = <0x2 0x00000000 0x1 0x00000000>;
};
};
+
- |
partitions {
compatible = "fixed-partitions";
@@ -119,3 +143,30 @@ examples:
};
};
};
+
+ - |
+ partitions {
+ compatible = "sercomm,sc-partitions", "fixed-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "u-boot";
+ reg = <0x0 0x100000>;
+ sercomm,scpart-id = <0>;
+ read-only;
+ };
+
+ partition@100000 {
+ label = "dynamic partition map";
+ reg = <0x100000 0x100000>;
+ sercomm,scpart-id = <1>;
+ };
+
+ partition@200000 {
+ label = "Factory";
+ reg = <0x200000 0x100000>;
+ sercomm,scpart-id = <2>;
+ read-only;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/mtd/renesas-nandc.yaml b/Documentation/devicetree/bindings/mtd/renesas-nandc.yaml
index 2870d36361c4..7b18bc5cc8b3 100644
--- a/Documentation/devicetree/bindings/mtd/renesas-nandc.yaml
+++ b/Documentation/devicetree/bindings/mtd/renesas-nandc.yaml
@@ -36,11 +36,15 @@ properties:
- const: hclk
- const: eclk
+ power-domains:
+ maxItems: 1
+
required:
- compatible
- reg
- clocks
- clock-names
+ - power-domains
- interrupts
unevaluatedProperties: false
@@ -56,6 +60,7 @@ examples:
interrupts = <GIC_SPI 58 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&sysctrl R9A06G032_HCLK_NAND>, <&sysctrl R9A06G032_CLK_NAND>;
clock-names = "hclk", "eclk";
+ power-domains = <&sysctrl>;
#address-cells = <1>;
#size-cells = <0>;
};
diff --git a/Documentation/devicetree/bindings/mtd/ti,elm.yaml b/Documentation/devicetree/bindings/mtd/ti,elm.yaml
new file mode 100644
index 000000000000..87128c004596
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/ti,elm.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mtd/ti,elm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments Error Location Module (ELM).
+
+maintainers:
+ - Roger Quadros <rogerq@kernel.org>
+
+description:
+ ELM module is used together with GPMC and NAND Flash to detect
+ errors and the location of the error based on BCH algorithms
+ so they can be corrected if possible.
+
+properties:
+ compatible:
+ enum:
+ - ti,am3352-elm
+ - ti,am64-elm
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+ description: Functional clock.
+
+ clock-names:
+ items:
+ - const: fck
+
+ power-domains:
+ maxItems: 1
+
+ ti,hwmods:
+ description:
+ Name of the HWMOD associated with ELM. This is for legacy
+ platforms only.
+ $ref: /schemas/types.yaml#/definitions/string
+ deprecated: true
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: ti,am64-elm
+ then:
+ required:
+ - clocks
+ - clock-names
+ - power-domains
+
+additionalProperties: false
+
+examples:
+ - |
+ elm: ecc@0 {
+ compatible = "ti,am3352-elm";
+ reg = <0x0 0x2000>;
+ interrupts = <4>;
+ };
diff --git a/Documentation/devicetree/bindings/perf/arm,cmn.yaml b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
index 2d4219ec7eda..2e51072e794a 100644
--- a/Documentation/devicetree/bindings/perf/arm,cmn.yaml
+++ b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
@@ -14,6 +14,8 @@ properties:
compatible:
enum:
- arm,cmn-600
+ - arm,cmn-650
+ - arm,cmn-700
- arm,ci-700
reg:
diff --git a/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt b/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt
index 8a70696395a7..22ad012660e9 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt
+++ b/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt
@@ -6,12 +6,6 @@ The cache bindings explained below are Devicetree Specification compliant
Required Properties:
- compatible : Should include one of the following:
- "fsl,8540-l2-cache-controller"
- "fsl,8541-l2-cache-controller"
- "fsl,8544-l2-cache-controller"
- "fsl,8548-l2-cache-controller"
- "fsl,8555-l2-cache-controller"
- "fsl,8568-l2-cache-controller"
"fsl,b4420-l2-cache-controller"
"fsl,b4860-l2-cache-controller"
"fsl,bsc9131-l2-cache-controller"
diff --git a/Documentation/devicetree/bindings/regulator/mt6315-regulator.yaml b/Documentation/devicetree/bindings/regulator/mt6315-regulator.yaml
index 61dd5af80db6..5d2d989de893 100644
--- a/Documentation/devicetree/bindings/regulator/mt6315-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/mt6315-regulator.yaml
@@ -31,7 +31,7 @@ properties:
$ref: "regulator.yaml#"
properties:
- regulator-name:
+ regulator-compatible:
pattern: "^vbuck[1-4]$"
additionalProperties: false
diff --git a/Documentation/devicetree/bindings/regulator/mt6358-regulator.txt b/Documentation/devicetree/bindings/regulator/mt6358-regulator.txt
index 9a90a92f2d7e..7034cdca54e0 100644
--- a/Documentation/devicetree/bindings/regulator/mt6358-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/mt6358-regulator.txt
@@ -8,14 +8,14 @@ Documentation/devicetree/bindings/regulator/regulator.txt.
The valid names for regulators are::
BUCK:
- buck_vdram1, buck_vcore, buck_vpa, buck_vproc11, buck_vproc12, buck_vgpu,
- buck_vs2, buck_vmodem, buck_vs1
+ buck_vdram1, buck_vcore, buck_vcore_sshub, buck_vpa, buck_vproc11,
+ buck_vproc12, buck_vgpu, buck_vs2, buck_vmodem, buck_vs1
LDO:
ldo_vdram2, ldo_vsim1, ldo_vibr, ldo_vrf12, ldo_vio18, ldo_vusb, ldo_vcamio,
ldo_vcamd, ldo_vcn18, ldo_vfe28, ldo_vsram_proc11, ldo_vcn28, ldo_vsram_others,
- ldo_vsram_gpu, ldo_vxo22, ldo_vefuse, ldo_vaux18, ldo_vmch, ldo_vbif28,
- ldo_vsram_proc12, ldo_vcama1, ldo_vemc, ldo_vio28, ldo_va12, ldo_vrf18,
- ldo_vcn33_bt, ldo_vcn33_wifi, ldo_vcama2, ldo_vmc, ldo_vldo28, ldo_vaud28,
+ ldo_vsram_others_sshub, ldo_vsram_gpu, ldo_vxo22, ldo_vefuse, ldo_vaux18,
+ ldo_vmch, ldo_vbif28, ldo_vsram_proc12, ldo_vcama1, ldo_vemc, ldo_vio28, ldo_va12,
+ ldo_vrf18, ldo_vcn33_bt, ldo_vcn33_wifi, ldo_vcama2, ldo_vmc, ldo_vldo28, ldo_vaud28,
ldo_vsim2
Example:
@@ -354,5 +354,17 @@ Example:
regulator-max-microvolt = <3100000>;
regulator-enable-ramp-delay = <540>;
};
+
+ mt6358_vcore_sshub_reg: buck_vcore_sshub {
+ regulator-name = "vcore_sshub";
+ regulator-min-microvolt = <500000>;
+ regulator-max-microvolt = <1293750>;
+ };
+
+ mt6358_vsram_others_sshub_reg: ldo_vsram_others_sshub {
+ regulator-name = "vsram_others_sshub";
+ regulator-min-microvolt = <500000>;
+ regulator-max-microvolt = <1293750>;
+ };
};
};
diff --git a/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml b/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml
index f70f2e758a00..b539781e39aa 100644
--- a/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml
@@ -92,6 +92,17 @@ properties:
LDO5CTRL_L or LDO5CTRL_H register. Use this if the SD_VSEL signal is
connected to a host GPIO.
+ nxp,i2c-lt-enable:
+ type: boolean
+ description:
+ Indicates that the I2C Level Translator is used.
+
+ nxp,wdog_b-warm-reset:
+ type: boolean
+ description:
+ When WDOG_B signal is asserted a warm reset will be done instead of cold
+ reset.
+
required:
- compatible
- reg
diff --git a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml
index e28ee9e46788..9a36bee750af 100644
--- a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.yaml
@@ -7,7 +7,8 @@ $schema: http://devicetree.org/meta-schemas/core.yaml#
title: Qualcomm Technologies, Inc. RPMh Regulators
maintainers:
- - David Collins <collinsd@codeaurora.org>
+ - Bjorn Andersson <bjorn.andersson@linaro.org>
+ - Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
description: |
rpmh-regulator devices support PMIC regulator management via the Voltage
@@ -78,7 +79,7 @@ properties:
RPMh resource name suffix used for the regulators found
on this PMIC.
$ref: /schemas/types.yaml#/definitions/string
- enum: [a, b, c, d, e, f]
+ enum: [a, b, c, d, e, f, h, k]
qcom,always-wait-for-ack:
description: |
@@ -94,35 +95,264 @@ properties:
vdd-rgb-supply:
description: Input supply phandle of rgb.
- vin-lvs-1-2-supply:
- description: Input supply phandle of one or more regulators.
-
- vdd-bob-supply:
- description: BOB regulator parent supply phandle.
-
bob:
type: object
$ref: "regulator.yaml#"
description: BOB regulator node.
patternProperties:
- "^vdd-s([0-9]+)-supply$":
- description: Input supply phandle(s) of one or more regulators.
-
- "^vdd-(l[0-9]+[-]){1,5}supply$":
- description: Input supply phandle(s) of one or more regulators.
-
"^(smps|ldo|lvs)[0-9]+$":
type: object
$ref: "regulator.yaml#"
description: smps/ldo regulator nodes(s).
-additionalProperties: false
-
required:
- compatible
- qcom,pmic-id
+allOf:
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm6150-rpmh-regulators
+ then:
+ properties:
+ vdd-l2-l3-supply: true
+ vdd-l4-l7-l8-supply: true
+ vdd-l5-l16-l17-l18-l19-supply: true
+ vdd-l10-l14-l15-supply: true
+ vdd-l11-l12-l13-supply: true
+ patternProperties:
+ "^vdd-l[169]-supply$": true
+ "^vdd-s[1-5]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm6150l-rpmh-regulators
+ then:
+ properties:
+ vdd-bob-supply:
+ description: BOB regulator parent supply phandle.
+ vdd-l1-l8-supply: true
+ vdd-l2-l3-supply: true
+ vdd-l4-l5-l6-supply: true
+ vdd-l7-l11-supply: true
+ vdd-l9-l10-supply: true
+ patternProperties:
+ "^vdd-s[1-8]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm7325-rpmh-regulators
+ then:
+ properties:
+ vdd-l1-l4-l12-l15-supply: true
+ vdd-l2-l7-supply: true
+ vdd-l6-l9-l10-supply: true
+ vdd-l11-l17-l18-l19-supply: true
+ vdd-l13-supply: true
+ vdd-l14-l16-supply: true
+ patternProperties:
+ "^vdd-l[358]-supply$": true
+ "^vdd-s[1-8]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8005-rpmh-regulators
+ then:
+ patternProperties:
+ "^vdd-s[1-4]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8009-rpmh-regulators
+ - qcom,pm8009-1-rpmh-regulators
+ then:
+ properties:
+ vdd-l5-l6-supply: true
+ patternProperties:
+ "^vdd-l[1-47]-supply$": true
+ "^vdd-s[1-2]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8150-rpmh-regulators
+ - qcom,pmm8155au-rpmh-regulators
+ then:
+ properties:
+ vdd-l1-l8-l11-supply: true
+ vdd-l2-l10-supply: true
+ vdd-l3-l4-l5-l18-supply: true
+ vdd-l6-l9-supply: true
+ vdd-l7-l12-l14-l15-supply: true
+ vdd-l13-l16-l17-supply: true
+ patternProperties:
+ "^vdd-s([1-9]|10)-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8150l-rpmh-regulators
+ then:
+ properties:
+ vdd-bob-supply:
+ description: BOB regulator parent supply phandle.
+ vdd-l1-l8-supply: true
+ vdd-l2-l3-supply: true
+ vdd-l4-l5-l6-supply: true
+ vdd-l7-l11-supply: true
+ vdd-l9-l10-supply: true
+ patternProperties:
+ "^vdd-s[1-8]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8350-rpmh-regulators
+ then:
+ properties:
+ vdd-l1-l4-supply: true
+ vdd-l2-l7-supply: true
+ vdd-l3-l5-supply: true
+ vdd-l6-l9-l10-supply: true
+ vdd-l8-supply: true
+ patternProperties:
+ "^vdd-s([1-9]|1[0-2])-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8350c-rpmh-regulators
+ then:
+ properties:
+ vdd-bob-supply:
+ description: BOB regulator parent supply phandle.
+ vdd-l1-l12-supply: true
+ vdd-l2-l8-supply: true
+ vdd-l3-l4-l5-l7-l13-supply: true
+ vdd-l6-l9-l11-supply: true
+ vdd-l10-supply: true
+ patternProperties:
+ "^vdd-s([1-9]|10)-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8450-rpmh-regulators
+ then:
+ patternProperties:
+ "^vdd-l[1-4]-supply$": true
+ "^vdd-s[1-6]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pm8998-rpmh-regulators
+ then:
+ properties:
+ vdd-l1-l27-supply: true
+ vdd-l2-l8-l17-supply: true
+ vdd-l3-l11-supply: true
+ vdd-l4-l5-supply: true
+ vdd-l6-supply: true
+ vdd-l7-l12-l14-l15-supply: true
+ vdd-l9-supply: true
+ vdd-l10-l23-l25-supply: true
+ vdd-l13-l19-l21-supply: true
+ vdd-l16-l28-supply: true
+ vdd-l18-l22-supply: true
+ vdd-l20-l24-supply: true
+ vdd-l26-supply: true
+ vin-lvs-1-2-supply: true
+ patternProperties:
+ "^vdd-s([1-9]|1[0-3])-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pmg1110-rpmh-regulators
+ then:
+ properties:
+ vdd-s1-supply: true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pmi8998-rpmh-regulators
+ then:
+ properties:
+ vdd-bob-supply:
+ description: BOB regulator parent supply phandle.
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pmr735a-rpmh-regulators
+ then:
+ properties:
+ vdd-l1-l2-supply: true
+ vdd-l3-supply: true
+ vdd-l4-supply: true
+ vdd-l5-l6-supply: true
+ vdd-l7-bob-supply: true
+ patternProperties:
+ "^vdd-s[1-3]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pmx55-rpmh-regulators
+ then:
+ properties:
+ vdd-l1-l2-supply: true
+ vdd-l3-l9-supply: true
+ vdd-l4-l12-supply: true
+ vdd-l5-l6-supply: true
+ vdd-l7-l8-supply: true
+ vdd-l10-l11-l13-supply: true
+ patternProperties:
+ "^vdd-l1[4-6]-supply$": true
+ "^vdd-s[1-7]-supply$": true
+
+ - if:
+ properties:
+ compatible:
+ enum:
+ - qcom,pmx65-rpmh-regulators
+ then:
+ properties:
+ vdd-l2-l18-supply: true
+ vdd-l5-l6-l16-supply: true
+ vdd-l8-l9-supply: true
+ vdd-l11-l13-supply: true
+ patternProperties:
+ "^vdd-l[1347]-supply$": true
+ "^vdd-l1[0245789]-supply$": true
+ "^vdd-l2[01]-supply$": true
+ "^vdd-s[1-8]-supply$": true
+
+unevaluatedProperties: false
+
examples:
- |
#include <dt-bindings/regulator/qcom,rpmh-regulator.h>
diff --git a/Documentation/devicetree/bindings/regulator/richtek,rt4801-regulator.yaml b/Documentation/devicetree/bindings/regulator/richtek,rt4801-regulator.yaml
index 235e593b3b2c..091150c4e579 100644
--- a/Documentation/devicetree/bindings/regulator/richtek,rt4801-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/richtek,rt4801-regulator.yaml
@@ -17,9 +17,6 @@ description: |
Datasheet is available at
https://www.richtek.com/assets/product_file/RT4801H/DS4801H-00.pdf
-#The valid names for RT4801 regulator nodes are:
-#DSVP, DSVN
-
properties:
compatible:
enum:
@@ -33,10 +30,13 @@ properties:
The first one is ENP to enable DSVP, and second one is ENM to enable DSVN.
Number of GPIO in the array list could be 1 or 2.
If only one gpio is specified, only one gpio used to control ENP/ENM.
- Else both are spefied, DSVP/DSVN could be controlled individually.
- Othersie, this property not specified. treat both as always-on regulator.
+ Else if both are specified, DSVP/DSVN could be controlled individually.
+ If this property is not specified, treat both as always-on regulators.
+
+ Property is deprecated. Use enable-gpios in each regulator.
minItems: 1
maxItems: 2
+ deprecated: true
patternProperties:
"^DSV(P|N)$":
@@ -45,6 +45,14 @@ patternProperties:
description:
Properties for single display bias regulator.
+ properties:
+ enable-gpios:
+ description:
+ GPIO to use to enable DSVP/DSVN regulator. One GPIO can be configured
+ for controlling both regulators. If this property is not specified for
+ any regulator, treat both as always-on regulators.
+ maxItems: 1
+
required:
- compatible
- reg
@@ -60,19 +68,20 @@ examples:
rt4801@73 {
compatible = "richtek,rt4801";
reg = <0x73>;
- enable-gpios = <&gpio26 2 0>, <&gpio26 3 0>;
dsvp: DSVP {
regulator-name = "rt4801,dsvp";
regulator-min-microvolt = <4000000>;
regulator-max-microvolt = <6000000>;
regulator-boot-on;
+ enable-gpios = <&gpio26 2 0>;
};
dsvn: DSVN {
regulator-name = "rt4801,dsvn";
regulator-min-microvolt = <4000000>;
regulator-max-microvolt = <6000000>;
regulator-boot-on;
+ enable-gpios = <&gpio26 3 0>;
};
};
diff --git a/Documentation/devicetree/bindings/regulator/richtek,rt5759-regulator.yaml b/Documentation/devicetree/bindings/regulator/richtek,rt5759-regulator.yaml
new file mode 100644
index 000000000000..0a4c9576a432
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/richtek,rt5759-regulator.yaml
@@ -0,0 +1,90 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/richtek,rt5759-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Richtek RT5759 High Performance DCDC Converter
+
+maintainers:
+ - ChiYuan Huang <cy_huang@richtek.com>
+
+description: |
+ The RT5759 is a high-performance, synchronous step-down DC-DC converter that
+ can deliver up to 9A output current from 3V to 6.5V input supply. The output
+ voltage is programmable via an I2C-controlled 7-bit VID.
+
+ Datasheet is available at
+ https://www.richtek.com/assets/product_file/RT5759/DS5759-00.pdf
+
+properties:
+ compatible:
+ enum:
+ - richtek,rt5759
+ - richtek,rt5759a
+
+ reg:
+ maxItems: 1
+
+ regulator-allowed-modes:
+ description: |
+ buck allowed operating mode
+ 0: auto mode (PSKIP: pulse skipping)
+ 1: force pwm mode
+ items:
+ enum: [0, 1]
+
+ richtek,watchdog-enable:
+ description: Enable listening on the external watchdog reset pin.
+ type: boolean
+
+allOf:
+ - $ref: regulator.yaml#
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: richtek,rt5759
+ then:
+ properties:
+ richtek,watchdog-enable: false
+
+required:
+ - compatible
+ - reg
+
+unevaluatedProperties: false
+
+examples:
+ # example 1 for RT5759
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ regulator@62 {
+ compatible = "richtek,rt5759";
+ reg = <0x62>;
+ regulator-name = "rt5759-buck";
+ regulator-min-microvolt = <600000>;
+ regulator-max-microvolt = <1500000>;
+ regulator-boot-on;
+ };
+ };
+ # example 2 for RT5759A
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ regulator@62 {
+ compatible = "richtek,rt5759a";
+ reg = <0x62>;
+ regulator-name = "rt5759a-buck";
+ regulator-min-microvolt = <600000>;
+ regulator-max-microvolt = <1725000>;
+ regulator-boot-on;
+ richtek,watchdog-enable;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/regulator/siliconmitus,sm5703-regulator.yaml b/Documentation/devicetree/bindings/regulator/siliconmitus,sm5703-regulator.yaml
new file mode 100644
index 000000000000..9d84117530ca
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/siliconmitus,sm5703-regulator.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/siliconmitus,sm5703-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Silicon Mitus SM5703 multi function device regulators
+
+maintainers:
+ - Markuss Broks <markuss.broks@gmail.com>
+
+description: |
+ SM5703 regulators node should be a sub node of the SM5703 MFD node. See SM5703 MFD
+ bindings at Documentation/devicetree/bindings/mfd/siliconmitus,sm5703.yaml
+ Regulator nodes should be named usbldo<number>, buck, vbus, ldo<number>.
+ The definition for each of these nodes is defined using the standard
+ binding for regulators at Documentation/devicetree/bindings/regulator/regulator.yaml.
+
+properties:
+ buck:
+ type: object
+ $ref: regulator.yaml#
+ unevaluatedProperties: false
+ description:
+ Properties for the BUCK regulator.
+
+ vbus:
+ type: object
+ $ref: regulator.yaml#
+ unevaluatedProperties: false
+ description:
+ Properties for the VBUS regulator.
+
+patternProperties:
+ "^ldo[1-3]$":
+ type: object
+ $ref: regulator.yaml#
+ unevaluatedProperties: false
+ description:
+ Properties for single LDO regulator.
+
+ "^usbldo[1-2]$":
+ type: object
+ $ref: regulator.yaml#
+ unevaluatedProperties: false
+ description:
+ Properties for a single USBLDO regulator.
+
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/regulator/socionext,uniphier-regulator.yaml b/Documentation/devicetree/bindings/regulator/socionext,uniphier-regulator.yaml
index 1218f21ba320..75087c6e001c 100644
--- a/Documentation/devicetree/bindings/regulator/socionext,uniphier-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/socionext,uniphier-regulator.yaml
@@ -14,9 +14,6 @@ description: |
maintainers:
- Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
-allOf:
- - $ref: "regulator.yaml#"
-
# USB3 Controller
properties:
@@ -36,27 +33,51 @@ properties:
minItems: 1
maxItems: 2
- clock-names:
- oneOf:
- - items: # for Pro4, Pro5
- - const: gio
- - const: link
- - items: # for others
- - const: link
+ clock-names: true
resets:
minItems: 1
maxItems: 2
- reset-names:
- oneOf:
- - items: # for Pro4, Pro5
- - const: gio
- - const: link
- - items:
- - const: link
+ reset-names: true
-additionalProperties: false
+allOf:
+ - $ref: "regulator.yaml#"
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - socionext,uniphier-pro4-usb3-regulator
+ - socionext,uniphier-pro5-usb3-regulator
+ then:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+ clock-names:
+ items:
+ - const: gio
+ - const: link
+ resets:
+ minItems: 2
+ maxItems: 2
+ reset-names:
+ items:
+ - const: gio
+ - const: link
+ else:
+ properties:
+ clocks:
+ maxItems: 1
+ clock-names:
+ const: link
+ resets:
+ maxItems: 1
+ reset-names:
+ const: link
+
+unevaluatedProperties: false
required:
- compatible
diff --git a/Documentation/devicetree/bindings/reserved-memory/phram.yaml b/Documentation/devicetree/bindings/reserved-memory/phram.yaml
new file mode 100644
index 000000000000..6c4db28015f1
--- /dev/null
+++ b/Documentation/devicetree/bindings/reserved-memory/phram.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/reserved-memory/phram.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MTD/block device in RAM
+
+description: |
+ Specifies that the reserved memory region can be used as an MTD or block
+ device.
+
+ The "phram" node is named after the "MTD in PHysical RAM" driver which
+ provides an implementation of this functionality in Linux.
+
+maintainers:
+ - Vincent Whitchurch <vincent.whitchurch@axis.com>
+
+allOf:
+ - $ref: "reserved-memory.yaml"
+ - $ref: "/schemas/mtd/mtd.yaml"
+
+properties:
+ compatible:
+ const: phram
+
+ reg:
+ description: region of memory that can be used as an MTD/block device
+
+required:
+ - compatible
+ - reg
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ reserved-memory {
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ phram: flash@12340000 {
+ compatible = "phram";
+ label = "rootfs";
+ reg = <0x12340000 0x00800000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/spi/aspeed,ast2600-fmc.yaml b/Documentation/devicetree/bindings/spi/aspeed,ast2600-fmc.yaml
new file mode 100644
index 000000000000..fa8f4ac20985
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/aspeed,ast2600-fmc.yaml
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/aspeed,ast2600-fmc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Aspeed SMC controllers bindings
+
+maintainers:
+ - Chin-Ting Kuo <chin-ting_kuo@aspeedtech.com>
+ - Cédric Le Goater <clg@kaod.org>
+
+description: |
+ This binding describes the Aspeed Static Memory Controllers (FMC and
+ SPI) of the AST2400, AST2500 and AST2600 SOCs.
+
+allOf:
+ - $ref: "spi-controller.yaml#"
+
+properties:
+ compatible:
+ enum:
+ - aspeed,ast2600-fmc
+ - aspeed,ast2600-spi
+ - aspeed,ast2500-fmc
+ - aspeed,ast2500-spi
+ - aspeed,ast2400-fmc
+ - aspeed,ast2400-spi
+
+ reg:
+ items:
+ - description: registers
+ - description: memory mapping
+
+ clocks:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/interrupt-controller/aspeed-scu-ic.h>
+ #include <dt-bindings/clock/ast2600-clock.h>
+
+ spi@1e620000 {
+ reg = <0x1e620000 0xc4>, <0x20000000 0x10000000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "aspeed,ast2600-fmc";
+ clocks = <&syscon ASPEED_CLK_AHB>;
+ interrupts = <GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>;
+
+ flash@0 {
+ reg = < 0 >;
+ compatible = "jedec,spi-nor";
+ spi-max-frequency = <50000000>;
+ spi-rx-bus-width = <2>;
+ };
+
+ flash@1 {
+ reg = < 1 >;
+ compatible = "jedec,spi-nor";
+ spi-max-frequency = <50000000>;
+ spi-rx-bus-width = <2>;
+ };
+
+ flash@2 {
+ reg = < 2 >;
+ compatible = "jedec,spi-nor";
+ spi-max-frequency = <50000000>;
+ spi-rx-bus-width = <2>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/spi/ingenic,spi.yaml b/Documentation/devicetree/bindings/spi/ingenic,spi.yaml
index 5b1c7a2a6a31..360f76c226d9 100644
--- a/Documentation/devicetree/bindings/spi/ingenic,spi.yaml
+++ b/Documentation/devicetree/bindings/spi/ingenic,spi.yaml
@@ -18,7 +18,10 @@ properties:
oneOf:
- enum:
- ingenic,jz4750-spi
+ - ingenic,jz4775-spi
- ingenic,jz4780-spi
+ - ingenic,x1000-spi
+ - ingenic,x2000-spi
- items:
- enum:
- ingenic,jz4760-spi
diff --git a/Documentation/devicetree/bindings/spi/mediatek,spi-mt65xx.yaml b/Documentation/devicetree/bindings/spi/mediatek,spi-mt65xx.yaml
index 818130b11bb9..94ef0552bd42 100644
--- a/Documentation/devicetree/bindings/spi/mediatek,spi-mt65xx.yaml
+++ b/Documentation/devicetree/bindings/spi/mediatek,spi-mt65xx.yaml
@@ -53,16 +53,20 @@ properties:
maxItems: 1
clocks:
+ minItems: 3
items:
- description: clock used for the parent clock
- description: clock used for the muxes clock
- description: clock used for the clock gate
+ - description: clock used for the AHB bus, this clock is optional
clock-names:
+ minItems: 3
items:
- const: parent-clk
- const: sel-clk
- const: spi-clk
+ - const: hclk
mediatek,pad-select:
$ref: /schemas/types.yaml#/definitions/uint32-array
diff --git a/Documentation/devicetree/bindings/spi/mediatek,spi-mtk-snfi.yaml b/Documentation/devicetree/bindings/spi/mediatek,spi-mtk-snfi.yaml
new file mode 100644
index 000000000000..6e6e02c91780
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/mediatek,spi-mtk-snfi.yaml
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/mediatek,spi-mtk-snfi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SPI-NAND flash controller for MediaTek ARM SoCs
+
+maintainers:
+ - Chuanhong Guo <gch981213@gmail.com>
+
+description: |
+ The Mediatek SPI-NAND flash controller is an extended version of
+ the Mediatek NAND flash controller. It can perform standard SPI
+ instructions with one continuous write and one read for up to 0xa0
+ bytes. It also supports typical SPI-NAND page cache operations
+ in single, dual or quad IO mode with pipelined ECC encoding/decoding
+ using the accompanying ECC engine. There should be only one spi
+ slave device following generic spi bindings.
+
+allOf:
+ - $ref: /schemas/spi/spi-controller.yaml#
+
+properties:
+ compatible:
+ enum:
+ - mediatek,mt7622-snand
+ - mediatek,mt7629-snand
+
+ reg:
+ items:
+ - description: core registers
+
+ interrupts:
+ items:
+ - description: NFI interrupt
+
+ clocks:
+ items:
+ - description: clock used for the controller
+ - description: clock used for the SPI bus
+
+ clock-names:
+ items:
+ - const: nfi_clk
+ - const: pad_clk
+
+ nand-ecc-engine:
+ description: device-tree node of the accompanying ECC engine.
+ $ref: /schemas/types.yaml#/definitions/phandle
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - nand-ecc-engine
+
+unevaluatedProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+ #include <dt-bindings/clock/mt7622-clk.h>
+ soc {
+ #address-cells = <2>;
+ #size-cells = <2>;
+ snfi: spi@1100d000 {
+ compatible = "mediatek,mt7622-snand";
+ reg = <0 0x1100d000 0 0x1000>;
+ interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_LOW>;
+ clocks = <&pericfg CLK_PERI_NFI_PD>, <&pericfg CLK_PERI_SNFI_PD>;
+ clock-names = "nfi_clk", "pad_clk";
+ nand-ecc-engine = <&bch>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ flash@0 {
+ compatible = "spi-nand";
+ reg = <0>;
+ spi-tx-bus-width = <4>;
+ spi-rx-bus-width = <4>;
+ nand-ecc-engine = <&snfi>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml b/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml
index 5a60fba14bba..44d08aa3fd85 100644
--- a/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml
+++ b/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml
@@ -49,6 +49,7 @@ properties:
maxItems: 2
interconnect-names:
+ minItems: 1
items:
- const: qspi-config
- const: qspi-memory
diff --git a/Documentation/devicetree/bindings/spi/renesas,rspi.yaml b/Documentation/devicetree/bindings/spi/renesas,rspi.yaml
index 2c3c6bd6ec45..f45d3b75d6de 100644
--- a/Documentation/devicetree/bindings/spi/renesas,rspi.yaml
+++ b/Documentation/devicetree/bindings/spi/renesas,rspi.yaml
@@ -21,6 +21,7 @@ properties:
- enum:
- renesas,rspi-r7s72100 # RZ/A1H
- renesas,rspi-r7s9210 # RZ/A2
+ - renesas,r9a07g043-rspi # RZ/G2UL
- renesas,r9a07g044-rspi # RZ/G2{L,LC}
- renesas,r9a07g054-rspi # RZ/V2L
- const: renesas,rspi-rz
@@ -124,6 +125,7 @@ allOf:
contains:
enum:
- renesas,qspi
+ - renesas,r9a07g043-rspi
- renesas,r9a07g044-rspi
- renesas,r9a07g054-rspi
then:
diff --git a/Documentation/devicetree/bindings/thermal/qcom-lmh.yaml b/Documentation/devicetree/bindings/thermal/qcom-lmh.yaml
index a9b7388ca9ac..e1587ddf7de3 100644
--- a/Documentation/devicetree/bindings/thermal/qcom-lmh.yaml
+++ b/Documentation/devicetree/bindings/thermal/qcom-lmh.yaml
@@ -18,6 +18,7 @@ description:
properties:
compatible:
enum:
+ - qcom,sc8180x-lmh
- qcom,sdm845-lmh
- qcom,sm8150-lmh
diff --git a/Documentation/devicetree/bindings/thermal/qcom-spmi-adc-tm5.yaml b/Documentation/devicetree/bindings/thermal/qcom-spmi-adc-tm5.yaml
index 3ea8c0c1f45f..feb390d50696 100644
--- a/Documentation/devicetree/bindings/thermal/qcom-spmi-adc-tm5.yaml
+++ b/Documentation/devicetree/bindings/thermal/qcom-spmi-adc-tm5.yaml
@@ -10,7 +10,9 @@ maintainers:
properties:
compatible:
- const: qcom,spmi-adc-tm5
+ enum:
+ - qcom,spmi-adc-tm5
+ - qcom,spmi-adc-tm5-gen2
reg:
maxItems: 1
@@ -33,6 +35,7 @@ properties:
qcom,avg-samples:
$ref: /schemas/types.yaml#/definitions/uint32
description: Number of samples to be used for measurement.
+ Not applicable for Gen2 ADC_TM peripheral.
enum:
- 1
- 2
@@ -45,6 +48,7 @@ properties:
$ref: /schemas/types.yaml#/definitions/uint32
description: This parameter is used to decrease ADC sampling rate.
Quicker measurements can be made by reducing decimation ratio.
+ Not applicable for Gen2 ADC_TM peripheral.
enum:
- 250
- 420
@@ -93,6 +97,29 @@ patternProperties:
- const: 1
- enum: [ 1, 3, 4, 6, 20, 8, 10 ]
+ qcom,avg-samples:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Number of samples to be used for measurement.
+ This property in child node is applicable only for Gen2 ADC_TM peripheral.
+ enum:
+ - 1
+ - 2
+ - 4
+ - 8
+ - 16
+ default: 1
+
+ qcom,decimation:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: This parameter is used to decrease ADC sampling rate.
+ Quicker measurements can be made by reducing decimation ratio.
+ This property in child node is applicable only for Gen2 ADC_TM peripheral.
+ enum:
+ - 85
+ - 340
+ - 1360
+ default: 1360
+
required:
- reg
- io-channels
@@ -100,6 +127,31 @@ patternProperties:
additionalProperties:
false
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,spmi-adc-tm5
+
+ then:
+ patternProperties:
+ "^([-a-z0-9]*)@[0-7]$":
+ properties:
+ qcom,decimation: false
+ qcom,avg-samples: false
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: qcom,spmi-adc-tm5-gen2
+
+ then:
+ properties:
+ qcom,avg-samples: false
+ qcom,decimation: false
+
required:
- compatible
- reg
@@ -124,7 +176,7 @@ examples:
#size-cells = <0>;
#io-channel-cells = <1>;
- /* Other propreties are omitted */
+ /* Other properties are omitted */
conn-therm@4f {
reg = <ADC5_AMUX_THM3_100K_PU>;
qcom,ratiometric;
@@ -148,4 +200,58 @@ examples:
};
};
};
+
+ - |
+ #include <dt-bindings/iio/qcom,spmi-adc7-pmk8350.h>
+ #include <dt-bindings/iio/qcom,spmi-adc7-pm8350.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ spmi_bus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ pmk8350_vadc: adc@3100 {
+ reg = <0x3100>;
+ compatible = "qcom,spmi-adc7";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ #io-channel-cells = <1>;
+
+ /* Other properties are omitted */
+ xo-therm@44 {
+ reg = <PMK8350_ADC7_AMUX_THM1_100K_PU>;
+ qcom,ratiometric;
+ qcom,hw-settle-time = <200>;
+ };
+
+ conn-therm@47 {
+ reg = <PM8350_ADC7_AMUX_THM4_100K_PU>;
+ qcom,ratiometric;
+ qcom,hw-settle-time = <200>;
+ };
+ };
+
+ pmk8350_adc_tm: adc-tm@3400 {
+ compatible = "qcom,spmi-adc-tm5-gen2";
+ reg = <0x3400>;
+ interrupts = <0x0 0x34 0x0 IRQ_TYPE_EDGE_RISING>;
+ #thermal-sensor-cells = <1>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ pmk8350-xo-therm@0 {
+ reg = <0>;
+ io-channels = <&pmk8350_vadc PMK8350_ADC7_AMUX_THM1_100K_PU>;
+ qcom,decimation = <340>;
+ qcom,ratiometric;
+ qcom,hw-settle-time-us = <200>;
+ };
+
+ conn-therm@1 {
+ reg = <1>;
+ io-channels = <&pmk8350_vadc PM8350_ADC7_AMUX_THM4_100K_PU>;
+ qcom,avg-samples = <2>;
+ qcom,ratiometric;
+ qcom,hw-settle-time-us = <200>;
+ };
+ };
+ };
...
diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
index b6406bcc683f..a24baf9b9f64 100644
--- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
+++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml
@@ -19,10 +19,11 @@ description: |
properties:
compatible:
oneOf:
- - description: msm9860 TSENS based
+ - description: msm8960 TSENS based
items:
- enum:
- qcom,ipq8064-tsens
+ - qcom,msm8960-tsens
- description: v0.1 of TSENS
items:
@@ -49,6 +50,7 @@ properties:
- qcom,sc7180-tsens
- qcom,sc7280-tsens
- qcom,sc8180x-tsens
+ - qcom,sc8280xp-tsens
- qcom,sdm630-tsens
- qcom,sdm845-tsens
- qcom,sm8150-tsens
@@ -116,6 +118,7 @@ allOf:
- qcom,ipq8064-tsens
- qcom,mdm9607-tsens
- qcom,msm8916-tsens
+ - qcom,msm8960-tsens
- qcom,msm8974-tsens
- qcom,msm8976-tsens
- qcom,qcs404-tsens
diff --git a/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml b/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml
index ccab9511a042..1d8373397848 100644
--- a/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml
+++ b/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml
@@ -17,7 +17,9 @@ properties:
compatible:
items:
- enum:
+ - renesas,r9a07g043-tsu # RZ/G2UL
- renesas,r9a07g044-tsu # RZ/G2{L,LC}
+ - renesas,r9a07g054-tsu # RZ/V2L
- const: renesas,rzg2l-tsu
reg:
diff --git a/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml b/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml
new file mode 100644
index 000000000000..c74f124ebfc0
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/ti,j72xx-thermal.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/thermal/ti,j72xx-thermal.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments J72XX VTM (DTS) binding
+
+maintainers:
+ - Keerthy <j-keerthy@ti.com>
+
+properties:
+ compatible:
+ enum:
+ - ti,j721e-vtm
+ - ti,j7200-vtm
+
+ reg:
+ items:
+ - description: VTM cfg1 register space
+ - description: VTM cfg2 register space
+ - description: VTM efuse register space
+
+ power-domains:
+ maxItems: 1
+
+ "#thermal-sensor-cells":
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - power-domains
+ - "#thermal-sensor-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/soc/ti,sci_pm_domain.h>
+ wkup_vtm0: thermal-sensor@42040000 {
+ compatible = "ti,j721e-vtm";
+ reg = <0x42040000 0x350>,
+ <0x42050000 0x350>,
+ <0x43000300 0x10>;
+ power-domains = <&k3_pds 154 TI_SCI_PD_EXCLUSIVE>;
+ #thermal-sensor-cells = <1>;
+ };
+
+ mpu_thermal: mpu-thermal {
+ polling-delay-passive = <250>; /* milliseconds */
+ polling-delay = <500>; /* milliseconds */
+ thermal-sensors = <&wkup_vtm0 0>;
+
+ trips {
+ mpu_crit: mpu-crit {
+ temperature = <125000>; /* milliCelsius */
+ hysteresis = <2000>; /* milliCelsius */
+ type = "critical";
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml
index 550a2e5c9e05..c11520347a9d 100644
--- a/Documentation/devicetree/bindings/trivial-devices.yaml
+++ b/Documentation/devicetree/bindings/trivial-devices.yaml
@@ -143,6 +143,10 @@ properties:
- infineon,xdpe12254
# Infineon Multi-phase Digital VR Controller xdpe12284
- infineon,xdpe12284
+ # Infineon Multi-phase Digital VR Controller xdpe15284
+ - infineon,xdpe15284
+ # Infineon Multi-phase Digital VR Controller xdpe152c4
+ - infineon,xdpe152c4
# Injoinic IP5108 2.0A Power Bank IC with I2C
- injoinic,ip5108
# Injoinic IP5109 2.1A Power Bank IC with I2C
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml
index e12a75e10456..2bf2b3accc8e 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.yaml
+++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml
@@ -1084,6 +1084,8 @@ patternProperties:
description: Sensirion AG
"^sensortek,.*":
description: Sensortek Technology Corporation
+ "^sercomm,.*":
+ description: Sercomm (Suzhou) Corporation
"^sff,.*":
description: Small Form Factor Committee
"^sgd,.*":
diff --git a/Documentation/doc-guide/contributing.rst b/Documentation/doc-guide/contributing.rst
index 207fd93d7c80..d4793826ad9a 100644
--- a/Documentation/doc-guide/contributing.rst
+++ b/Documentation/doc-guide/contributing.rst
@@ -79,8 +79,9 @@ simplistic idea of what C comment blocks look like. This problem had been
present since that comment was added in 2016 — a full four years. Fixing
it was a matter of adding the missing asterisks. A quick look at the
history for that file showed what the normal format for subject lines is,
-and ``scripts/get_maintainer.pl`` told me who should receive it. The
-resulting patch looked like this::
+and ``scripts/get_maintainer.pl`` told me who should receive it (pass paths to
+your patches as arguments to scripts/get_maintainer.pl). The resulting patch
+looked like this::
[PATCH] PM / devfreq: Fix two malformed kerneldoc comments
diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst
index 79aaa55d6bcf..a7cb2afd7990 100644
--- a/Documentation/doc-guide/kernel-doc.rst
+++ b/Documentation/doc-guide/kernel-doc.rst
@@ -1,3 +1,4 @@
+===========================
Writing kernel-doc comments
===========================
@@ -436,6 +437,7 @@ The title following ``DOC:`` acts as a heading within the source file, but also
as an identifier for extracting the documentation comment. Thus, the title must
be unique within the file.
+=============================
Including kernel-doc comments
=============================
diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst
index bb36f18ae9ac..2ff1ab4158d4 100644
--- a/Documentation/doc-guide/sphinx.rst
+++ b/Documentation/doc-guide/sphinx.rst
@@ -1,7 +1,8 @@
.. _sphinxdoc:
-Introduction
-============
+=====================================
+Using Sphinx for kernel documentation
+=====================================
The Linux kernel uses `Sphinx`_ to generate pretty documentation from
`reStructuredText`_ files under ``Documentation``. To build the documentation in
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 910b30a2a7d9..352ff53a2306 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -211,6 +211,7 @@ r200_reg_safe.h
r300_reg_safe.h
r420_reg_safe.h
r600_reg_safe.h
+randstruct.seed
randomize_layout_hash.h
randomize_layout_seed.h
recordmcount
diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 5018403fe82f..2d39967bafcc 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -249,7 +249,7 @@ CLOCK
devm_clk_bulk_get()
devm_clk_bulk_get_all()
devm_clk_bulk_get_optional()
- devm_get_clk_from_childl()
+ devm_get_clk_from_child()
devm_clk_hw_register()
devm_of_clk_add_hw_provider()
devm_clk_hw_register_clkdev()
diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst
index bbc53920d4dd..a1ddefa1f55f 100644
--- a/Documentation/driver-api/gpio/driver.rst
+++ b/Documentation/driver-api/gpio/driver.rst
@@ -417,30 +417,66 @@ struct gpio_irq_chip inside struct gpio_chip before adding the gpio_chip.
If you do this, the additional irq_chip will be set up by gpiolib at the
same time as setting up the rest of the GPIO functionality. The following
is a typical example of a chained cascaded interrupt handler using
-the gpio_irq_chip:
+the gpio_irq_chip. Note how the mask/unmask (or disable/enable) functions
+call into the core gpiolib code:
.. code-block:: c
- /* Typical state container with dynamic irqchip */
+ /* Typical state container */
struct my_gpio {
struct gpio_chip gc;
- struct irq_chip irq;
+ };
+
+ static void my_gpio_mask_irq(struct irq_data *d)
+ {
+ struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+
+ /*
+ * Perform any necessary action to mask the interrupt,
+ * and then call into the core code to synchronise the
+ * state.
+ */
+
+ gpiochip_disable_irq(gc, d->hwirq);
+ }
+
+ static void my_gpio_unmask_irq(struct irq_data *d)
+ {
+ struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+
+ gpiochip_enable_irq(gc, d->hwirq);
+
+ /*
+ * Perform any necessary action to unmask the interrupt,
+ * after having called into the core code to synchronise
+ * the state.
+ */
+ }
+
+ /*
+ * Statically populate the irqchip. Note that it is made const
+ * (further indicated by the IRQCHIP_IMMUTABLE flag), and that
+ * the GPIOCHIP_IRQ_RESOURCE_HELPERS macro adds some extra
+ * callbacks to the structure.
+ */
+ static const struct irq_chip my_gpio_irq_chip = {
+ .name = "my_gpio_irq",
+ .irq_ack = my_gpio_ack_irq,
+ .irq_mask = my_gpio_mask_irq,
+ .irq_unmask = my_gpio_unmask_irq,
+ .irq_set_type = my_gpio_set_irq_type,
+ .flags = IRQCHIP_IMMUTABLE,
+ /* Provide the gpio resource callbacks */
+ GPIOCHIP_IRQ_RESOURCE_HELPERS,
};
int irq; /* from platform etc */
struct my_gpio *g;
struct gpio_irq_chip *girq;
- /* Set up the irqchip dynamically */
- g->irq.name = "my_gpio_irq";
- g->irq.irq_ack = my_gpio_ack_irq;
- g->irq.irq_mask = my_gpio_mask_irq;
- g->irq.irq_unmask = my_gpio_unmask_irq;
- g->irq.irq_set_type = my_gpio_set_irq_type;
-
/* Get a pointer to the gpio_irq_chip */
girq = &g->gc.irq;
- girq->chip = &g->irq;
+ gpio_irq_chip_set_chip(girq, &my_gpio_irq_chip);
girq->parent_handler = ftgpio_gpio_irq_handler;
girq->num_parents = 1;
girq->parents = devm_kcalloc(dev, 1, sizeof(*girq->parents),
@@ -458,23 +494,58 @@ the interrupt separately and go with it:
.. code-block:: c
- /* Typical state container with dynamic irqchip */
+ /* Typical state container */
struct my_gpio {
struct gpio_chip gc;
- struct irq_chip irq;
+ };
+
+ static void my_gpio_mask_irq(struct irq_data *d)
+ {
+ struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+
+ /*
+ * Perform any necessary action to mask the interrupt,
+ * and then call into the core code to synchronise the
+ * state.
+ */
+
+ gpiochip_disable_irq(gc, d->hwirq);
+ }
+
+ static void my_gpio_unmask_irq(struct irq_data *d)
+ {
+ struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+
+ gpiochip_enable_irq(gc, d->hwirq);
+
+ /*
+ * Perform any necessary action to unmask the interrupt,
+ * after having called into the core code to synchronise
+ * the state.
+ */
+ }
+
+ /*
+ * Statically populate the irqchip. Note that it is made const
+ * (further indicated by the IRQCHIP_IMMUTABLE flag), and that
+ * the GPIOCHIP_IRQ_RESOURCE_HELPERS macro adds some extra
+ * callbacks to the structure.
+ */
+ static const struct irq_chip my_gpio_irq_chip = {
+ .name = "my_gpio_irq",
+ .irq_ack = my_gpio_ack_irq,
+ .irq_mask = my_gpio_mask_irq,
+ .irq_unmask = my_gpio_unmask_irq,
+ .irq_set_type = my_gpio_set_irq_type,
+ .flags = IRQCHIP_IMMUTABLE,
+ /* Provide the gpio resource callbacks */
+ GPIOCHIP_IRQ_RESOURCE_HELPERS,
};
int irq; /* from platform etc */
struct my_gpio *g;
struct gpio_irq_chip *girq;
- /* Set up the irqchip dynamically */
- g->irq.name = "my_gpio_irq";
- g->irq.irq_ack = my_gpio_ack_irq;
- g->irq.irq_mask = my_gpio_mask_irq;
- g->irq.irq_unmask = my_gpio_unmask_irq;
- g->irq.irq_set_type = my_gpio_set_irq_type;
-
ret = devm_request_threaded_irq(dev, irq, NULL,
irq_thread_fn, IRQF_ONESHOT, "my-chip", g);
if (ret < 0)
@@ -482,7 +553,7 @@ the interrupt separately and go with it:
/* Get a pointer to the gpio_irq_chip */
girq = &g->gc.irq;
- girq->chip = &g->irq;
+ gpio_irq_chip_set_chip(girq, &my_gpio_irq_chip);
/* This will let us handle the parent IRQ in the driver */
girq->parent_handler = NULL;
girq->num_parents = 0;
@@ -500,24 +571,61 @@ In this case the typical set-up will look like this:
/* Typical state container with dynamic irqchip */
struct my_gpio {
struct gpio_chip gc;
- struct irq_chip irq;
struct fwnode_handle *fwnode;
};
- int irq; /* from platform etc */
+ static void my_gpio_mask_irq(struct irq_data *d)
+ {
+ struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+
+ /*
+ * Perform any necessary action to mask the interrupt,
+ * and then call into the core code to synchronise the
+ * state.
+ */
+
+ gpiochip_disable_irq(gc, d->hwirq);
+ irq_chip_mask_parent(d);
+ }
+
+ static void my_gpio_unmask_irq(struct irq_data *d)
+ {
+ struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+
+ gpiochip_enable_irq(gc, d->hwirq);
+
+ /*
+ * Perform any necessary action to unmask the interrupt,
+ * after having called into the core code to synchronise
+ * the state.
+ */
+
+ irq_chip_unmask_parent(d);
+ }
+
+ /*
+ * Statically populate the irqchip. Note that it is made const
+ * (further indicated by the IRQCHIP_IMMUTABLE flag), and that
+ * the GPIOCHIP_IRQ_RESOURCE_HELPERS macro adds some extra
+ * callbacks to the structure.
+ */
+ static const struct irq_chip my_gpio_irq_chip = {
+ .name = "my_gpio_irq",
+ .irq_ack = my_gpio_ack_irq,
+ .irq_mask = my_gpio_mask_irq,
+ .irq_unmask = my_gpio_unmask_irq,
+ .irq_set_type = my_gpio_set_irq_type,
+ .flags = IRQCHIP_IMMUTABLE,
+ /* Provide the gpio resource callbacks */
+ GPIOCHIP_IRQ_RESOURCE_HELPERS,
+ };
+
struct my_gpio *g;
struct gpio_irq_chip *girq;
- /* Set up the irqchip dynamically */
- g->irq.name = "my_gpio_irq";
- g->irq.irq_ack = my_gpio_ack_irq;
- g->irq.irq_mask = my_gpio_mask_irq;
- g->irq.irq_unmask = my_gpio_unmask_irq;
- g->irq.irq_set_type = my_gpio_set_irq_type;
-
/* Get a pointer to the gpio_irq_chip */
girq = &g->gc.irq;
- girq->chip = &g->irq;
+ gpio_irq_chip_set_chip(girq, &my_gpio_irq_chip);
girq->default_type = IRQ_TYPE_NONE;
girq->handler = handle_bad_irq;
girq->fwnode = g->fwnode;
@@ -605,8 +713,9 @@ When implementing an irqchip inside a GPIO driver, these two functions should
typically be called in the .irq_disable() and .irq_enable() callbacks from the
irqchip.
-When using the gpiolib irqchip helpers, these callbacks are automatically
-assigned.
+When IRQCHIP_IMMUTABLE is not advertised by the irqchip, these callbacks
+are automatically assigned. This behaviour is deprecated and on its way
+to be removed from the kernel.
Real-Time compliance for GPIO IRQ chips
diff --git a/Documentation/driver-api/libata.rst b/Documentation/driver-api/libata.rst
index d477e296bda5..311af516a3fd 100644
--- a/Documentation/driver-api/libata.rst
+++ b/Documentation/driver-api/libata.rst
@@ -424,12 +424,6 @@ How commands are issued
-----------------------
Internal commands
- First, qc is allocated and initialized using :c:func:`ata_qc_new_init`.
- Although :c:func:`ata_qc_new_init` doesn't implement any wait or retry
- mechanism when qc is not available, internal commands are currently
- issued only during initialization and error recovery, so no other
- command is active and allocation is guaranteed to succeed.
-
Once allocated qc's taskfile is initialized for the command to be
executed. qc currently has two mechanisms to notify completion. One
is via ``qc->complete_fn()`` callback and the other is completion
@@ -447,11 +441,6 @@ SCSI commands
translated. No qc is involved in processing a simulated scmd. The
result is computed right away and the scmd is completed.
- For a translated scmd, :c:func:`ata_qc_new_init` is invoked to allocate a
- qc and the scmd is translated into the qc. SCSI midlayer's
- completion notification function pointer is stored into
- ``qc->scsidone``.
-
``qc->complete_fn()`` callback is used for completion notification. ATA
commands use :c:func:`ata_scsi_qc_complete` while ATAPI commands use
:c:func:`atapi_qc_complete`. Both functions end up calling ``qc->scsidone``
diff --git a/Documentation/driver-api/media/cec-core.rst b/Documentation/driver-api/media/cec-core.rst
index c6194ee81c41..ae0d20798edc 100644
--- a/Documentation/driver-api/media/cec-core.rst
+++ b/Documentation/driver-api/media/cec-core.rst
@@ -109,6 +109,7 @@ your driver:
int (*adap_monitor_all_enable)(struct cec_adapter *adap, bool enable);
int (*adap_monitor_pin_enable)(struct cec_adapter *adap, bool enable);
int (*adap_log_addr)(struct cec_adapter *adap, u8 logical_addr);
+ void (*adap_configured)(struct cec_adapter *adap, bool configured);
int (*adap_transmit)(struct cec_adapter *adap, u8 attempts,
u32 signal_free_time, struct cec_msg *msg);
void (*adap_status)(struct cec_adapter *adap, struct seq_file *file);
@@ -117,7 +118,7 @@ your driver:
/* Error injection callbacks */
...
- /* High-level callbacks */
+ /* High-level callback */
...
};
@@ -178,6 +179,16 @@ can receive directed messages to that address.
Note that adap_log_addr must return 0 if logical_addr is CEC_LOG_ADDR_INVALID.
+Called when the adapter is fully configured or unconfigured::
+
+ void (*adap_configured)(struct cec_adapter *adap, bool configured);
+
+If configured == true, then the adapter is fully configured, i.e. all logical
+addresses have been successfully claimed. If configured == false, then the
+adapter is unconfigured. If the driver has to take specific actions after
+(un)configuration, then that can be done through this optional callback.
+
+
To transmit a new message::
int (*adap_transmit)(struct cec_adapter *adap, u8 attempts,
diff --git a/Documentation/driver-api/media/mc-core.rst b/Documentation/driver-api/media/mc-core.rst
index 57b5bbba944e..02481a2513b9 100644
--- a/Documentation/driver-api/media/mc-core.rst
+++ b/Documentation/driver-api/media/mc-core.rst
@@ -42,9 +42,16 @@ Allocation of the structure is handled by the media device driver, usually by
embedding the :c:type:`media_device` instance in a larger driver-specific
structure.
-Drivers register media device instances by calling
-:c:func:`__media_device_register()` via the macro ``media_device_register()``
-and unregistered by calling :c:func:`media_device_unregister()`.
+Drivers initialise media device instances by calling
+:c:func:`media_device_init()`. After initialising a media device instance, it is
+registered by calling :c:func:`__media_device_register()` via the macro
+``media_device_register()`` and unregistered by calling
+:c:func:`media_device_unregister()`. An initialised media device must be
+eventually cleaned up by calling :c:func:`media_device_cleanup()`.
+
+Note that it is not allowed to unregister a media device instance that was not
+previously registered, or clean up a media device instance that was not
+previously initialised.
Entities
^^^^^^^^
diff --git a/Documentation/driver-api/media/v4l2-subdev.rst b/Documentation/driver-api/media/v4l2-subdev.rst
index 08ea2673b19e..cf3b52bdbfb9 100644
--- a/Documentation/driver-api/media/v4l2-subdev.rst
+++ b/Documentation/driver-api/media/v4l2-subdev.rst
@@ -518,6 +518,75 @@ The :c:func:`v4l2_i2c_new_subdev` function will call
:c:type:`i2c_board_info` structure using the ``client_type`` and the
``addr`` to fill it.
+Centrally managed subdev active state
+-------------------------------------
+
+Traditionally V4L2 subdev drivers maintained internal state for the active
+device configuration. This is often implemented as e.g. an array of struct
+v4l2_mbus_framefmt, one entry for each pad, and similarly for crop and compose
+rectangles.
+
+In addition to the active configuration, each subdev file handle has an array of
+struct v4l2_subdev_pad_config, managed by the V4L2 core, which contains the try
+configuration.
+
+To simplify the subdev drivers the V4L2 subdev API now optionally supports a
+centrally managed active configuration represented by
+:c:type:`v4l2_subdev_state`. One instance of state, which contains the active
+device configuration, is stored in the sub-device itself as part of
+the :c:type:`v4l2_subdev` structure, while the core associates a try state to
+each open file handle, to store the try configuration related to that file
+handle.
+
+Sub-device drivers can opt-in and use state to manage their active configuration
+by initializing the subdevice state with a call to v4l2_subdev_init_finalize()
+before registering the sub-device. They must also call v4l2_subdev_cleanup()
+to release all the allocated resources before unregistering the sub-device.
+The core automatically allocates and initializes a state for each open file
+handle to store the try configurations and frees it when closing the file
+handle.
+
+V4L2 sub-device operations that use both the :ref:`ACTIVE and TRY formats
+<v4l2-subdev-format-whence>` receive the correct state to operate on through
+the 'state' parameter. The state must be locked and unlocked by the
+caller by calling :c:func:`v4l2_subdev_lock_state()` and
+:c:func:`v4l2_subdev_unlock_state()`. The caller can do so by calling the subdev
+operation through the :c:func:`v4l2_subdev_call_state_active()` macro.
+
+Operations that do not receive a state parameter implicitly operate on the
+subdevice active state, which drivers can exclusively access by
+calling :c:func:`v4l2_subdev_lock_and_get_active_state()`. The sub-device active
+state must equally be released by calling :c:func:`v4l2_subdev_unlock_state()`.
+
+Drivers must never manually access the state stored in the :c:type:`v4l2_subdev`
+or in the file handle without going through the designated helpers.
+
+While the V4L2 core passes the correct try or active state to the subdevice
+operations, many existing device drivers pass a NULL state when calling
+operations with :c:func:`v4l2_subdev_call()`. This legacy construct causes
+issues with subdevice drivers that let the V4L2 core manage the active state,
+as they expect to receive the appropriate state as a parameter. To help the
+conversion of subdevice drivers to a managed active state without having to
+convert all callers at the same time, an additional wrapper layer has been
+added to v4l2_subdev_call(), which handles the NULL case by getting and locking
+the callee's active state with :c:func:`v4l2_subdev_lock_and_get_active_state()`,
+and unlocking the state after the call.
+
+The whole subdev state is in reality split into three parts: the
+v4l2_subdev_state, subdev controls and subdev driver's internal state. In the
+future these parts should be combined into a single state. For the time being
+we need a way to handle the locking for these parts. This can be accomplished
+by sharing a lock. The v4l2_ctrl_handler already supports this via its 'lock'
+pointer and the same model is used with states. The driver can do the following
+before calling v4l2_subdev_init_finalize():
+
+.. code-block:: c
+
+ sd->ctrl_handler->lock = &priv->mutex;
+ sd->state_lock = &priv->mutex;
+
+This shares the driver's private mutex between the controls and the states.
+
V4L2 sub-device functions and data structures
---------------------------------------------
diff --git a/Documentation/driver-api/thermal/intel_dptf.rst b/Documentation/driver-api/thermal/intel_dptf.rst
index 96668dca753a..372bdb4d04c6 100644
--- a/Documentation/driver-api/thermal/intel_dptf.rst
+++ b/Documentation/driver-api/thermal/intel_dptf.rst
@@ -4,7 +4,7 @@
Intel(R) Dynamic Platform and Thermal Framework Sysfs Interface
===============================================================
-:Copyright: |copy| 2022 Intel Corporation
+:Copyright: © 2022 Intel Corporation
:Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
index 4a25c5eb6f07..eb9c2d9a4f5f 100644
--- a/Documentation/fault-injection/fault-injection.rst
+++ b/Documentation/fault-injection/fault-injection.rst
@@ -132,16 +132,16 @@ configuration of fault-injection capabilities.
Format: { 'Y' | 'N' }
- default is 'N', setting it to 'Y' won't inject failures into
- highmem/user allocations.
+ default is 'Y', setting it to 'N' will also inject failures into
+ highmem/user allocations (__GFP_HIGHMEM allocations).
- /sys/kernel/debug/failslab/ignore-gfp-wait:
- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
Format: { 'Y' | 'N' }
- default is 'N', setting it to 'Y' will inject failures
- only into non-sleep allocations (GFP_ATOMIC allocations).
+ default is 'Y', setting it to 'N' will also inject failures
+ into allocations that can sleep (__GFP_DIRECT_RECLAIM allocations).
- /sys/kernel/debug/fail_page_alloc/min-order:
@@ -280,7 +280,7 @@ Application Examples
printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
- echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
+ echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
faulty_system()
{
@@ -334,8 +334,8 @@ Application Examples
printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times
echo 0 > /sys/kernel/debug/$FAILTYPE/space
echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
- echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
- echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
+ echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
+ echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
index 83eafe1a7f68..ff21a83abe62 100644
--- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -27,5 +27,5 @@
| sparc: | TODO |
| um: | TODO |
| x86: | ok |
- | xtensa: | TODO |
+ | xtensa: | ok |
-----------------------
diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt
index bb1c1801553e..72e7aadeda7e 100644
--- a/Documentation/features/time/context-tracking/arch-support.txt
+++ b/Documentation/features/time/context-tracking/arch-support.txt
@@ -27,5 +27,5 @@
| sparc: | ok |
| um: | TODO |
| x86: | ok |
- | xtensa: | TODO |
+ | xtensa: | ok |
-----------------------
diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt
index 5163a60a1c1e..c905aa3c1d81 100644
--- a/Documentation/features/time/virt-cpuacct/arch-support.txt
+++ b/Documentation/features/time/virt-cpuacct/arch-support.txt
@@ -27,5 +27,5 @@
| sparc: | ok |
| um: | TODO |
| x86: | ok |
- | xtensa: | TODO |
+ | xtensa: | ok |
-----------------------
diff --git a/Documentation/filesystems/caching/cachefiles.rst b/Documentation/filesystems/caching/cachefiles.rst
index 8bf396b76359..fc7abf712315 100644
--- a/Documentation/filesystems/caching/cachefiles.rst
+++ b/Documentation/filesystems/caching/cachefiles.rst
@@ -28,6 +28,7 @@ Cache on Already Mounted Filesystem
(*) Debugging.
+ (*) On-demand Read.
Overview
@@ -482,3 +483,180 @@ the control file. For example::
echo $((1|4|8)) >/sys/module/cachefiles/parameters/debug
will turn on all function entry debugging.
+
+
+On-demand Read
+==============
+
+When working in its original mode, CacheFiles serves as a local cache for a
+remote networking fs - while in on-demand read mode, CacheFiles can boost the
+scenario where on-demand read semantics are needed, e.g. container image
+distribution.
+
+The essential difference between these two modes is seen when a cache miss
+occurs: In the original mode, the netfs will fetch the data from the remote
+server and then write it to the cache file; in on-demand read mode, fetching
+the data and writing it into the cache is delegated to a user daemon.
+
+``CONFIG_CACHEFILES_ONDEMAND`` should be enabled to support on-demand read mode.
+
+
+Protocol Communication
+----------------------
+
+The on-demand read mode uses a simple protocol for communication between kernel
+and user daemon. The protocol can be modeled as::
+
+ kernel --[request]--> user daemon --[reply]--> kernel
+
+CacheFiles will send requests to the user daemon when needed. The user daemon
+should poll the devnode ('/dev/cachefiles') to check if there's a pending
+request to be processed. A POLLIN event will be returned when there's a pending
+request.
+
+The user daemon then reads the devnode to fetch a request to process. It should
+be noted that each read only gets one request. When it has finished processing
+the request, the user daemon should write the reply to the devnode.
+
+Each request starts with a message header of the form::
+
+ struct cachefiles_msg {
+ __u32 msg_id;
+ __u32 opcode;
+ __u32 len;
+ __u32 object_id;
+ __u8 data[];
+ };
+
+where:
+
+ * ``msg_id`` is a unique ID identifying this request among all pending
+ requests.
+
+ * ``opcode`` indicates the type of this request.
+
+ * ``object_id`` is a unique ID identifying the cache file operated on.
+
+ * ``data`` indicates the payload of this request.
+
+ * ``len`` indicates the whole length of this request, including the
+ header and following type-specific payload.
+
+
+Turning on On-demand Mode
+-------------------------
+
+An optional parameter becomes available to the "bind" command::
+
+ bind [ondemand]
+
+When the "bind" command is given no argument, it defaults to the original mode.
+When it is given the "ondemand" argument, i.e. "bind ondemand", on-demand read
+mode will be enabled.
+
+
+The OPEN Request
+----------------
+
+When the netfs opens a cache file for the first time, a request with the
+CACHEFILES_OP_OPEN opcode, a.k.a. an OPEN request, will be sent to the user
+daemon. The payload format is of the form::
+
+ struct cachefiles_open {
+ __u32 volume_key_size;
+ __u32 cookie_key_size;
+ __u32 fd;
+ __u32 flags;
+ __u8 data[];
+ };
+
+where:
+
+ * ``data`` contains the volume_key followed directly by the cookie_key.
+ The volume key is a NUL-terminated string; the cookie key is binary
+ data.
+
+ * ``volume_key_size`` indicates the size of the volume key in bytes.
+
+ * ``cookie_key_size`` indicates the size of the cookie key in bytes.
+
+ * ``fd`` indicates an anonymous fd referring to the cache file, through
+ which the user daemon can perform write/llseek file operations on the
+ cache file.
+
+
+The user daemon can use the given (volume_key, cookie_key) pair to distinguish
+the requested cache file. With the given anonymous fd, the user daemon can
+fetch the data and write it to the cache file in the background, even when
+kernel has not triggered a cache miss yet.
+
+Note that each cache file has a unique object_id, while it may have multiple
+anonymous fds. The user daemon may duplicate anonymous fds from the initial
+anonymous fd indicated by the @fd field through dup(). Thus each object_id can
+be mapped to multiple anonymous fds, while the user daemon itself needs to
+maintain the mapping.
+
+When implementing a user daemon, please be careful of RLIMIT_NOFILE,
+``/proc/sys/fs/nr_open`` and ``/proc/sys/fs/file-max``. Typically these needn't
+be huge since they're related to the number of open device blobs rather than
+open files of each individual filesystem.
+
+The user daemon should reply to the OPEN request by issuing a "copen" (complete
+open) command on the devnode::
+
+ copen <msg_id>,<cache_size>
+
+where:
+
+ * ``msg_id`` must match the msg_id field of the OPEN request.
+
+ * When >= 0, ``cache_size`` indicates the size of the cache file;
+ when < 0, ``cache_size`` indicates any error code encountered by the
+ user daemon.
+
+
+The CLOSE Request
+-----------------
+
+When a cookie is withdrawn, a CLOSE request (opcode CACHEFILES_OP_CLOSE) will be
+sent to the user daemon. This tells the user daemon to close all anonymous fds
+associated with the given object_id. The CLOSE request has no extra payload,
+and shouldn't be replied to.
+
+
+The READ Request
+----------------
+
+When a cache miss is encountered in on-demand read mode, CacheFiles will send a
+READ request (opcode CACHEFILES_OP_READ) to the user daemon. This tells the user
+daemon to fetch the contents of the requested file range. The payload is of the
+form::
+
+ struct cachefiles_read {
+ __u64 off;
+ __u64 len;
+ };
+
+where:
+
+ * ``off`` indicates the starting offset of the requested file range.
+
+ * ``len`` indicates the length of the requested file range.
+
+
+When it receives a READ request, the user daemon should fetch the requested data
+and write it to the cache file identified by object_id.
+
+When it has finished processing the READ request, the user daemon should reply
+by using the CACHEFILES_IOC_READ_COMPLETE ioctl on one of the anonymous fds
+associated with the object_id given in the READ request. The ioctl is of the
+form::
+
+ ioctl(fd, CACHEFILES_IOC_READ_COMPLETE, msg_id);
+
+where:
+
+ * ``fd`` is one of the anonymous fds associated with the object_id
+ given.
+
+ * ``msg_id`` must match the msg_id field of the READ request.
diff --git a/Documentation/filesystems/caching/netfs-api.rst b/Documentation/filesystems/caching/netfs-api.rst
index 7308d76a29dc..1d18e9def183 100644
--- a/Documentation/filesystems/caching/netfs-api.rst
+++ b/Documentation/filesystems/caching/netfs-api.rst
@@ -433,11 +433,11 @@ has done a write and then the page it wrote from has been released by the VM,
after which it *has* to look in the cache.
To inform fscache that a page might now be in the cache, the following function
-should be called from the ``releasepage`` address space op::
+should be called from the ``release_folio`` address space op::
void fscache_note_page_release(struct fscache_cookie *cookie);
-if the page has been released (ie. releasepage returned true).
+if the page has been released (ie. release_folio returned true).
Page release and page invalidation should also wait for any mark left on the
page to say that a DIO write is underway from that page::
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 6ccd5efb25b7..2e9aaa295125 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -1256,7 +1256,7 @@ inline encryption hardware will encrypt/decrypt the file contents.
When inline encryption isn't used, filesystems must encrypt/decrypt
the file contents themselves, as described below:
-For the read path (->readpage()) of regular files, filesystems can
+For the read path (->read_folio()) of regular files, filesystems can
read the ciphertext into the page cache and decrypt it in-place. The
page lock must be held until decryption has finished, to prevent the
page from becoming visible to userspace prematurely.
diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
index 8cc536d08f51..756f2c215ba1 100644
--- a/Documentation/filesystems/fsverity.rst
+++ b/Documentation/filesystems/fsverity.rst
@@ -70,12 +70,23 @@ must live on a read-write filesystem because they are independently
updated and potentially user-installed, so dm-verity cannot be used.
The base fs-verity feature is a hashing mechanism only; actually
-authenticating the files is up to userspace. However, to meet some
-users' needs, fs-verity optionally supports a simple signature
-verification mechanism where users can configure the kernel to require
-that all fs-verity files be signed by a key loaded into a keyring; see
-`Built-in signature verification`_. Support for fs-verity file hashes
-in IMA (Integrity Measurement Architecture) policies is also planned.
+authenticating the files may be done by:
+
+* Userspace-only
+
+* Builtin signature verification + userspace policy
+
+ fs-verity optionally supports a simple signature verification
+ mechanism where users can configure the kernel to require that
+ all fs-verity files be signed by a key loaded into a keyring;
+ see `Built-in signature verification`_.
+
+* Integrity Measurement Architecture (IMA)
+
+ IMA supports including fs-verity file digests and signatures in the
+ IMA measurement list and verifying fs-verity based file signatures
+ stored as security.ima xattrs, based on policy.
+
User API
========
@@ -548,7 +559,7 @@ already verified). Below, we describe how filesystems implement this.
Pagecache
~~~~~~~~~
-For filesystems using Linux's pagecache, the ``->readpage()`` and
+For filesystems using Linux's pagecache, the ``->read_folio()`` and
``->readahead()`` methods must be modified to verify pages before they
are marked Uptodate. Merely hooking ``->read_iter()`` would be
insufficient, since ``->read_iter()`` is not used for memory maps.
@@ -653,12 +664,12 @@ weren't already directly answered in other parts of this document.
hashed and what to do with those hashes, such as log them,
authenticate them, or add them to a measurement list.
- IMA is planned to support the fs-verity hashing mechanism as an
- alternative to doing full file hashes, for people who want the
- performance and security benefits of the Merkle tree based hash.
- But it doesn't make sense to force all uses of fs-verity to be
- through IMA. As a standalone filesystem feature, fs-verity
- already meets many users' needs, and it's testable like other
+ IMA supports the fs-verity hashing mechanism as an alternative
+ to full file hashes, for those who want the performance and
+ security benefits of the Merkle tree based hash. However, it
+ doesn't make sense to force all uses of fs-verity to be through
+ IMA. fs-verity already meets many users' needs even as a
+ standalone filesystem feature, and it's testable like other
filesystem features e.g. with xfstests.
:Q: Isn't fs-verity useless because the attacker can just modify the
diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst
index 7a879ec3b6bf..c1db8748389c 100644
--- a/Documentation/filesystems/idmappings.rst
+++ b/Documentation/filesystems/idmappings.rst
@@ -369,6 +369,11 @@ kernel maps the caller's userspace id down into a kernel id according to the
caller's idmapping and then maps that kernel id up according to the
filesystem's idmapping.
+Let's see some examples with caller/filesystem idmapping but without mount
+idmappings. This will exhibit some problems we can hit. After that we will
+revisit/reconsider these examples, this time using mount idmappings, to see how
+they can solve the problems we observed before.
+
Example 1
~~~~~~~~~
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index c26d854275a0..515bc48ab58b 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -237,20 +237,20 @@ address_space_operations
prototypes::
int (*writepage)(struct page *page, struct writeback_control *wbc);
- int (*readpage)(struct file *, struct page *);
+ int (*read_folio)(struct file *, struct folio *);
int (*writepages)(struct address_space *, struct writeback_control *);
bool (*dirty_folio)(struct address_space *, struct folio *folio);
void (*readahead)(struct readahead_control *);
int (*write_begin)(struct file *, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidate_folio) (struct folio *, size_t start, size_t len);
- int (*releasepage) (struct page *, int);
- void (*freepage)(struct page *);
+ bool (*release_folio)(struct folio *, gfp_t);
+ void (*free_folio)(struct folio *);
int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
bool (*isolate_page) (struct page *, isolate_mode_t);
int (*migratepage)(struct address_space *, struct page *, struct page *);
@@ -262,22 +262,22 @@ prototypes::
int (*swap_deactivate)(struct file *);
locking rules:
- All except dirty_folio and freepage may block
+ All except dirty_folio and free_folio may block
====================== ======================== ========= ===============
-ops PageLocked(page) i_rwsem invalidate_lock
+ops folio locked i_rwsem invalidate_lock
====================== ======================== ========= ===============
writepage: yes, unlocks (see below)
-readpage: yes, unlocks shared
+read_folio: yes, unlocks shared
writepages:
-dirty_folio maybe
+dirty_folio: maybe
readahead: yes, unlocks shared
write_begin: locks the page exclusive
write_end: yes, unlocks exclusive
bmap:
invalidate_folio: yes exclusive
-releasepage: yes
-freepage: yes
+release_folio: yes
+free_folio: yes
direct_IO:
isolate_page: yes
migratepage: yes (both)
@@ -289,13 +289,13 @@ swap_activate: no
swap_deactivate: no
====================== ======================== ========= ===============
-->write_begin(), ->write_end() and ->readpage() may be called from
+->write_begin(), ->write_end() and ->read_folio() may be called from
the request handler (/dev/loop).
-->readpage() unlocks the page, either synchronously or via I/O
+->read_folio() unlocks the folio, either synchronously or via I/O
completion.
-->readahead() unlocks the pages that I/O is attempted on like ->readpage().
+->readahead() unlocks the folios that I/O is attempted on like ->read_folio().
->writepage() is used for two purposes: for "memory cleansing" and for
"sync". These are quite different operations and the behaviour may differ
@@ -372,12 +372,12 @@ invalidate_lock before invalidating page cache in truncate / hole punch
path (and thus calling into ->invalidate_folio) to block races between page
cache invalidation and page cache filling functions (fault, read, ...).
-->releasepage() is called when the kernel is about to try to drop the
-buffers from the page in preparation for freeing it. It returns zero to
-indicate that the buffers are (or may be) freeable. If ->releasepage is zero,
-the kernel assumes that the fs has no private interest in the buffers.
+->release_folio() is called when the kernel is about to try to drop the
+buffers from the folio in preparation for freeing it. It returns true to
+indicate that the buffers are (or may be) freeable. If ->release_folio is
+NULL, the kernel assumes that the fs has no private interest in the buffers.
-->freepage() is called when the kernel is done dropping the page
+->free_folio() is called when the kernel has dropped the folio
from the page cache.
->launder_folio() may be called prior to releasing a folio if
diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst
index 69f00179fdfe..a80a59941d2f 100644
--- a/Documentation/filesystems/netfs_library.rst
+++ b/Documentation/filesystems/netfs_library.rst
@@ -96,7 +96,7 @@ attached to an inode (or NULL if fscache is disabled)::
Buffered Read Helpers
=====================
-The library provides a set of read helpers that handle the ->readpage(),
+The library provides a set of read helpers that handle the ->read_folio(),
->readahead() and much of the ->write_begin() VM operations and translate them
into a common call framework.
@@ -136,20 +136,19 @@ Read Helper Functions
Three read helpers are provided::
void netfs_readahead(struct readahead_control *ractl);
- int netfs_readpage(struct file *file,
- struct page *page);
+ int netfs_read_folio(struct file *file,
+ struct folio *folio);
int netfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos,
unsigned int len,
- unsigned int flags,
struct folio **_folio,
void **_fsdata);
Each corresponds to a VM address space operation. These operations use the
state in the per-inode context.
-For ->readahead() and ->readpage(), the network filesystem just point directly
+For ->readahead() and ->read_folio(), the network filesystem just points directly
at the corresponding read helper; whereas for ->write_begin(), it may be a
little more complicated as the network filesystem might want to flush
conflicting writes or track dirty data and needs to put the acquired folio if
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 7c1583dbeb59..2e0e4f0e0c6f 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -624,7 +624,7 @@ any symlink that might use page_follow_link_light/page_put_link() must
have inode_nohighmem(inode) called before anything might start playing with
its pagecache. No highmem pages should end up in the pagecache of such
symlinks. That includes any preseeding that might be done during symlink
-creation. __page_symlink() will honour the mapping gfp flags, so once
+creation. page_symlink() will honour the mapping gfp flags, so once
you've done inode_nohighmem() it's safe to use, but if you allocate and
insert the page manually, make sure to use the right gfp flags.
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 061744c436d9..6a0dd99786f9 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1183,85 +1183,7 @@ Provides counts of softirq handlers serviced since boot time, for each CPU.
HRTIMER: 0 0 0 0
RCU: 1678 1769 2178 2250
-
-1.3 IDE devices in /proc/ide
-----------------------------
-
-The subdirectory /proc/ide contains information about all IDE devices of which
-the kernel is aware. There is one subdirectory for each IDE controller, the
-file drivers and a link for each IDE device, pointing to the device directory
-in the controller specific subtree.
-
-The file 'drivers' contains general information about the drivers used for the
-IDE devices::
-
- > cat /proc/ide/drivers
- ide-cdrom version 4.53
- ide-disk version 1.08
-
-More detailed information can be found in the controller specific
-subdirectories. These are named ide0, ide1 and so on. Each of these
-directories contains the files shown in table 1-6.
-
-
-.. table:: Table 1-6: IDE controller info in /proc/ide/ide?
-
- ======= =======================================
- File Content
- ======= =======================================
- channel IDE channel (0 or 1)
- config Configuration (only for PCI/IDE bridge)
- mate Mate name
- model Type/Chipset of IDE controller
- ======= =======================================
-
-Each device connected to a controller has a separate subdirectory in the
-controllers directory. The files listed in table 1-7 are contained in these
-directories.
-
-
-.. table:: Table 1-7: IDE device information
-
- ================ ==========================================
- File Content
- ================ ==========================================
- cache The cache
- capacity Capacity of the medium (in 512Byte blocks)
- driver driver and version
- geometry physical and logical geometry
- identify device identify block
- media media type
- model device identifier
- settings device setup
- smart_thresholds IDE disk management thresholds
- smart_values IDE disk management values
- ================ ==========================================
-
-The most interesting file is ``settings``. This file contains a nice
-overview of the drive parameters::
-
- # cat /proc/ide/ide0/hda/settings
- name value min max mode
- ---- ----- --- --- ----
- bios_cyl 526 0 65535 rw
- bios_head 255 0 255 rw
- bios_sect 63 0 63 rw
- breada_readahead 4 0 127 rw
- bswap 0 0 1 r
- file_readahead 72 0 2097151 rw
- io_32bit 0 0 3 rw
- keepsettings 0 0 1 rw
- max_kb_per_request 122 1 127 rw
- multcount 0 0 8 rw
- nice1 1 0 1 rw
- nowerr 0 0 1 rw
- pio_mode write-only 0 255 w
- slow 0 0 1 rw
- unmaskirq 0 0 1 rw
- using_dma 0 0 1 rw
-
-
-1.4 Networking info in /proc/net
+1.3 Networking info in /proc/net
--------------------------------
The subdirectory /proc/net follows the usual pattern. Table 1-8 shows the
@@ -1340,7 +1262,7 @@ It will contain information that is specific to that bond, such as the
current slaves of the bond, the link status of the slaves, and how
many times the slaves link has failed.
-1.5 SCSI info
+1.4 SCSI info
-------------
If you have a SCSI host adapter in your system, you'll find a subdirectory
@@ -1403,7 +1325,7 @@ AHA-2940 SCSI adapter::
Total transfers 0 (0 reads and 0 writes)
-1.6 Parallel port info in /proc/parport
+1.5 Parallel port info in /proc/parport
---------------------------------------
The directory /proc/parport contains information about the parallel ports of
@@ -1428,7 +1350,7 @@ These directories contain the four files shown in Table 1-10.
number or none).
========= ====================================================================
-1.7 TTY info in /proc/tty
+1.6 TTY info in /proc/tty
-------------------------
Information about the available and actually used tty's can be found in the
@@ -1463,7 +1385,7 @@ To see which tty's are currently in use, you can simply look into the file
unknown /dev/tty 4 1-63 console
-1.8 Miscellaneous kernel statistics in /proc/stat
+1.7 Miscellaneous kernel statistics in /proc/stat
-------------------------------------------------
Various pieces of information about kernel activity are available in the
@@ -1536,7 +1458,7 @@ softirqs serviced; each subsequent column is the total for that particular
softirq.
-1.9 Ext4 file system parameters
+1.8 Ext4 file system parameters
-------------------------------
Information about mounted ext4 file systems can be found in
@@ -1552,7 +1474,7 @@ in Table 1-12, below.
mb_groups details of multiblock allocator buddy cache of free blocks
============== ==========================================================
-1.10 /proc/consoles
+1.9 /proc/consoles
-------------------
Shows registered system console lines.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 794bd1a66bfb..12a011d2cbc6 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -620,9 +620,9 @@ Writeback.
The first can be used independently to the others. The VM can try to
either write dirty pages in order to clean them, or release clean pages
in order to reuse them. To do this it can call the ->writepage method
-on dirty pages, and ->releasepage on clean pages with PagePrivate set.
-Clean pages without PagePrivate and with no external references will be
-released without notice being given to the address_space.
+on dirty pages, and ->release_folio on clean folios with the private
+flag set. Clean pages without PagePrivate and with no external references
+will be released without notice being given to the address_space.
To achieve this functionality, pages need to be placed on an LRU with
lru_cache_add and mark_page_active needs to be called whenever the page
@@ -656,7 +656,7 @@ by memory-mapping the page. Data is written into the address space by
the application, and then written-back to storage typically in whole
pages, however the address_space has finer control of write sizes.
-The read process essentially only requires 'readpage'. The write
+The read process essentially only requires 'read_folio'. The write
process is more complicated and uses write_begin/write_end or
dirty_folio to write data into the address_space, and writepage and
writepages to writeback data to storage.
@@ -722,20 +722,20 @@ cache in your filesystem. The following members are defined:
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
- int (*readpage)(struct file *, struct page *);
+ int (*read_folio)(struct file *, struct folio *);
int (*writepages)(struct address_space *, struct writeback_control *);
bool (*dirty_folio)(struct address_space *, struct folio *);
void (*readahead)(struct readahead_control *);
int (*write_begin)(struct file *, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidate_folio) (struct folio *, size_t start, size_t len);
- int (*releasepage) (struct page *, int);
- void (*freepage)(struct page *);
+ bool (*release_folio)(struct folio *, gfp_t);
+ void (*free_folio)(struct folio *);
ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
/* isolate a page for migration */
bool (*isolate_page) (struct page *, isolate_mode_t);
@@ -747,7 +747,7 @@ cache in your filesystem. The following members are defined:
bool (*is_partially_uptodate) (struct folio *, size_t from,
size_t count);
- void (*is_dirty_writeback) (struct page *, bool *, bool *);
+ void (*is_dirty_writeback)(struct folio *, bool *, bool *);
int (*error_remove_page) (struct mapping *mapping, struct page *page);
int (*swap_activate)(struct file *);
int (*swap_deactivate)(struct file *);
@@ -772,14 +772,14 @@ cache in your filesystem. The following members are defined:
See the file "Locking" for more details.
-``readpage``
- called by the VM to read a page from backing store. The page
- will be Locked when readpage is called, and should be unlocked
- and marked uptodate once the read completes. If ->readpage
- discovers that it needs to unlock the page for some reason, it
- can do so, and then return AOP_TRUNCATED_PAGE. In this case,
- the page will be relocated, relocked and if that all succeeds,
- ->readpage will be called again.
+``read_folio``
+ called by the VM to read a folio from backing store. The folio
+ will be locked when read_folio is called, and should be unlocked
+ and marked uptodate once the read completes. If ->read_folio
+ discovers that it cannot perform the I/O at this time, it can
+ unlock the folio and return AOP_TRUNCATED_PAGE. In this case,
+ the folio will be looked up again, relocked and if that all succeeds,
+ ->read_folio will be called again.
``writepages``
called by the VM to write out pages associated with the
@@ -832,9 +832,6 @@ cache in your filesystem. The following members are defined:
passed to write_begin is greater than the number of bytes copied
into the page).
- flags is a field for AOP_FLAG_xxx flags, described in
- include/linux/fs.h.
-
A void * may be returned in fsdata, which then gets passed into
write_end.
@@ -867,36 +864,35 @@ cache in your filesystem. The following members are defined:
address space. This generally corresponds to either a
truncation, punch hole or a complete invalidation of the address
space (in the latter case 'offset' will always be 0 and 'length'
- will be folio_size()). Any private data associated with the page
+ will be folio_size()). Any private data associated with the folio
should be updated to reflect this truncation. If offset is 0
and length is folio_size(), then the private data should be
- released, because the page must be able to be completely
- discarded. This may be done by calling the ->releasepage
+ released, because the folio must be able to be completely
+ discarded. This may be done by calling the ->release_folio
function, but in this case the release MUST succeed.
-``releasepage``
- releasepage is called on PagePrivate pages to indicate that the
- page should be freed if possible. ->releasepage should remove
- any private data from the page and clear the PagePrivate flag.
- If releasepage() fails for some reason, it must indicate failure
- with a 0 return value. releasepage() is used in two distinct
- though related cases. The first is when the VM finds a clean
- page with no active users and wants to make it a free page. If
- ->releasepage succeeds, the page will be removed from the
- address_space and become free.
+``release_folio``
+ release_folio is called on folios with private data to tell the
+ filesystem that the folio is about to be freed. ->release_folio
+ should remove any private data from the folio and clear the
+ private flag. If release_folio() fails, it should return false.
+ release_folio() is used in two distinct though related cases.
+ The first is when the VM wants to free a clean folio with no
+ active users. If ->release_folio succeeds, the folio will be
+ removed from the address_space and be freed.
The second case is when a request has been made to invalidate
- some or all pages in an address_space. This can happen through
- the fadvise(POSIX_FADV_DONTNEED) system call or by the
- filesystem explicitly requesting it as nfs and 9fs do (when they
+ some or all folios in an address_space. This can happen
+ through the fadvise(POSIX_FADV_DONTNEED) system call or by the
+ filesystem explicitly requesting it as nfs and 9p do (when they
believe the cache may be out of date with storage) by calling
invalidate_inode_pages2(). If the filesystem makes such a call,
- and needs to be certain that all pages are invalidated, then its
- releasepage will need to ensure this. Possibly it can clear the
- PageUptodate bit if it cannot free private data yet.
+ and needs to be certain that all folios are invalidated, then
+ its release_folio will need to ensure this. Possibly it can
+ clear the uptodate flag if it cannot free private data yet.
-``freepage``
- freepage is called once the page is no longer visible in the
+``free_folio``
+ free_folio is called once the folio is no longer visible in the
page cache in order to allow the cleanup of any private data.
Since it may be called by the memory reclaimer, it should not
assume that the original address_space mapping still exists, and
@@ -935,14 +931,14 @@ cache in your filesystem. The following members are defined:
without needing I/O to bring the whole page up to date.
``is_dirty_writeback``
- Called by the VM when attempting to reclaim a page. The VM uses
+ Called by the VM when attempting to reclaim a folio. The VM uses
dirty and writeback information to determine if it needs to
stall to allow flushers a chance to complete some IO.
- Ordinarily it can use PageDirty and PageWriteback but some
- filesystems have more complex state (unstable pages in NFS
+ Ordinarily it can use folio_test_dirty and folio_test_writeback but
+ some filesystems have more complex state (unstable folios in NFS
prevent reclaim) or do not set those flags due to locking
problems. This callback allows a filesystem to indicate to the
- VM if a page should be treated as dirty or writeback for the
+ VM if a folio should be treated as dirty or writeback for the
purposes of stalling.
``error_remove_page``
diff --git a/Documentation/filesystems/zonefs.rst b/Documentation/filesystems/zonefs.rst
index 6b213fe9a33e..394b9f15dce0 100644
--- a/Documentation/filesystems/zonefs.rst
+++ b/Documentation/filesystems/zonefs.rst
@@ -306,8 +306,15 @@ Further notes:
Mount options
-------------
-zonefs define the "errors=<behavior>" mount option to allow the user to specify
-zonefs behavior in response to I/O errors, inode size inconsistencies or zone
+zonefs defines several mount options:
+* errors=<behavior>
+* explicit-open
+
+"errors=<behavior>" option
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The "errors=<behavior>" mount option allows the user to specify zonefs
+behavior in response to I/O errors, inode size inconsistencies or zone
condition changes. The defined behaviors are as follow:
* remount-ro (default)
@@ -326,6 +333,9 @@ discover the amount of data that has been written to the zone. In the case of a
read-only zone discovered at run-time, as indicated in the previous section.
The size of the zone file is left unchanged from its last updated value.
+"explicit-open" option
+~~~~~~~~~~~~~~~~~~~~~~
+
A zoned block device (e.g. an NVMe Zoned Namespace device) may have limits on
the number of zones that can be active, that is, zones that are in the
implicit open, explicit open or closed conditions. This potential limitation
@@ -341,6 +351,44 @@ guaranteed that write requests can be processed. Conversely, the
to the device on the last close() of a zone file if the zone is not full nor
empty.
+Runtime sysfs attributes
+------------------------
+
+zonefs defines several sysfs attributes for mounted devices. All attributes
+are user readable and can be found in the directory /sys/fs/zonefs/<dev>/,
+where <dev> is the name of the mounted zoned block device.
+
+The attributes defined are as follows.
+
+* **max_wro_seq_files**: This attribute reports the maximum number of
+ sequential zone files that can be open for writing. This number corresponds
+ to the maximum number of explicitly or implicitly open zones that the device
+ supports. A value of 0 means that the device has no limit and that any zone
+ (any file) can be open for writing and written at any time, regardless of the
+ state of other zones. When the *explicit-open* mount option is used, zonefs
+ will fail any open() system call requesting to open a sequential zone file for
+ writing when the number of sequential zone files already open for writing has
+ reached the *max_wro_seq_files* limit.
+* **nr_wro_seq_files**: This attribute reports the current number of sequential
+ zone files open for writing. When the "explicit-open" mount option is used,
+ this number can never exceed *max_wro_seq_files*. If the *explicit-open*
+ mount option is not used, the reported number can be greater than
+ *max_wro_seq_files*. In such case, it is the responsibility of the
+ application to not write simultaneously more than *max_wro_seq_files*
+ sequential zone files. Failure to do so can result in write errors.
+* **max_active_seq_files**: This attribute reports the maximum number of
+ sequential zone files that are in an active state, that is, sequential zone
+ files that are partially written (not empty nor full) or that have a zone that
+ is explicitly open (which happens only if the *explicit-open* mount option is
+ used). This number is always equal to the maximum number of active zones that
+ the device supports. A value of 0 means that the mounted device has no limit
+ on the number of sequential zone files that can be active.
+* **nr_active_seq_files**: This attribute reports the current number of
+ sequential zone files that are active. If *max_active_seq_files* is not 0,
+ then the value of *nr_active_seq_files* can never exceed the value of
+ *max_active_seq_files*, regardless of the use of the *explicit-open* mount
+ option.
+
Zonefs User Space Tools
=======================
diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst
index 47fb4d6d4557..6b62425ef9cd 100644
--- a/Documentation/firmware-guide/acpi/enumeration.rst
+++ b/Documentation/firmware-guide/acpi/enumeration.rst
@@ -167,8 +167,7 @@ The table below shows an example of its usage::
Name (_DSD, Package () {
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
Package () {
- Package () {"interrupt-names",
- Package (2) {"default", "alert"}},
+ Package () { "interrupt-names", Package () { "default", "alert" } },
}
...
})
diff --git a/Documentation/hwmon/aquacomputer_d5next.rst b/Documentation/hwmon/aquacomputer_d5next.rst
index 3373e27b707d..717e28226cde 100644
--- a/Documentation/hwmon/aquacomputer_d5next.rst
+++ b/Documentation/hwmon/aquacomputer_d5next.rst
@@ -6,7 +6,9 @@ Kernel driver aquacomputer-d5next
Supported devices:
* Aquacomputer D5 Next watercooling pump
+* Aquacomputer Farbwerk RGB controller
* Aquacomputer Farbwerk 360 RGB controller
+* Aquacomputer Octo fan controller
Author: Aleksa Savic
@@ -28,7 +30,10 @@ seems to require sending it a complete configuration. That includes addressable
RGB LEDs, for which there is no standard sysfs interface. Thus, that task is
better suited for userspace tools.
-The Farbwerk 360 exposes four temperature sensors. Depending on the device,
+The Octo exposes four temperature sensors and eight PWM controllable fans, along
+with their speed (in RPM), power, voltage and current.
+
+The Farbwerk and Farbwerk 360 expose four temperature sensors. Depending on the device,
not all sysfs and debugfs entries will be available.
Usage notes
diff --git a/Documentation/hwmon/asus_ec_sensors.rst b/Documentation/hwmon/asus_ec_sensors.rst
index e7e8f1640f45..78ca69eda877 100644
--- a/Documentation/hwmon/asus_ec_sensors.rst
+++ b/Documentation/hwmon/asus_ec_sensors.rst
@@ -4,17 +4,20 @@ Kernel driver asus_ec_sensors
=================================
Supported boards:
- * PRIME X570-PRO,
- * Pro WS X570-ACE,
- * ROG CROSSHAIR VIII DARK HERO,
+ * PRIME X470-PRO
+ * PRIME X570-PRO
+ * Pro WS X570-ACE
+ * ProArt X570-CREATOR WIFI
+ * ROG CROSSHAIR VIII DARK HERO
* ROG CROSSHAIR VIII HERO (WI-FI)
- * ROG CROSSHAIR VIII FORMULA,
- * ROG CROSSHAIR VIII HERO,
- * ROG CROSSHAIR VIII IMPACT,
- * ROG STRIX B550-E GAMING,
- * ROG STRIX B550-I GAMING,
- * ROG STRIX X570-E GAMING,
- * ROG STRIX X570-F GAMING,
+ * ROG CROSSHAIR VIII FORMULA
+ * ROG CROSSHAIR VIII HERO
+ * ROG CROSSHAIR VIII IMPACT
+ * ROG STRIX B550-E GAMING
+ * ROG STRIX B550-I GAMING
+ * ROG STRIX X570-E GAMING
+ * ROG STRIX X570-E GAMING WIFI II
+ * ROG STRIX X570-F GAMING
* ROG STRIX X570-I GAMING
Authors:
@@ -52,3 +55,5 @@ Module Parameters
the path is mostly identical for them). If ASUS changes this path
in a future BIOS update, this parameter can be used to override
the stored in the driver value until it gets updated.
+ A special string ":GLOBAL_LOCK" can be passed to use the ACPI
+ global lock instead of a dedicated mutex.
diff --git a/Documentation/hwmon/dell-smm-hwmon.rst b/Documentation/hwmon/dell-smm-hwmon.rst
index d3323a96665d..e5d85e40972c 100644
--- a/Documentation/hwmon/dell-smm-hwmon.rst
+++ b/Documentation/hwmon/dell-smm-hwmon.rst
@@ -86,6 +86,13 @@ probe the BIOS on your machine and discover the appropriate codes.
Again, when you find new codes, we'd be happy to have your patches!
+``thermal`` interface
+---------------------------
+
+The driver also exports the fans as thermal cooling devices with
+``type`` set to ``dell-smm-fan[1-3]``. This allows for easy fan control
+using one of the thermal governors.
+
Module parameters
-----------------
@@ -324,6 +331,8 @@ Reading of fan types causes erratic fan behaviour. Studio XPS 8000
Inspiron 580
+ Inspiron 3505
+
Fan-related SMM calls take too long (about 500ms). Inspiron 7720
Vostro 3360
diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst
index c41eb6108103..f3276b3a381a 100644
--- a/Documentation/hwmon/hwmon-kernel-api.rst
+++ b/Documentation/hwmon/hwmon-kernel-api.rst
@@ -50,6 +50,10 @@ register/unregister functions::
void devm_hwmon_device_unregister(struct device *dev);
+ char *hwmon_sanitize_name(const char *name);
+
+ char *devm_hwmon_sanitize_name(struct device *dev, const char *name);
+
hwmon_device_register_with_groups registers a hardware monitoring device.
The first parameter of this function is a pointer to the parent device.
The name parameter is a pointer to the hwmon device name. The registration
@@ -72,7 +76,7 @@ hwmon_device_register_with_info is the most comprehensive and preferred means
to register a hardware monitoring device. It creates the standard sysfs
attributes in the hardware monitoring core, letting the driver focus on reading
from and writing to the chip instead of having to bother with sysfs attributes.
-The parent device parameter cannot be NULL with non-NULL chip info. Its
+The parent device parameter as well as the chip parameter must not be NULL. Its
parameters are described in more detail below.
devm_hwmon_device_register_with_info is similar to
@@ -95,6 +99,18 @@ All supported hwmon device registration functions only accept valid device
names. Device names including invalid characters (whitespace, '*', or '-')
will be rejected. The 'name' parameter is mandatory.
+If the driver doesn't use a static device name (for example it uses
+dev_name()), and therefore cannot make sure the name only contains valid
+characters, hwmon_sanitize_name can be used. This convenience function
+will duplicate the string and replace any invalid characters with an
+underscore. It will allocate memory for the new string and it is the
+responsibility of the caller to release the memory when the device is
+removed.
+
+devm_hwmon_sanitize_name is the resource managed version of
+hwmon_sanitize_name; the memory will be freed automatically on device
+removal.
+
Using devm_hwmon_device_register_with_info()
--------------------------------------------
diff --git a/Documentation/hwmon/index.rst b/Documentation/hwmon/index.rst
index 863b76289159..a72c16872ec2 100644
--- a/Documentation/hwmon/index.rst
+++ b/Documentation/hwmon/index.rst
@@ -90,6 +90,7 @@ Hardware Monitoring Kernel Drivers
jc42
k10temp
k8temp
+ lan966x
lineage-pem
lm25066
lm63
@@ -223,6 +224,7 @@ Hardware Monitoring Kernel Drivers
wm8350
xgene-hwmon
xdpe12284
+ xdpe152c4
zl6100
.. only:: subproject and html
diff --git a/Documentation/hwmon/lan966x.rst b/Documentation/hwmon/lan966x.rst
new file mode 100644
index 000000000000..1d1724afa5d2
--- /dev/null
+++ b/Documentation/hwmon/lan966x.rst
@@ -0,0 +1,40 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Kernel driver lan966x-hwmon
+===========================
+
+Supported chips:
+
+ * Microchip LAN9668 (sensor in SoC)
+
+ Prefix: 'lan9668-hwmon'
+
+ Datasheet: https://microchip-ung.github.io/lan9668_reginfo
+
+Authors:
+
+ Michael Walle <michael@walle.cc>
+
+Description
+-----------
+
+This driver implements support for the Microchip LAN9668 on-chip
+temperature sensor as well as its fan controller. It provides one
+temperature sensor and one fan controller. The temperature range
+of the sensor is specified from -40 to +125 degrees Celsius and
+its accuracy is +/- 5 degrees Celsius. The fan controller has a
+tacho input and a PWM output with a customizable PWM output
+frequency ranging from ~20Hz to ~650kHz.
+
+No alarms are supported by the SoC.
+
+The driver exports temperature values, fan tacho input and PWM
+settings via the following sysfs files:
+
+**temp1_input**
+
+**fan1_input**
+
+**pwm1**
+
+**pwm1_freq**
diff --git a/Documentation/hwmon/max16601.rst b/Documentation/hwmon/max16601.rst
index 92c0a7d7808c..6a4eef8efbaf 100644
--- a/Documentation/hwmon/max16601.rst
+++ b/Documentation/hwmon/max16601.rst
@@ -21,6 +21,14 @@ Supported chips:
Datasheet: Not published
+ * Maxim MAX16602
+
+ Prefix: 'max16602'
+
+ Addresses scanned: -
+
+ Datasheet: https://datasheets.maximintegrated.com/en/ds/MAX16602.pdf
+
Author: Guenter Roeck <linux@roeck-us.net>
diff --git a/Documentation/hwmon/xdpe152c4.rst b/Documentation/hwmon/xdpe152c4.rst
new file mode 100644
index 000000000000..ab92c32d4d69
--- /dev/null
+++ b/Documentation/hwmon/xdpe152c4.rst
@@ -0,0 +1,118 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Kernel driver xdpe152
+=====================
+
+Supported chips:
+
+ * Infineon XDPE152C4
+
+ Prefix: 'xdpe152c4'
+
+ * Infineon XDPE15284
+
+ Prefix: 'xdpe15284'
+
+Authors:
+
+ Greg Schwendimann <greg.schwendimann@infineon.com>
+
+Description
+-----------
+
+This driver implements support for Infineon Digital Multi-phase Controller
+XDPE152C4 and XDPE15284 dual loop voltage regulators.
+The devices are compliant with:
+
+- Intel VR13, VR13HC and VR14 rev 1.86
+ converter specification.
+- Intel SVID rev 1.93 protocol.
+- PMBus rev 1.3.1 interface.
+
+Devices support linear format for reading input and output voltage, input
+and output current, input and output power and temperature.
+
+Devices support two pages for telemetry.
+
+The driver provides for current: input, maximum and critical thresholds
+and maximum and critical alarms. Low Critical thresholds and Low critical alarm are
+supported only for current output.
+The driver exports the following attributes via the sysfs files, where
+indexes 1, 2 are for "iin" and 3, 4 for "iout":
+
+**curr[1-4]_crit**
+
+**curr[1-4]_crit_alarm**
+
+**curr[1-4]_input**
+
+**curr[1-4]_label**
+
+**curr[1-4]_max**
+
+**curr[1-4]_max_alarm**
+
+**curr[3-4]_lcrit**
+
+**curr[3-4]_lcrit_alarm**
+
+**curr[3-4]_rated_max**
+
+The driver provides for voltage: input, critical and low critical thresholds
+and critical and low critical alarms.
+The driver exports the following attributes via the sysfs files, where
+indexes 1, 2 are for "vin" and 3, 4 for "vout":
+
+**in[1-4]_min**
+
+**in[1-4]_crit**
+
+**in[1-4]_crit_alarm**
+
+**in[1-4]_input**
+
+**in[1-4]_label**
+
+**in[1-4]_max**
+
+**in[1-4]_max_alarm**
+
+**in[1-4]_min**
+
+**in[1-4]_min_alarm**
+
+**in[3-4]_lcrit**
+
+**in[3-4]_lcrit_alarm**
+
+**in[3-4]_rated_max**
+
+**in[3-4]_rated_min**
+
+The driver provides for power: input and alarms.
+The driver exports the following attributes via the sysfs files, where
+indexes 1, 2 are for "pin" and 3, 4 for "pout":
+
+**power[1-2]_alarm**
+
+**power[1-4]_input**
+
+**power[1-4]_label**
+
+**power[1-4]_max**
+
+**power[1-4]_rated_max**
+
+The driver provides for temperature: input, maximum and critical thresholds
+and maximum and critical alarms.
+The driver exports the following attributes via the sysfs files:
+
+**temp[1-2]_crit**
+
+**temp[1-2]_crit_alarm**
+
+**temp[1-2]_input**
+
+**temp[1-2]_max**
+
+**temp[1-2]_max_alarm**
diff --git a/Documentation/ide/ChangeLog.ide-cd.1994-2004 b/Documentation/ide/ChangeLog.ide-cd.1994-2004
deleted file mode 100644
index 4cc3ad99f39b..000000000000
--- a/Documentation/ide/ChangeLog.ide-cd.1994-2004
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * 1.00 Oct 31, 1994 -- Initial version.
- * 1.01 Nov 2, 1994 -- Fixed problem with starting request in
- * cdrom_check_status.
- * 1.03 Nov 25, 1994 -- leaving unmask_intr[] as a user-setting (as for disks)
- * (from mlord) -- minor changes to cdrom_setup()
- * -- renamed ide_dev_s to ide_drive_t, enable irq on command
- * 2.00 Nov 27, 1994 -- Generalize packet command interface;
- * add audio ioctls.
- * 2.01 Dec 3, 1994 -- Rework packet command interface to handle devices
- * which send an interrupt when ready for a command.
- * 2.02 Dec 11, 1994 -- Cache the TOC in the driver.
- * Don't use SCMD_PLAYAUDIO_TI; it's not included
- * in the current version of ATAPI.
- * Try to use LBA instead of track or MSF addressing
- * when possible.
- * Don't wait for READY_STAT.
- * 2.03 Jan 10, 1995 -- Rewrite block read routines to handle block sizes
- * other than 2k and to move multiple sectors in a
- * single transaction.
- * 2.04 Apr 21, 1995 -- Add work-around for Creative Labs CD220E drives.
- * Thanks to Nick Saw <cwsaw@pts7.pts.mot.com> for
- * help in figuring this out. Ditto for Acer and
- * Aztech drives, which seem to have the same problem.
- * 2.04b May 30, 1995 -- Fix to match changes in ide.c version 3.16 -ml
- * 2.05 Jun 8, 1995 -- Don't attempt to retry after an illegal request
- * or data protect error.
- * Use HWIF and DEV_HWIF macros as in ide.c.
- * Always try to do a request_sense after
- * a failed command.
- * Include an option to give textual descriptions
- * of ATAPI errors.
- * Fix a bug in handling the sector cache which
- * showed up if the drive returned data in 512 byte
- * blocks (like Pioneer drives). Thanks to
- * Richard Hirst <srh@gpt.co.uk> for diagnosing this.
- * Properly supply the page number field in the
- * MODE_SELECT command.
- * PLAYAUDIO12 is broken on the Aztech; work around it.
- * 2.05x Aug 11, 1995 -- lots of data structure renaming/restructuring in ide.c
- * (my apologies to Scott, but now ide-cd.c is independent)
- * 3.00 Aug 22, 1995 -- Implement CDROMMULTISESSION ioctl.
- * Implement CDROMREADAUDIO ioctl (UNTESTED).
- * Use input_ide_data() and output_ide_data().
- * Add door locking.
- * Fix usage count leak in cdrom_open, which happened
- * when a read-write mount was attempted.
- * Try to load the disk on open.
- * Implement CDROMEJECT_SW ioctl (off by default).
- * Read total cdrom capacity during open.
- * Rearrange logic in cdrom_decode_status. Issue
- * request sense commands for failed packet commands
- * from here instead of from cdrom_queue_packet_command.
- * Fix a race condition in retrieving error information.
- * Suppress printing normal unit attention errors and
- * some drive not ready errors.
- * Implement CDROMVOLREAD ioctl.
- * Implement CDROMREADMODE1/2 ioctls.
- * Fix race condition in setting up interrupt handlers
- * when the `serialize' option is used.
- * 3.01 Sep 2, 1995 -- Fix ordering of reenabling interrupts in
- * cdrom_queue_request.
- * Another try at using ide_[input,output]_data.
- * 3.02 Sep 16, 1995 -- Stick total disk capacity in partition table as well.
- * Make VERBOSE_IDE_CD_ERRORS dump failed command again.
- * Dump out more information for ILLEGAL REQUEST errs.
- * Fix handling of errors occurring before the
- * packet command is transferred.
- * Fix transfers with odd bytelengths.
- * 3.03 Oct 27, 1995 -- Some Creative drives have an id of just `CD'.
- * `DCI-2S10' drives are broken too.
- * 3.04 Nov 20, 1995 -- So are Vertos drives.
- * 3.05 Dec 1, 1995 -- Changes to go with overhaul of ide.c and ide-tape.c
- * 3.06 Dec 16, 1995 -- Add support needed for partitions.
- * More workarounds for Vertos bugs (based on patches
- * from Holger Dietze <dietze@aix520.informatik.uni-leipzig.de>).
- * Try to eliminate byteorder assumptions.
- * Use atapi_cdrom_subchnl struct definition.
- * Add STANDARD_ATAPI compilation option.
- * 3.07 Jan 29, 1996 -- More twiddling for broken drives: Sony 55D,
- * Vertos 300.
- * Add NO_DOOR_LOCKING configuration option.
- * Handle drive_cmd requests w/NULL args (for hdparm -t).
- * Work around sporadic Sony55e audio play problem.
- * 3.07a Feb 11, 1996 -- check drive->id for NULL before dereferencing, to fix
- * problem with "hde=cdrom" with no drive present. -ml
- * 3.08 Mar 6, 1996 -- More Vertos workarounds.
- * 3.09 Apr 5, 1996 -- Add CDROMCLOSETRAY ioctl.
- * Switch to using MSF addressing for audio commands.
- * Reformat to match kernel tabbing style.
- * Add CDROM_GET_UPC ioctl.
- * 3.10 Apr 10, 1996 -- Fix compilation error with STANDARD_ATAPI.
- * 3.11 Apr 29, 1996 -- Patch from Heiko Eißfeldt <heiko@colossus.escape.de>
- * to remove redundant verify_area calls.
- * 3.12 May 7, 1996 -- Rudimentary changer support. Based on patches
- * from Gerhard Zuber <zuber@berlin.snafu.de>.
- * Let open succeed even if there's no loaded disc.
- * 3.13 May 19, 1996 -- Fixes for changer code.
- * 3.14 May 29, 1996 -- Add work-around for Vertos 600.
- * (From Hennus Bergman <hennus@sky.ow.nl>.)
- * 3.15 July 2, 1996 -- Added support for Sanyo 3 CD changers
- * from Ben Galliart <bgallia@luc.edu> with
- * special help from Jeff Lightfoot
- * <jeffml@pobox.com>
- * 3.15a July 9, 1996 -- Improved Sanyo 3 CD changer identification
- * 3.16 Jul 28, 1996 -- Fix from Gadi to reduce kernel stack usage for ioctl.
- * 3.17 Sep 17, 1996 -- Tweak audio reads for some drives.
- * Start changing CDROMLOADFROMSLOT to CDROM_SELECT_DISC.
- * 3.18 Oct 31, 1996 -- Added module and DMA support.
- *
- * 4.00 Nov 5, 1996 -- New ide-cd maintainer,
- * Erik B. Andersen <andersee@debian.org>
- * -- Newer Creative drives don't always set the error
- * register correctly. Make sure we see media changes
- * regardless.
- * -- Integrate with generic cdrom driver.
- * -- CDROMGETSPINDOWN and CDROMSETSPINDOWN ioctls, based on
- * a patch from Ciro Cattuto <>.
- * -- Call set_device_ro.
- * -- Implement CDROMMECHANISMSTATUS and CDROMSLOTTABLE
- * ioctls, based on patch by Erik Andersen
- * -- Add some probes of drive capability during setup.
- *
- * 4.01 Nov 11, 1996 -- Split into ide-cd.c and ide-cd.h
- * -- Removed CDROMMECHANISMSTATUS and CDROMSLOTTABLE
- * ioctls in favor of a generalized approach
- * using the generic cdrom driver.
- * -- Fully integrated with the 2.1.X kernel.
- * -- Other stuff that I forgot (lots of changes)
- *
- * 4.02 Dec 01, 1996 -- Applied patch from Gadi Oxman <gadio@netvision.net.il>
- * to fix the drive door locking problems.
- *
- * 4.03 Dec 04, 1996 -- Added DSC overlap support.
- * 4.04 Dec 29, 1996 -- Added CDROMREADRAW ioclt based on patch
- * by Ales Makarov (xmakarov@sun.felk.cvut.cz)
- *
- * 4.05 Nov 20, 1997 -- Modified to print more drive info on init
- * Minor other changes
- * Fix errors on CDROMSTOP (If you have a "Dolphin",
- * you must define IHAVEADOLPHIN)
- * Added identifier so new Sanyo CD-changer works
- * Better detection if door locking isn't supported
- *
- * 4.06 Dec 17, 1997 -- fixed endless "tray open" messages -ml
- * 4.07 Dec 17, 1997 -- fallback to set pc->stat on "tray open"
- * 4.08 Dec 18, 1997 -- spew less noise when tray is empty
- * -- fix speed display for ACER 24X, 18X
- * 4.09 Jan 04, 1998 -- fix handling of the last block so we return
- * an end of file instead of an I/O error (Gadi)
- * 4.10 Jan 24, 1998 -- fixed a bug so now changers can change to a new
- * slot when there is no disc in the current slot.
- * -- Fixed a memory leak where info->changer_info was
- * malloc'ed but never free'd when closing the device.
- * -- Cleaned up the global namespace a bit by making more
- * functions static that should already have been.
- * 4.11 Mar 12, 1998 -- Added support for the CDROM_SELECT_SPEED ioctl
- * based on a patch for 2.0.33 by Jelle Foks
- * <jelle@scintilla.utwente.nl>, a patch for 2.0.33
- * by Toni Giorgino <toni@pcape2.pi.infn.it>, the SCSI
- * version, and my own efforts. -erik
- * -- Fixed a stupid bug which egcs was kind enough to
- * inform me of where "Illegal mode for this track"
- * was never returned due to a comparison on data
- * types of limited range.
- * 4.12 Mar 29, 1998 -- Fixed bug in CDROM_SELECT_SPEED so write speed is
- * now set ionly for CD-R and CD-RW drives. I had
- * removed this support because it produced errors.
- * It produced errors _only_ for non-writers. duh.
- * 4.13 May 05, 1998 -- Suppress useless "in progress of becoming ready"
- * messages, since this is not an error.
- * -- Change error messages to be const
- * -- Remove a "\t" which looks ugly in the syslogs
- * 4.14 July 17, 1998 -- Change to pointing to .ps version of ATAPI spec
- * since the .pdf version doesn't seem to work...
- * -- Updated the TODO list to something more current.
- *
- * 4.15 Aug 25, 1998 -- Updated ide-cd.h to respect machine endianness,
- * patch thanks to "Eddie C. Dost" <ecd@skynet.be>
- *
- * 4.50 Oct 19, 1998 -- New maintainers!
- * Jens Axboe <axboe@image.dk>
- * Chris Zwilling <chris@cloudnet.com>
- *
- * 4.51 Dec 23, 1998 -- Jens Axboe <axboe@image.dk>
- * - ide_cdrom_reset enabled since the ide subsystem
- * handles resets fine now. <axboe@image.dk>
- * - Transfer size fix for Samsung CD-ROMs, thanks to
- * "Ville Hallik" <ville.hallik@mail.ee>.
- * - other minor stuff.
- *
- * 4.52 Jan 19, 1999 -- Jens Axboe <axboe@image.dk>
- * - Detect DVD-ROM/RAM drives
- *
- * 4.53 Feb 22, 1999 - Include other model Samsung and one Goldstar
- * drive in transfer size limit.
- * - Fix the I/O error when doing eject without a medium
- * loaded on some drives.
- * - CDROMREADMODE2 is now implemented through
- * CDROMREADRAW, since many drives don't support
- * MODE2 (even though ATAPI 2.6 says they must).
- * - Added ignore parameter to ide-cd (as a module), eg
- * insmod ide-cd ignore='hda hdb'
- * Useful when using ide-cd in conjunction with
- * ide-scsi. TODO: non-modular way of doing the
- * same.
- *
- * 4.54 Aug 5, 1999 - Support for MMC2 class commands through the generic
- * packet interface to cdrom.c.
- * - Unified audio ioctl support, most of it.
- * - cleaned up various deprecated verify_area().
- * - Added ide_cdrom_packet() as the interface for
- * the Uniform generic_packet().
- * - bunch of other stuff, will fill in logs later.
- * - report 1 slot for non-changers, like the other
- * cd-rom drivers. don't report select disc for
- * non-changers as well.
- * - mask out audio playing, if the device can't do it.
- *
- * 4.55 Sep 1, 1999 - Eliminated the rest of the audio ioctls, except
- * for CDROMREADTOC[ENTRY|HEADER]. Some of the drivers
- * use this independently of the actual audio handling.
- * They will disappear later when I get the time to
- * do it cleanly.
- * - Minimize the TOC reading - only do it when we
- * know a media change has occurred.
- * - Moved all the CDROMREADx ioctls to the Uniform layer.
- * - Heiko Eißfeldt <heiko@colossus.escape.de> supplied
- * some fixes for CDI.
- * - CD-ROM leaving door locked fix from Andries
- * Brouwer <Andries.Brouwer@cwi.nl>
- * - Erik Andersen <andersen@xmission.com> unified
- * commands across the various drivers and how
- * sense errors are handled.
- *
- * 4.56 Sep 12, 1999 - Removed changer support - it is now in the
- * Uniform layer.
- * - Added partition based multisession handling.
- * - Mode sense and mode select moved to the
- * Uniform layer.
- * - Fixed a problem with WPI CDS-32X drive - it
- * failed the capabilities
- *
- * 4.57 Apr 7, 2000 - Fixed sense reporting.
- * - Fixed possible oops in ide_cdrom_get_last_session()
- * - Fix locking mania and make ide_cdrom_reset relock
- * - Stop spewing errors to log when magicdev polls with
- * TEST_UNIT_READY on some drives.
- * - Various fixes from Tobias Ringstrom:
- * tray if it was locked prior to the reset.
- * - cdrom_read_capacity returns one frame too little.
- * - Fix real capacity reporting.
- *
- * 4.58 May 1, 2000 - Clean up ACER50 stuff.
- * - Fix small problem with ide_cdrom_capacity
- *
- * 4.59 Aug 11, 2000 - Fix changer problem in cdrom_read_toc, we weren't
- * correctly sensing a disc change.
- * - Rearranged some code
- * - Use extended sense on drives that support it for
- * correctly reporting tray status -- from
- * Michael D Johnson <johnsom@orst.edu>
- * 4.60 Dec 17, 2003 - Add mt rainier support
- * - Bump timeout for packet commands, matches sr
- * - Odd stuff
- * 4.61 Jan 22, 2004 - support hardware sector sizes other than 2kB,
- * Pascal Schmidt <der.eremit@email.de>
- */
diff --git a/Documentation/ide/ChangeLog.ide-floppy.1996-2002 b/Documentation/ide/ChangeLog.ide-floppy.1996-2002
deleted file mode 100644
index 46c19ef32a9e..000000000000
--- a/Documentation/ide/ChangeLog.ide-floppy.1996-2002
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Many thanks to Lode Leroy <Lode.Leroy@www.ibase.be>, who tested so many
- * ALPHA patches to this driver on an EASYSTOR LS-120 ATAPI floppy drive.
- *
- * Ver 0.1 Oct 17 96 Initial test version, mostly based on ide-tape.c.
- * Ver 0.2 Oct 31 96 Minor changes.
- * Ver 0.3 Dec 2 96 Fixed error recovery bug.
- * Ver 0.4 Jan 26 97 Add support for the HDIO_GETGEO ioctl.
- * Ver 0.5 Feb 21 97 Add partitions support.
- * Use the minimum of the LBA and CHS capacities.
- * Avoid hwgroup->rq == NULL on the last irq.
- * Fix potential null dereferencing with DEBUG_LOG.
- * Ver 0.8 Dec 7 97 Increase irq timeout from 10 to 50 seconds.
- * Add media write-protect detection.
- * Issue START command only if TEST UNIT READY fails.
- * Add work-around for IOMEGA ZIP revision 21.D.
- * Remove idefloppy_get_capabilities().
- * Ver 0.9 Jul 4 99 Fix a bug which might have caused the number of
- * bytes requested on each interrupt to be zero.
- * Thanks to <shanos@es.co.nz> for pointing this out.
- * Ver 0.9.sv Jan 6 01 Sam Varshavchik <mrsam@courier-mta.com>
- * Implement low level formatting. Reimplemented
- * IDEFLOPPY_CAPABILITIES_PAGE, since we need the srfp
- * bit. My LS-120 drive barfs on
- * IDEFLOPPY_CAPABILITIES_PAGE, but maybe it's just me.
- * Compromise by not reporting a failure to get this
- * mode page. Implemented four IOCTLs in order to
- * implement formatting. IOCTls begin with 0x4600,
- * 0x46 is 'F' as in Format.
- * Jan 9 01 Userland option to select format verify.
- * Added PC_SUPPRESS_ERROR flag - some idefloppy drives
- * do not implement IDEFLOPPY_CAPABILITIES_PAGE, and
- * return a sense error. Suppress error reporting in
- * this particular case in order to avoid spurious
- * errors in syslog. The culprit is
- * idefloppy_get_capability_page(), so move it to
- * idefloppy_begin_format() so that it's not used
- * unless absolutely necessary.
- * If drive does not support format progress indication
- * monitor the dsc bit in the status register.
- * Also, O_NDELAY on open will allow the device to be
- * opened without a disk available. This can be used to
- * open an unformatted disk, or get the device capacity.
- * Ver 0.91 Dec 11 99 Added IOMEGA Clik! drive support by
- * <paul@paulbristow.net>
- * Ver 0.92 Oct 22 00 Paul Bristow became official maintainer for this
- * driver. Included Powerbook internal zip kludge.
- * Ver 0.93 Oct 24 00 Fixed bugs for Clik! drive
- * no disk on insert and disk change now works
- * Ver 0.94 Oct 27 00 Tidied up to remove strstr(Clik) everywhere
- * Ver 0.95 Nov 7 00 Brought across to kernel 2.4
- * Ver 0.96 Jan 7 01 Actually in line with release version of 2.4.0
- * including set_bit patch from Rusty Russell
- * Ver 0.97 Jul 22 01 Merge 0.91-0.96 onto 0.9.sv for ac series
- * Ver 0.97.sv Aug 3 01 Backported from 2.4.7-ac3
- * Ver 0.98 Oct 26 01 Split idefloppy_transfer_pc into two pieces to
- * fix a lost interrupt problem. It appears the busy
- * bit was being deasserted by my IOMEGA ATAPI ZIP 100
- * drive before the drive was actually ready.
- * Ver 0.98a Oct 29 01 Expose delay value so we can play.
- * Ver 0.99 Feb 24 02 Remove duplicate code, modify clik! detection code
- * to support new PocketZip drives
- */
diff --git a/Documentation/ide/ChangeLog.ide-tape.1995-2002 b/Documentation/ide/ChangeLog.ide-tape.1995-2002
deleted file mode 100644
index 877fac8770b3..000000000000
--- a/Documentation/ide/ChangeLog.ide-tape.1995-2002
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Ver 0.1 Nov 1 95 Pre-working code :-)
- * Ver 0.2 Nov 23 95 A short backup (few megabytes) and restore procedure
- * was successful ! (Using tar cvf ... on the block
- * device interface).
- * A longer backup resulted in major swapping, bad
- * overall Linux performance and eventually failed as
- * we received non serial read-ahead requests from the
- * buffer cache.
- * Ver 0.3 Nov 28 95 Long backups are now possible, thanks to the
- * character device interface. Linux's responsiveness
- * and performance doesn't seem to be much affected
- * from the background backup procedure.
- * Some general mtio.h magnetic tape operations are
- * now supported by our character device. As a result,
- * popular tape utilities are starting to work with
- * ide tapes :-)
- * The following configurations were tested:
- * 1. An IDE ATAPI TAPE shares the same interface
- * and irq with an IDE ATAPI CDROM.
- * 2. An IDE ATAPI TAPE shares the same interface
- * and irq with a normal IDE disk.
- * Both configurations seemed to work just fine !
- * However, to be on the safe side, it is meanwhile
- * recommended to give the IDE TAPE its own interface
- * and irq.
- * The one thing which needs to be done here is to
- * add a "request postpone" feature to ide.c,
- * so that we won't have to wait for the tape to finish
- * performing a long media access (DSC) request (such
- * as a rewind) before we can access the other device
- * on the same interface. This effect doesn't disturb
- * normal operation most of the time because read/write
- * requests are relatively fast, and once we are
- * performing one tape r/w request, a lot of requests
- * from the other device can be queued and ide.c will
- * service all of them after this single tape request.
- * Ver 1.0 Dec 11 95 Integrated into Linux 1.3.46 development tree.
- * On each read / write request, we now ask the drive
- * if we can transfer a constant number of bytes
- * (a parameter of the drive) only to its buffers,
- * without causing actual media access. If we can't,
- * we just wait until we can by polling the DSC bit.
- * This ensures that while we are not transferring
- * more bytes than the constant referred to above, the
- * interrupt latency will not become too high and
- * we won't cause an interrupt timeout, as happened
- * occasionally in the previous version.
- * While polling for DSC, the current request is
- * postponed and ide.c is free to handle requests from
- * the other device. This is handled transparently to
- * ide.c. The hwgroup locking method which was used
- * in the previous version was removed.
- * Use of new general features which are provided by
- * ide.c for use with atapi devices.
- * (Programming done by Mark Lord)
- * Few potential bug fixes (Again, suggested by Mark)
- * Single character device data transfers are now
- * not limited in size, as they were before.
- * We are asking the tape about its recommended
- * transfer unit and send a larger data transfer
- * as several transfers of the above size.
- * For best results, use an integral number of this
- * basic unit (which is shown during driver
- * initialization). I will soon add an ioctl to get
- * this important parameter.
- * Our data transfer buffer is allocated on startup,
- * rather than before each data transfer. This should
- * ensure that we will indeed have a data buffer.
- * Ver 1.1 Dec 14 95 Fixed random problems which occurred when the tape
- * shared an interface with another device.
- * (poll_for_dsc was a complete mess).
- * Removed some old (non-active) code which had
- * to do with supporting buffer cache originated
- * requests.
- * The block device interface can now be opened, so
- * that general ide driver features like the unmask
- * interrupts flag can be selected with an ioctl.
- * This is the only use of the block device interface.
- * New fast pipelined operation mode (currently only on
- * writes). When using the pipelined mode, the
- * throughput can potentially reach the maximum
- * tape supported throughput, regardless of the
- * user backup program. On my tape drive, it sometimes
- * boosted performance by a factor of 2. Pipelined
- * mode is enabled by default, but since it has a few
- * downfalls as well, you may want to disable it.
- * A short explanation of the pipelined operation mode
- * is available below.
- * Ver 1.2 Jan 1 96 Eliminated pipelined mode race condition.
- * Added pipeline read mode. As a result, restores
- * are now as fast as backups.
- * Optimized shared interface behavior. The new behavior
- * typically results in better IDE bus efficiency and
- * higher tape throughput.
- * Pre-calculation of the expected read/write request
- * service time, based on the tape's parameters. In
- * the pipelined operation mode, this allows us to
- * adjust our polling frequency to a much lower value,
- * and thus to dramatically reduce our load on Linux,
- * without any decrease in performance.
- * Implemented additional mtio.h operations.
- * The recommended user block size is returned by
- * the MTIOCGET ioctl.
- * Additional minor changes.
- * Ver 1.3 Feb 9 96 Fixed pipelined read mode bug which prevented the
- * use of some block sizes during a restore procedure.
- * The character device interface will now present a
- * continuous view of the media - any mix of block sizes
- * during a backup/restore procedure is supported. The
- * driver will buffer the requests internally and
- * convert them to the tape's recommended transfer
- * unit, making performance almost independent of the
- * chosen user block size.
- * Some improvements in error recovery.
- * By cooperating with ide-dma.c, bus mastering DMA can
- * now sometimes be used with IDE tape drives as well.
- * Bus mastering DMA has the potential to dramatically
- * reduce the CPU's overhead when accessing the device,
- * and can be enabled by using hdparm -d1 on the tape's
- * block device interface. For more info, read the
- * comments in ide-dma.c.
- * Ver 1.4 Mar 13 96 Fixed serialize support.
- * Ver 1.5 Apr 12 96 Fixed shared interface operation, broken in 1.3.85.
- * Fixed pipelined read mode inefficiency.
- * Fixed nasty null dereferencing bug.
- * Ver 1.6 Aug 16 96 Fixed FPU usage in the driver.
- * Fixed end of media bug.
- * Ver 1.7 Sep 10 96 Minor changes for the CONNER CTT8000-A model.
- * Ver 1.8 Sep 26 96 Attempt to find a better balance between good
- * interactive response and high system throughput.
- * Ver 1.9 Nov 5 96 Automatically cross encountered filemarks rather
- * than requiring an explicit FSF command.
- * Abort pending requests at end of media.
- * MTTELL was sometimes returning incorrect results.
- * Return the real block size in the MTIOCGET ioctl.
- * Some error recovery bug fixes.
- * Ver 1.10 Nov 5 96 Major reorganization.
- * Reduced CPU overhead a bit by eliminating internal
- * bounce buffers.
- * Added module support.
- * Added multiple tape drives support.
- * Added partition support.
- * Rewrote DSC handling.
- * Some portability fixes.
- * Removed ide-tape.h.
- * Additional minor changes.
- * Ver 1.11 Dec 2 96 Bug fix in previous DSC timeout handling.
- * Use ide_stall_queue() for DSC overlap.
- * Use the maximum speed rather than the current speed
- * to compute the request service time.
- * Ver 1.12 Dec 7 97 Fix random memory overwriting and/or last block data
- * corruption, which could occur if the total number
- * of bytes written to the tape was not an integral
- * number of tape blocks.
- * Add support for INTERRUPT DRQ devices.
- * Ver 1.13 Jan 2 98 Add "speed == 0" work-around for HP COLORADO 5GB
- * Ver 1.14 Dec 30 98 Partial fixes for the Sony/AIWA tape drives.
- * Replace cli()/sti() with hwgroup spinlocks.
- * Ver 1.15 Mar 25 99 Fix SMP race condition by replacing hwgroup
- * spinlock with private per-tape spinlock.
- * Ver 1.16 Sep 1 99 Add OnStream tape support.
- * Abort read pipeline on EOD.
- * Wait for the tape to become ready in case it returns
- * "in the process of becoming ready" on open().
- * Fix zero padding of the last written block in
- * case the tape block size is larger than PAGE_SIZE.
- * Decrease the default disconnection time to tn.
- * Ver 1.16e Oct 3 99 Minor fixes.
- * Ver 1.16e1 Oct 13 99 Patches by Arnold Niessen,
- * niessen@iae.nl / arnold.niessen@philips.com
- * GO-1) Undefined code in idetape_read_position
- * according to Gadi's email
- * AJN-1) Minor fix asc == 11 should be asc == 0x11
- * in idetape_issue_packet_command (did effect
- * debugging output only)
- * AJN-2) Added more debugging output, and
- * added ide-tape: where missing. I would also
- * like to add tape->name where possible
- * AJN-3) Added different debug_level's
- * via /proc/ide/hdc/settings
- * "debug_level" determines amount of debugging output;
- * can be changed using /proc/ide/hdx/settings
- * 0 : almost no debugging output
- * 1 : 0+output errors only
- * 2 : 1+output all sensekey/asc
- * 3 : 2+follow all chrdev related procedures
- * 4 : 3+follow all procedures
- * 5 : 4+include pc_stack rq_stack info
- * 6 : 5+USE_COUNT updates
- * AJN-4) Fixed timeout for retension in idetape_queue_pc_tail
- * from 5 to 10 minutes
- * AJN-5) Changed maximum number of blocks to skip when
- * reading tapes with multiple consecutive write
- * errors from 100 to 1000 in idetape_get_logical_blk
- * Proposed changes to code:
- * 1) output "logical_blk_num" via /proc
- * 2) output "current_operation" via /proc
- * 3) Either solve or document the fact that `mt rewind' is
- * required after reading from /dev/nhtx to be
- * able to rmmod the idetape module;
- * Also, sometimes an application finishes but the
- * device remains `busy' for some time. Same cause ?
- * Proposed changes to release-notes:
- * 4) write a simple `quickstart' section in the
- * release notes; I volunteer if you don't want to
- * 5) include a pointer to video4linux in the doc
- * to stimulate video applications
- * 6) release notes lines 331 and 362: explain what happens
- * if the application data rate is higher than 1100 KB/s;
- * similar approach to lower-than-500 kB/s ?
- * 7) 6.6 Comparison; wouldn't it be better to allow different
- * strategies for read and write ?
- * Wouldn't it be better to control the tape buffer
- * contents instead of the bandwidth ?
- * 8) line 536: replace will by would (if I understand
- * this section correctly, a hypothetical and unwanted situation
- * is being described)
- * Ver 1.16f Dec 15 99 Change place of the secondary OnStream header frames.
- * Ver 1.17 Nov 2000 / Jan 2001 Marcel Mol, marcel@mesa.nl
- * - Add idetape_onstream_mode_sense_tape_parameter_page
- * function to get tape capacity in frames: tape->capacity.
- * - Add support for DI-50 drives( or any DI- drive).
- * - 'workaround' for read error/blank block around block 3000.
- * - Implement Early warning for end of media for Onstream.
- * - Cosmetic code changes for readability.
- * - Idetape_position_tape should not use SKIP bit during
- * Onstream read recovery.
- * - Add capacity, logical_blk_num and first/last_frame_position
- * to /proc/ide/hd?/settings.
- * - Module use count was gone in the Linux 2.4 driver.
- * Ver 1.17a Apr 2001 Willem Riede osst@riede.org
- * - Get drive's actual block size from mode sense block descriptor
- * - Limit size of pipeline
- * Ver 1.17b Oct 2002 Alan Stern <stern@rowland.harvard.edu>
- * Changed IDETAPE_MIN_PIPELINE_STAGES to 1 and actually used
- * it in the code!
- * Actually removed aborted stages in idetape_abort_pipeline
- * instead of just changing the command code.
- * Made the transfer byte count for Request Sense equal to the
- * actual length of the data transfer.
- * Changed handling of partial data transfers: they do not
- * cause DMA errors.
- * Moved initiation of DMA transfers to the correct place.
- * Removed reference to unallocated memory.
- * Made __idetape_discard_read_pipeline return the number of
- * sectors skipped, not the number of stages.
- * Replaced errant kfree() calls with __idetape_kfree_stage().
- * Fixed off-by-one error in testing the pipeline length.
- * Fixed handling of filemarks in the read pipeline.
- * Small code optimization for MTBSF and MTBSFM ioctls.
- * Don't try to unlock the door during device close if is
- * already unlocked!
- * Cosmetic fixes to miscellaneous debugging output messages.
- * Set the minimum /proc/ide/hd?/settings values for "pipeline",
- * "pipeline_min", and "pipeline_max" to 1.
- */
diff --git a/Documentation/ide/changelogs.rst b/Documentation/ide/changelogs.rst
deleted file mode 100644
index fdf9d0fb8027..000000000000
--- a/Documentation/ide/changelogs.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Changelog for ide cd
---------------------
-
- .. include:: ChangeLog.ide-cd.1994-2004
- :literal:
-
-Changelog for ide floppy
-------------------------
-
- .. include:: ChangeLog.ide-floppy.1996-2002
- :literal:
-
-Changelog for ide tape
-----------------------
-
- .. include:: ChangeLog.ide-tape.1995-2002
- :literal:
diff --git a/Documentation/ide/ide-tape.rst b/Documentation/ide/ide-tape.rst
deleted file mode 100644
index 3e061d9c0e38..000000000000
--- a/Documentation/ide/ide-tape.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-===============================
-IDE ATAPI streaming tape driver
-===============================
-
-This driver is a part of the Linux ide driver.
-
-The driver, in co-operation with ide.c, basically traverses the
-request-list for the block device interface. The character device
-interface, on the other hand, creates new requests, adds them
-to the request-list of the block device, and waits for their completion.
-
-The block device major and minor numbers are determined from the
-tape's relative position in the ide interfaces, as explained in ide.c.
-
-The character device interface consists of the following devices::
-
- ht0 major 37, minor 0 first IDE tape, rewind on close.
- ht1 major 37, minor 1 second IDE tape, rewind on close.
- ...
- nht0 major 37, minor 128 first IDE tape, no rewind on close.
- nht1 major 37, minor 129 second IDE tape, no rewind on close.
- ...
-
-The general magnetic tape commands compatible interface, as defined by
-include/linux/mtio.h, is accessible through the character device.
-
-General ide driver configuration options, such as the interrupt-unmask
-flag, can be configured by issuing an ioctl to the block device interface,
-as any other ide device.
-
-Our own ide-tape ioctl's can be issued to either the block device or
-the character device interface.
-
-Maximal throughput with minimal bus load will usually be achieved in the
-following scenario:
-
- 1. ide-tape is operating in the pipelined operation mode.
- 2. No buffering is performed by the user backup program.
-
-Testing was done with a 2 GB CONNER CTMA 4000 IDE ATAPI Streaming Tape Drive.
-
-Here are some words from the first releases of hd.c, which are quoted
-in ide.c and apply here as well:
-
-* Special care is recommended. Have Fun!
-
-Possible improvements
-=====================
-
-1. Support for the ATAPI overlap protocol.
-
-In order to maximize bus throughput, we currently use the DSC
-overlap method which enables ide.c to service requests from the
-other device while the tape is busy executing a command. The
-DSC overlap method involves polling the tape's status register
-for the DSC bit, and servicing the other device while the tape
-isn't ready.
-
-In the current QIC development standard (December 1995),
-it is recommended that new tape drives will *in addition*
-implement the ATAPI overlap protocol, which is used for the
-same purpose - efficient use of the IDE bus, but is interrupt
-driven and thus has much less CPU overhead.
-
-ATAPI overlap is likely to be supported in most new ATAPI
-devices, including new ATAPI cdroms, and thus provides us
-a method by which we can achieve higher throughput when
-sharing a (fast) ATA-2 disk with any (slow) new ATAPI device.
diff --git a/Documentation/ide/ide.rst b/Documentation/ide/ide.rst
deleted file mode 100644
index 88bdcba92f7d..000000000000
--- a/Documentation/ide/ide.rst
+++ /dev/null
@@ -1,265 +0,0 @@
-============================================
-Information regarding the Enhanced IDE drive
-============================================
-
- The hdparm utility can be used to control various IDE features on a
- running system. It is packaged separately. Please Look for it on popular
- linux FTP sites.
-
--------------------------------------------------------------------------------
-
-.. important::
-
- BUGGY IDE CHIPSETS CAN CORRUPT DATA!!
-
- PCI versions of the CMD640 and RZ1000 interfaces are now detected
- automatically at startup when PCI BIOS support is configured.
-
- Linux disables the "prefetch" ("readahead") mode of the RZ1000
- to prevent data corruption possible due to hardware design flaws.
-
- For the CMD640, linux disables "IRQ unmasking" (hdparm -u1) on any
- drive for which the "prefetch" mode of the CMD640 is turned on.
- If "prefetch" is disabled (hdparm -p8), then "IRQ unmasking" can be
- used again.
-
- For the CMD640, linux disables "32bit I/O" (hdparm -c1) on any drive
- for which the "prefetch" mode of the CMD640 is turned off.
- If "prefetch" is enabled (hdparm -p9), then "32bit I/O" can be
- used again.
-
- The CMD640 is also used on some Vesa Local Bus (VLB) cards, and is *NOT*
- automatically detected by Linux. For safe, reliable operation with such
- interfaces, one *MUST* use the "cmd640.probe_vlb" kernel option.
-
- Use of the "serialize" option is no longer necessary.
-
--------------------------------------------------------------------------------
-
-Common pitfalls
-===============
-
-- 40-conductor IDE cables are capable of transferring data in DMA modes up to
- udma2, but no faster.
-
-- If possible devices should be attached to separate channels if they are
- available. Typically the disk on the first and CD-ROM on the second.
-
-- If you mix devices on the same cable, please consider using similar devices
- in respect of the data transfer mode they support.
-
-- Even better try to stick to the same vendor and device type on the same
- cable.
-
-This is the multiple IDE interface driver, as evolved from hd.c
-===============================================================
-
-It supports up to 9 IDE interfaces per default, on one or more IRQs (usually
-14 & 15). There can be up to two drives per interface, as per the ATA-6 spec.::
-
- Primary: ide0, port 0x1f0; major=3; hda is minor=0; hdb is minor=64
- Secondary: ide1, port 0x170; major=22; hdc is minor=0; hdd is minor=64
- Tertiary: ide2, port 0x1e8; major=33; hde is minor=0; hdf is minor=64
- Quaternary: ide3, port 0x168; major=34; hdg is minor=0; hdh is minor=64
- fifth.. ide4, usually PCI, probed
- sixth.. ide5, usually PCI, probed
-
-To access devices on interfaces > ide0, please make sure that device files
-for them are present in /dev. If not, please create such entries by using
-/dev/MAKEDEV.
-
-This driver automatically probes for most IDE interfaces (including all PCI
-ones), for the drives/geometries attached to those interfaces, and for the IRQ
-lines being used by the interfaces (normally 14, 15 for ide0/ide1).
-
-Any number of interfaces may share a single IRQ if necessary, at a slight
-performance penalty, whether on separate cards or a single VLB card.
-The IDE driver automatically detects and handles this. However, this may
-or may not be harmful to your hardware.. two or more cards driving the same IRQ
-can potentially burn each other's bus driver, though in practice this
-seldom occurs. Be careful, and if in doubt, don't do it!
-
-Drives are normally found by auto-probing and/or examining the CMOS/BIOS data.
-For really weird situations, the apparent (fdisk) geometry can also be specified
-on the kernel "command line" using LILO. The format of such lines is::
-
- ide_core.chs=[interface_number.device_number]:cyls,heads,sects
-
-or::
-
- ide_core.cdrom=[interface_number.device_number]
-
-For example::
-
- ide_core.chs=1.0:1050,32,64 ide_core.cdrom=1.1
-
-The results of successful auto-probing may override the physical geometry/irq
-specified, though the "original" geometry may be retained as the "logical"
-geometry for partitioning purposes (fdisk).
-
-If the auto-probing during boot time confuses a drive (ie. the drive works
-with hd.c but not with ide.c), then a command line option may be specified
-for each drive for which you'd like the drive to skip the hardware
-probe/identification sequence. For example::
-
- ide_core.noprobe=0.1
-
-or::
-
- ide_core.chs=1.0:768,16,32
- ide_core.noprobe=1.0
-
-Note that when only one IDE device is attached to an interface, it should be
-jumpered as "single" or "master", *not* "slave". Many folks have had
-"trouble" with cdroms because of this requirement, so the driver now probes
-for both units, though success is more likely when the drive is jumpered
-correctly.
-
-Courtesy of Scott Snyder and others, the driver supports ATAPI cdrom drives
-such as the NEC-260 and the new MITSUMI triple/quad speed drives.
-Such drives will be identified at boot time, just like a hard disk.
-
-If for some reason your cdrom drive is *not* found at boot time, you can force
-the probe to look harder by supplying a kernel command line parameter
-via LILO, such as::
-
- ide_core.cdrom=1.0 /* "master" on second interface (hdc) */
-
-or::
-
- ide_core.cdrom=1.1 /* "slave" on second interface (hdd) */
-
-For example, a GW2000 system might have a hard drive on the primary
-interface (/dev/hda) and an IDE cdrom drive on the secondary interface
-(/dev/hdc). To mount a CD in the cdrom drive, one would use something like::
-
- ln -sf /dev/hdc /dev/cdrom
- mkdir /mnt/cdrom
- mount /dev/cdrom /mnt/cdrom -t iso9660 -o ro
-
-If, after doing all of the above, mount doesn't work and you see
-errors from the driver (with dmesg) complaining about `status=0xff`,
-this means that the hardware is not responding to the driver's attempts
-to read it. One of the following is probably the problem:
-
- - Your hardware is broken.
-
- - You are using the wrong address for the device, or you have the
- drive jumpered wrong. Review the configuration instructions above.
-
- - Your IDE controller requires some nonstandard initialization sequence
- before it will work properly. If this is the case, there will often
- be a separate MS-DOS driver just for the controller. IDE interfaces
- on sound cards usually fall into this category. Such configurations
- can often be made to work by first booting MS-DOS, loading the
- appropriate drivers, and then warm-booting linux (without powering
- off). This can be automated using loadlin in the MS-DOS autoexec.
-
-If you always get timeout errors, interrupts from the drive are probably
-not making it to the host. Check how you have the hardware jumpered
-and make sure it matches what the driver expects (see the configuration
-instructions above). If you have a PCI system, also check the BIOS
-setup; I've had one report of a system which was shipped with IRQ 15
-disabled by the BIOS.
-
-The kernel is able to execute binaries directly off of the cdrom,
-provided it is mounted with the default block size of 1024 (as above).
-
-Please pass on any feedback on any of this stuff to the maintainer,
-whose address can be found in linux/MAINTAINERS.
-
-The IDE driver is modularized. The high level disk/CD-ROM/tape/floppy
-drivers can always be compiled as loadable modules, the chipset drivers
-can only be compiled into the kernel, and the core code (ide.c) can be
-compiled as a loadable module provided no chipset support is needed.
-
-When using ide.c as a module in combination with kmod, add::
-
- alias block-major-3 ide-probe
-
-to a configuration file in /etc/modprobe.d/.
-
-When ide.c is used as a module, you can pass command line parameters to the
-driver using the "options=" keyword to insmod, while replacing any ',' with
-';'.
-
-
-Summary of ide driver parameters for kernel command line
-========================================================
-
-For legacy IDE VLB host drivers (ali14xx/dtc2278/ht6560b/qd65xx/umc8672)
-you need to explicitly enable probing by using "probe" kernel parameter,
-i.e. to enable probing for ALI M14xx chipsets (ali14xx host driver) use:
-
-* "ali14xx.probe" boot option when ali14xx driver is built-in the kernel
-
-* "probe" module parameter when ali14xx driver is compiled as module
- ("modprobe ali14xx probe")
-
-Also for legacy CMD640 host driver (cmd640) you need to use "probe_vlb"
-kernel parameter to enable probing for VLB version of the chipset (PCI ones
-are detected automatically).
-
-You also need to use "probe" kernel parameter for ide-4drives driver
-(support for IDE generic chipset with four drives on one port).
-
-To enable support for IDE doublers on Amiga use "doubler" kernel parameter
-for gayle host driver (i.e. "gayle.doubler" if the driver is built-in).
-
-To force ignoring cable detection (this should be needed only if you're using
-short 40-wires cable which cannot be automatically detected - if this is not
-a case please report it as a bug instead) use "ignore_cable" kernel parameter:
-
-* "ide_core.ignore_cable=[interface_number]" boot option if IDE is built-in
- (i.e. "ide_core.ignore_cable=1" to force ignoring cable for "ide1")
-
-* "ignore_cable=[interface_number]" module parameter (for ide_core module)
- if IDE is compiled as module
-
-Other kernel parameters for ide_core are:
-
-* "nodma=[interface_number.device_number]" to disallow DMA for a device
-
-* "noflush=[interface_number.device_number]" to disable flush requests
-
-* "nohpa=[interface_number.device_number]" to disable Host Protected Area
-
-* "noprobe=[interface_number.device_number]" to skip probing
-
-* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit
-
-* "cdrom=[interface_number.device_number]" to force device as a CD-ROM
-
-* "chs=[interface_number.device_number]" to force device as a disk (using CHS)
-
-
-Some Terminology
-================
-
-IDE
- Integrated Drive Electronics, meaning that each drive has a built-in
- controller, which is why an "IDE interface card" is not a "controller card".
-
-ATA
- AT (the old IBM 286 computer) Attachment Interface, a draft American
- National Standard for connecting hard drives to PCs. This is the official
- name for "IDE".
-
- The latest standards define some enhancements, known as the ATA-6 spec,
- which grew out of vendor-specific "Enhanced IDE" (EIDE) implementations.
-
-ATAPI
- ATA Packet Interface, a new protocol for controlling the drives,
- similar to SCSI protocols, created at the same time as the ATA2 standard.
- ATAPI is currently used for controlling CDROM, TAPE and FLOPPY (ZIP or
- LS120/240) devices, removable R/W cartridges, and for high capacity hard disk
- drives.
-
-mlord@pobox.com
-
-
-Wed Apr 17 22:52:44 CEST 2002 edited by Marcin Dalecki, the current
-maintainer.
-
-Wed Aug 20 22:31:29 CEST 2003 updated ide boot options to current ide.c
-comments at 2.6.0-test4 time. Maciej Soltysiak <solt@dns.toxicfilms.tv>
diff --git a/Documentation/ide/index.rst b/Documentation/ide/index.rst
deleted file mode 100644
index 813dfe611a31..000000000000
--- a/Documentation/ide/index.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==================================
-Integrated Drive Electronics (IDE)
-==================================
-
-.. toctree::
- :maxdepth: 1
-
- ide
- ide-tape
- warm-plug-howto
-
- changelogs
-
-.. only:: subproject and html
-
- Indices
- =======
-
- * :ref:`genindex`
diff --git a/Documentation/ide/warm-plug-howto.rst b/Documentation/ide/warm-plug-howto.rst
deleted file mode 100644
index c245242ef2f1..000000000000
--- a/Documentation/ide/warm-plug-howto.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-===================
-IDE warm-plug HOWTO
-===================
-
-To warm-plug devices on a port 'idex'::
-
- # echo -n "1" > /sys/class/ide_port/idex/delete_devices
-
-unplug old device(s) and plug new device(s)::
-
- # echo -n "1" > /sys/class/ide_port/idex/scan
-
-done
-
-NOTE: please make sure that partitions are unmounted and that there are
-no other active references to devices before doing "delete_devices" step,
-also do not attempt "scan" step on devices currently in use -- otherwise
-results may be unpredictable and lead to data loss if you're unlucky
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 1988c19d9daf..062cf88c2216 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -103,7 +103,6 @@ needed).
block/index
cdrom/index
cpu-freq/index
- ide/index
fb/index
fpga/index
hid/index
@@ -169,7 +168,6 @@ to ReStructured Text format, or are simply too old.
tools/index
staging/index
- watch_queue
Translations
diff --git a/Documentation/input/devices/atarikbd.rst b/Documentation/input/devices/atarikbd.rst
index 745e7a1ff122..0c4c7804ccb2 100644
--- a/Documentation/input/devices/atarikbd.rst
+++ b/Documentation/input/devices/atarikbd.rst
@@ -288,7 +288,7 @@ between 0 and large positive numbers. Excess motion below 0 is ignored. The
command sets the maximum positive value that can be attained in the scaled
coordinate system. Motion beyond that value is also ignored.
-SET MOUSE KEYCODE MOSE
+SET MOUSE KEYCODE MODE
----------------------
::
@@ -333,7 +333,7 @@ occur before the internally maintained coordinate is changed by one
(independently scaled for each axis). Remember that the mouse position
information is available only by interrogating the ikbd in the ABSOLUTE MOUSE
POSITIONING mode unless the ikbd has been commanded to report on button press
-or release (see SET MOSE BUTTON ACTION).
+or release (see SET MOUSE BUTTON ACTION).
INTERROGATE MOUSE POSITION
--------------------------
diff --git a/Documentation/input/devices/ntrig.rst b/Documentation/input/devices/ntrig.rst
index a6b22ce6c61c..1559f53495cb 100644
--- a/Documentation/input/devices/ntrig.rst
+++ b/Documentation/input/devices/ntrig.rst
@@ -32,7 +32,7 @@ The following parameters are used to configure filters to reduce noise:
|activation_height, |size threshold to activate immediately |
|activation_width | |
+-----------------------+-----------------------------------------------------+
-|min_height, |size threshold bellow which fingers are ignored |
+|min_height, |size threshold below which fingers are ignored |
|min_width |both to decide activation and during activity |
+-----------------------+-----------------------------------------------------+
|deactivate_slack |the number of "no contact" frames to ignore before |
diff --git a/Documentation/kbuild/reproducible-builds.rst b/Documentation/kbuild/reproducible-builds.rst
index 3b25655e441b..071f0151a7a4 100644
--- a/Documentation/kbuild/reproducible-builds.rst
+++ b/Documentation/kbuild/reproducible-builds.rst
@@ -99,10 +99,10 @@ unreproducible parts can be treated as sources:
Structure randomisation
-----------------------
-If you enable ``CONFIG_GCC_PLUGIN_RANDSTRUCT``, you will need to
-pre-generate the random seed in
-``scripts/gcc-plugins/randomize_layout_seed.h`` so the same value
-is used in rebuilds.
+If you enable ``CONFIG_RANDSTRUCT``, you will need to pre-generate
+the random seed in ``scripts/basic/randstruct.seed`` so the same
+value is used by each build. See ``scripts/gen-randstruct-seed.sh``
+for details.
Debug info conflicts
--------------------
diff --git a/Documentation/kernel-hacking/hacking.rst b/Documentation/kernel-hacking/hacking.rst
index 55bd37a2efb0..ebd9d90882ea 100644
--- a/Documentation/kernel-hacking/hacking.rst
+++ b/Documentation/kernel-hacking/hacking.rst
@@ -112,8 +112,7 @@ time, although different tasklets can run simultaneously.
.. warning::
The name 'tasklet' is misleading: they have nothing to do with
- 'tasks', and probably more to do with some bad vodka Alexey
- Kuznetsov had at the time.
+ 'tasks'.
You can tell you are in a softirq (or tasklet) using the
:c:func:`in_softirq()` macro (``include/linux/preempt.h``).
@@ -290,8 +289,8 @@ userspace.
Unlike :c:func:`put_user()` and :c:func:`get_user()`, they
return the amount of uncopied data (ie. 0 still means success).
-[Yes, this moronic interface makes me cringe. The flamewar comes up
-every year or so. --RR.]
+[Yes, this objectionable interface makes me cringe. The flamewar comes
+up every year or so. --RR.]
The functions may sleep implicitly. This should never be called outside
user context (it makes no sense), with interrupts disabled, or a
@@ -645,8 +644,9 @@ names in development kernels; this is not done just to keep everyone on
their toes: it reflects a fundamental change (eg. can no longer be
called with interrupts on, or does extra checks, or doesn't do checks
which were caught before). Usually this is accompanied by a fairly
-complete note to the linux-kernel mailing list; search the archive.
-Simply doing a global replace on the file usually makes things **worse**.
+complete note to the appropriate kernel development mailing list; search
+the archives. Simply doing a global replace on the file usually makes
+things **worse**.
Initializing structure members
------------------------------
@@ -723,14 +723,14 @@ Putting Your Stuff in the Kernel
In order to get your stuff into shape for official inclusion, or even to
make a neat patch, there's administrative work to be done:
-- Figure out whose pond you've been pissing in. Look at the top of the
- source files, inside the ``MAINTAINERS`` file, and last of all in the
- ``CREDITS`` file. You should coordinate with this person to make sure
- you're not duplicating effort, or trying something that's already
- been rejected.
+- Figure out who are the owners of the code you've been modifying. Look
+ at the top of the source files, inside the ``MAINTAINERS`` file, and
+ last of all in the ``CREDITS`` file. You should coordinate with these
+ people to make sure you're not duplicating effort, or trying something
+ that's already been rejected.
- Make sure you put your name and EMail address at the top of any files
- you create or mangle significantly. This is the first place people
+ Make sure you put your name and email address at the top of any files
+ you create or modify significantly. This is the first place people
will look when they find a bug, or when **they** want to make a change.
- Usually you want a configuration option for your kernel hack. Edit
@@ -748,11 +748,11 @@ make a neat patch, there's administrative work to be done:
can usually just add a "obj-$(CONFIG_xxx) += xxx.o" line. The syntax
is documented in ``Documentation/kbuild/makefiles.rst``.
-- Put yourself in ``CREDITS`` if you've done something noteworthy,
- usually beyond a single file (your name should be at the top of the
- source files anyway). ``MAINTAINERS`` means you want to be consulted
- when changes are made to a subsystem, and hear about bugs; it implies
- a more-than-passing commitment to some part of the code.
+- Put yourself in ``CREDITS`` if you consider what you've done
+ noteworthy, usually beyond a single file (your name should be at the
+ top of the source files anyway). ``MAINTAINERS`` means you want to be
+ consulted when changes are made to a subsystem, and hear about bugs;
+ it implies a more-than-passing commitment to some part of the code.
- Finally, don't forget to read
``Documentation/process/submitting-patches.rst`` and possibly
diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst
index 4cbd50edf277..6805ae6e86e6 100644
--- a/Documentation/kernel-hacking/locking.rst
+++ b/Documentation/kernel-hacking/locking.rst
@@ -941,8 +941,7 @@ lock.
A classic problem here is when you provide callbacks or hooks: if you
call these with the lock held, you risk simple deadlock, or a deadly
-embrace (who knows what the callback will do?). Remember, the other
-programmers are out to get you, so don't do this.
+embrace (who knows what the callback will do?).
Overzealous Prevention Of Deadlocks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -952,8 +951,6 @@ grabs a read lock, searches a list, fails to find what it wants, drops
the read lock, grabs a write lock and inserts the object has a race
condition.
-If you don't see why, please stay away from my code.
-
Racing Timers: A Kernel Pastime
-------------------------------
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
index 49549aab41b4..feb257b7f350 100644
--- a/Documentation/power/energy-model.rst
+++ b/Documentation/power/energy-model.rst
@@ -123,6 +123,26 @@ allows a platform to register EM power values which are reflecting total power
(static + dynamic). These power values might be coming directly from
experiments and measurements.
+Registration of 'artificial' EM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There is an option to provide a custom callback for drivers missing detailed
+knowledge about power value for each performance state. The callback
+.get_cost() is optional and provides the 'cost' values used by the EAS.
+This is useful for platforms that only provide information on relative
+efficiency between CPU types, where one could use the information to
+create an abstract power model. But even an abstract power model can
+sometimes be hard to fit in, given the input power value size restrictions.
+The .get_cost() allows to provide the 'cost' values which reflect the
+efficiency of the CPUs. This would allow to provide EAS information which
+has different relation than what would be forced by the EM internal
+formulas calculating 'cost' values. To register an EM for such platform, the
+driver must set the flag 'milliwatts' to 0, provide .get_power() callback
+and provide .get_cost() callback. The EM framework would handle such platform
+properly during registration. A flag EM_PERF_DOMAIN_ARTIFICIAL is set for such
+platform. Special care should be taken by other frameworks which are using EM
+to test and treat this flag properly.
+
Registration of 'simple' EM
~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -181,8 +201,8 @@ EM framework::
-> drivers/cpufreq/foo_cpufreq.c
- 01 static int est_power(unsigned long *mW, unsigned long *KHz,
- 02 struct device *dev)
+ 01 static int est_power(struct device *dev, unsigned long *mW,
+ 02 unsigned long *KHz)
03 {
04 long freq, power;
05
diff --git a/Documentation/process/3.Early-stage.rst b/Documentation/process/3.Early-stage.rst
index 6bfd60d77d1a..894a920041c6 100644
--- a/Documentation/process/3.Early-stage.rst
+++ b/Documentation/process/3.Early-stage.rst
@@ -154,10 +154,11 @@ that the kernel developers have added a script to ease the process:
This script will return the current maintainer(s) for a given file or
directory when given the "-f" option. If passed a patch on the
command line, it will list the maintainers who should probably receive
-copies of the patch. There are a number of options regulating how hard
-get_maintainer.pl will search for maintainers; please be careful about
-using the more aggressive options as you may end up including developers
-who have no real interest in the code you are modifying.
+copies of the patch. This is the preferred way (unlike "-f" option) to get the
+list of people to Cc for your patches. There are a number of options
+regulating how hard get_maintainer.pl will search for maintainers; please be
+careful about using the more aggressive options as you may end up including
+developers who have no real interest in the code you are modifying.
If all else fails, talking to Andrew Morton can be an effective way to
track down a maintainer for a specific piece of code.
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index a337e8eabfe1..34415ae1af1b 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -7,7 +7,7 @@ Intro
=====
This document is designed to provide a list of the minimum levels of
-software necessary to run the 4.x kernels.
+software necessary to run the current kernel version.
This document is originally based on my "Changes" file for 2.0.x kernels
and therefore owes credit to the same people as that file (Jared Mauch,
@@ -56,6 +56,7 @@ iptables 1.4.2 iptables -V
openssl & libcrypto 1.0.0 openssl version
bc 1.06.95 bc --version
Sphinx\ [#f1]_ 1.7 sphinx-build --version
+cpio any cpio --version
====================== =============== ========================================
.. [#f1] Sphinx is needed only to build the Kernel documentation
@@ -458,6 +459,11 @@ mcelog
- <http://www.mcelog.org/>
+cpio
+----
+
+- <https://www.gnu.org/software/cpio/>
+
Networking
**********
diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst
index c74f4a81588b..572a3289c9cb 100644
--- a/Documentation/process/maintainer-tip.rst
+++ b/Documentation/process/maintainer-tip.rst
@@ -437,6 +437,20 @@ in a private repository which allows interested people to easily pull the
series for testing. The usual way to offer this is a git URL in the cover
letter of the patch series.
+Testing
+^^^^^^^
+
+Code should be tested before submitting to the tip maintainers. Anything
+other than minor changes should be built, booted and tested with
+comprehensive (and heavyweight) kernel debugging options enabled.
+
+These debugging options can be found in kernel/configs/x86_debug.config
+and can be added to an existing kernel config by running:
+
+ make x86_debug.config
+
+Some of these options are x86-specific and can be left out when testing
+on other architectures.
Coding style notes
------------------
diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
index fb496b2ebfd3..a1cb6280fbcf 100644
--- a/Documentation/process/submitting-patches.rst
+++ b/Documentation/process/submitting-patches.rst
@@ -77,7 +77,7 @@ as you intend it to.
The maintainer will thank you if you write your patch description in a
form which can be easily pulled into Linux's source code management
-system, ``git``, as a "commit log". See :ref:`explicit_in_reply_to`.
+system, ``git``, as a "commit log". See :ref:`the_canonical_patch_format`.
Solve only one problem per patch. If your description starts to get
long, that's a sign that you probably need to split up your patch.
@@ -227,9 +227,10 @@ Select the recipients for your patch
You should always copy the appropriate subsystem maintainer(s) on any patch
to code that they maintain; look through the MAINTAINERS file and the
source code revision history to see who those maintainers are. The
-script scripts/get_maintainer.pl can be very useful at this step. If you
-cannot find a maintainer for the subsystem you are working on, Andrew
-Morton (akpm@linux-foundation.org) serves as a maintainer of last resort.
+script scripts/get_maintainer.pl can be very useful at this step (pass paths to
+your patches as arguments to scripts/get_maintainer.pl). If you cannot find a
+maintainer for the subsystem you are working on, Andrew Morton
+(akpm@linux-foundation.org) serves as a maintainer of last resort.
You should also normally choose at least one mailing list to receive a copy
of your patch set. linux-kernel@vger.kernel.org should be used by default
@@ -318,7 +319,10 @@ understands what is going on.
Be sure to tell the reviewers what changes you are making and to thank them
for their time. Code review is a tiring and time-consuming process, and
reviewers sometimes get grumpy. Even in that case, though, respond
-politely and address the problems they have pointed out.
+politely and address the problems they have pointed out. When sending a next
+version, add a ``patch changelog`` to the cover letter or to individual patches
+explaining difference against previous submission (see
+:ref:`the_canonical_patch_format`).
See Documentation/process/email-clients.rst for recommendations on email
clients and mailing list etiquette.
diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst
index dd9b99a025f7..03c062915998 100644
--- a/Documentation/scheduler/sched-stats.rst
+++ b/Documentation/scheduler/sched-stats.rst
@@ -56,9 +56,9 @@ Next two are try_to_wake_up() statistics:
Next three are statistics describing scheduling latency:
- 7) sum of all time spent running by tasks on this processor (in jiffies)
+ 7) sum of all time spent running by tasks on this processor (in nanoseconds)
8) sum of all time spent waiting to run by tasks on this processor (in
- jiffies)
+ nanoseconds)
9) # of timeslices run on this cpu
@@ -155,8 +155,8 @@ schedstats also adds a new /proc/<pid>/schedstat file to include some of
the same information on a per-process level. There are three fields in
this file correlating for that process to:
- 1) time spent on the cpu
- 2) time spent waiting on a runqueue
+ 1) time spent on the cpu (in nanoseconds)
+ 2) time spent waiting on a runqueue (in nanoseconds)
3) # of timeslices run on this cpu
A program could be easily written to make use of these extra fields to
diff --git a/Documentation/security/IMA-templates.rst b/Documentation/security/IMA-templates.rst
index 1a91d92950a7..15b4add314fc 100644
--- a/Documentation/security/IMA-templates.rst
+++ b/Documentation/security/IMA-templates.rst
@@ -66,12 +66,13 @@ descriptors by adding their identifier to the format string
calculated with the SHA1 or MD5 hash algorithm;
- 'n': the name of the event (i.e. the file name), with size up to 255 bytes;
- 'd-ng': the digest of the event, calculated with an arbitrary hash
- algorithm (field format: [<hash algo>:]digest, where the digest
- prefix is shown only if the hash algorithm is not SHA1 or MD5);
+ algorithm (field format: <hash algo>:digest);
+ - 'd-ngv2': same as d-ng, but prefixed with the "ima" or "verity" digest type
+ (field format: <digest type>:<hash algo>:digest);
- 'd-modsig': the digest of the event without the appended modsig;
- 'n-ng': the name of the event, without size limitations;
- - 'sig': the file signature, or the EVM portable signature if the file
- signature is not found;
+ - 'sig': the file signature, based on either the file's/fsverity's digest[1],
+ or the EVM portable signature, if 'security.ima' contains a file hash.
- 'modsig' the appended file signature;
- 'buf': the buffer data that was used to generate the hash without size limitations;
- 'evmsig': the EVM portable signature;
@@ -88,7 +89,9 @@ Below, there is the list of defined template descriptors:
- "ima": its format is ``d|n``;
- "ima-ng" (default): its format is ``d-ng|n-ng``;
+ - "ima-ngv2": its format is ``d-ngv2|n-ng``;
- "ima-sig": its format is ``d-ng|n-ng|sig``;
+ - "ima-sigv2": its format is ``d-ngv2|n-ng|sig``;
- "ima-buf": its format is ``d-ng|n-ng|buf``;
- "ima-modsig": its format is ``d-ng|n-ng|sig|d-modsig|modsig``;
- "evm-sig": its format is ``d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode``;
diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index 16335de04e8c..6ed8d2fa6f9e 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -17,3 +17,4 @@ Security Documentation
tpm/index
digsig
landlock
+ secrets/index
diff --git a/Documentation/security/keys/trusted-encrypted.rst b/Documentation/security/keys/trusted-encrypted.rst
index f614dad7de12..0bfb4c339748 100644
--- a/Documentation/security/keys/trusted-encrypted.rst
+++ b/Documentation/security/keys/trusted-encrypted.rst
@@ -35,6 +35,13 @@ safe.
Rooted to Hardware Unique Key (HUK) which is generally burnt in on-chip
fuses and is accessible to TEE only.
+ (3) CAAM (Cryptographic Acceleration and Assurance Module: IP on NXP SoCs)
+
+ When High Assurance Boot (HAB) is enabled and the CAAM is in secure
+ mode, trust is rooted to the OTPMK, a never-disclosed 256-bit key
+ randomly generated and fused into each SoC at manufacturing time.
+ Otherwise, a common fixed test key is used instead.
+
* Execution isolation
(1) TPM
@@ -46,6 +53,10 @@ safe.
Customizable set of operations running in isolated execution
environment verified via Secure/Trusted boot process.
+ (3) CAAM
+
+ Fixed set of operations running in isolated execution environment.
+
* Optional binding to platform integrity state
(1) TPM
@@ -63,6 +74,11 @@ safe.
Relies on Secure/Trusted boot process for platform integrity. It can
be extended with TEE based measured boot process.
+ (3) CAAM
+
+ Relies on the High Assurance Boot (HAB) mechanism of NXP SoCs
+ for platform integrity.
+
* Interfaces and APIs
(1) TPM
@@ -74,10 +90,13 @@ safe.
TEEs have well-documented, standardized client interface and APIs. For
more details refer to ``Documentation/staging/tee.rst``.
+ (3) CAAM
+
+ Interface is specific to silicon vendor.
* Threat model
- The strength and appropriateness of a particular TPM or TEE for a given
+ The strength and appropriateness of a particular trust source for a given
purpose must be assessed when using them to protect security-relevant data.
@@ -87,22 +106,32 @@ Key Generation
Trusted Keys
------------
-New keys are created from random numbers generated in the trust source. They
-are encrypted/decrypted using a child key in the storage key hierarchy.
-Encryption and decryption of the child key must be protected by a strong
-access control policy within the trust source.
+New keys are created from random numbers. They are encrypted/decrypted using
+a child key in the storage key hierarchy. Encryption and decryption of the
+child key must be protected by a strong access control policy within the
+trust source. The random number generator in use differs according to the
+selected trust source:
- * TPM (hardware device) based RNG
+ * TPM: hardware device based RNG
- Strength of random numbers may vary from one device manufacturer to
- another.
+ Keys are generated within the TPM. Strength of random numbers may vary
+ from one device manufacturer to another.
- * TEE (OP-TEE based on Arm TrustZone) based RNG
+ * TEE: OP-TEE based on Arm TrustZone based RNG
RNG is customizable as per platform needs. It can either be direct output
from platform specific hardware RNG or a software based Fortuna CSPRNG
which can be seeded via multiple entropy sources.
+ * CAAM: Kernel RNG
+
+ The normal kernel random number generator is used. To seed it from the
+ CAAM HWRNG, enable CRYPTO_DEV_FSL_CAAM_RNG_API and ensure the device
+ is probed.
+
+Users may override this by specifying ``trusted.rng=kernel`` on the kernel
+command-line to replace the used RNG with the kernel's random number pool.
+
Encrypted Keys
--------------
@@ -189,6 +218,19 @@ Usage::
specific to TEE device implementation. The key length for new keys is always
in bytes. Trusted Keys can be 32 - 128 bytes (256 - 1024 bits).
+Trusted Keys usage: CAAM
+------------------------
+
+Usage::
+
+ keyctl add trusted name "new keylen" ring
+ keyctl add trusted name "load hex_blob" ring
+ keyctl print keyid
+
+"keyctl print" returns an ASCII hex copy of the sealed key, which is in a
+CAAM-specific format. The key length for new keys is always in bytes.
+Trusted Keys can be 32 - 128 bytes (256 - 1024 bits).
+
Encrypted Keys usage
--------------------
diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst
index 3df68cb1d10f..5c77730b4479 100644
--- a/Documentation/security/landlock.rst
+++ b/Documentation/security/landlock.rst
@@ -7,7 +7,7 @@ Landlock LSM: kernel documentation
==================================
:Author: Mickaël Salaün
-:Date: March 2021
+:Date: May 2022
Landlock's goal is to create scoped access-control (i.e. sandboxing). To
harden a whole system, this feature should be available to any process,
@@ -42,6 +42,21 @@ Guiding principles for safe access controls
* Computation related to Landlock operations (e.g. enforcing a ruleset) shall
only impact the processes requesting them.
+Design choices
+==============
+
+Filesystem access rights
+------------------------
+
+All access rights are tied to an inode and what can be accessed through it.
+Reading the content of a directory does not imply being allowed to read the
+content of a listed inode. Indeed, a file name is local to its parent
+directory, and an inode can be referenced by multiple file names thanks to
+(hard) links. Being able to unlink a file only has a direct impact on the
+directory, not the unlinked inode. This is the reason why
+`LANDLOCK_ACCESS_FS_REMOVE_FILE` or `LANDLOCK_ACCESS_FS_REFER` are not allowed
+to be tied to files but only to directories.
+
Tests
=====
diff --git a/Documentation/security/secrets/coco.rst b/Documentation/security/secrets/coco.rst
new file mode 100644
index 000000000000..262e7abb1b24
--- /dev/null
+++ b/Documentation/security/secrets/coco.rst
@@ -0,0 +1,103 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Confidential Computing secrets
+==============================
+
+This document describes how Confidential Computing secret injection is handled
+from the firmware to the operating system, in the EFI driver and the efi_secret
+kernel module.
+
+
+Introduction
+============
+
+Confidential Computing (coco) hardware such as AMD SEV (Secure Encrypted
+Virtualization) allows guest owners to inject secrets into the VM's
+memory without the host/hypervisor being able to read them. In SEV,
+secret injection is performed early in the VM launch process, before the
+guest starts running.
+
+The efi_secret kernel module allows userspace applications to access these
+secrets via securityfs.
+
+
+Secret data flow
+================
+
+The guest firmware may reserve a designated memory area for secret injection,
+and publish its location (base GPA and length) in the EFI configuration table
+under a ``LINUX_EFI_COCO_SECRET_AREA_GUID`` entry
+(``adf956ad-e98c-484c-ae11-b51c7d336447``). This memory area should be marked
+by the firmware as ``EFI_RESERVED_TYPE``, and therefore the kernel should not
+use it for its own purposes.
+
+During the VM's launch, the virtual machine manager may inject a secret to that
+area. In AMD SEV and SEV-ES this is performed using the
+``KVM_SEV_LAUNCH_SECRET`` command (see [sev]_). The structure of the injected
+Guest Owner secret data should be a GUIDed table of secret values; the binary
+format is described in ``drivers/virt/coco/efi_secret/efi_secret.c`` under
+"Structure of the EFI secret area".
+
+On kernel start, the kernel's EFI driver saves the location of the secret area
+(taken from the EFI configuration table) in the ``efi.coco_secret`` field.
+Later it checks if the secret area is populated: it maps the area and checks
+whether its content begins with ``EFI_SECRET_TABLE_HEADER_GUID``
+(``1e74f542-71dd-4d66-963e-ef4287ff173b``). If the secret area is populated,
+the EFI driver will autoload the efi_secret kernel module, which exposes the
+secrets to userspace applications via securityfs. The details of the
+efi_secret filesystem interface are in [secrets-coco-abi]_.
+
+
+Application usage example
+=========================
+
+Consider a guest performing computations on encrypted files. The Guest Owner
+provides the decryption key (= secret) using the secret injection mechanism.
+The guest application reads the secret from the efi_secret filesystem and
+proceeds to decrypt the files into memory and then performs the needed
+computations on the content.
+
+In this example, the host can't read the files from the disk image
+because they are encrypted. Host can't read the decryption key because
+it is passed using the secret injection mechanism (= secure channel).
+Host can't read the decrypted content from memory because it's a
+confidential (memory-encrypted) guest.
+
+Here is a simple example for usage of the efi_secret module in a guest
+to which an EFI secret area with 4 secrets was injected during launch::
+
+ # ls -la /sys/kernel/security/secrets/coco
+ total 0
+ drwxr-xr-x 2 root root 0 Jun 28 11:54 .
+ drwxr-xr-x 3 root root 0 Jun 28 11:54 ..
+ -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b
+ -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6
+ -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2
+ -r--r----- 1 root root 0 Jun 28 11:54 e6f5a162-d67f-4750-a67c-5d065f2a9910
+
+ # hd /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910
+ 00000000 74 68 65 73 65 2d 61 72 65 2d 74 68 65 2d 6b 61 |these-are-the-ka|
+ 00000010 74 61 2d 73 65 63 72 65 74 73 00 01 02 03 04 05 |ta-secrets......|
+ 00000020 06 07 |..|
+ 00000022
+
+ # rm /sys/kernel/security/secrets/coco/e6f5a162-d67f-4750-a67c-5d065f2a9910
+
+ # ls -la /sys/kernel/security/secrets/coco
+ total 0
+ drwxr-xr-x 2 root root 0 Jun 28 11:55 .
+ drwxr-xr-x 3 root root 0 Jun 28 11:54 ..
+ -r--r----- 1 root root 0 Jun 28 11:54 736870e5-84f0-4973-92ec-06879ce3da0b
+ -r--r----- 1 root root 0 Jun 28 11:54 83c83f7f-1356-4975-8b7e-d3a0b54312c6
+ -r--r----- 1 root root 0 Jun 28 11:54 9553f55d-3da2-43ee-ab5d-ff17f78864d2
+
+
+References
+==========
+
+See [sev-api-spec]_ for more info regarding SEV ``LAUNCH_SECRET`` operation.
+
+.. [sev] Documentation/virt/kvm/amd-memory-encryption.rst
+.. [secrets-coco-abi] Documentation/ABI/testing/securityfs-secrets-coco
+.. [sev-api-spec] https://www.amd.com/system/files/TechDocs/55766_SEV-KM_API_Specification.pdf
diff --git a/Documentation/security/secrets/index.rst b/Documentation/security/secrets/index.rst
new file mode 100644
index 000000000000..ced34e9c43bd
--- /dev/null
+++ b/Documentation/security/secrets/index.rst
@@ -0,0 +1,9 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Secrets documentation
+=====================
+
+.. toctree::
+
+ coco
diff --git a/Documentation/sphinx/kerneldoc-preamble.sty b/Documentation/sphinx/kerneldoc-preamble.sty
index 9d0204dc38be..2a29cbe51396 100644
--- a/Documentation/sphinx/kerneldoc-preamble.sty
+++ b/Documentation/sphinx/kerneldoc-preamble.sty
@@ -20,13 +20,13 @@
% - Indent of 2 chars is preserved for ease of comparison.
% Summary of changes from default params:
% Width of page number (\@pnumwidth): 1.55em -> 2.7em
-% Width of chapter number: 1.5em -> 1.8em
-% Indent of section number: 1.5em -> 1.8em
+% Width of chapter number: 1.5em -> 2.4em
+% Indent of section number: 1.5em -> 2.4em
% Width of section number: 2.6em -> 3.2em
-% Indent of sebsection number: 4.1em -> 5em
+% Indent of subsection number: 4.1em -> 5.6em
% Width of subsection number: 3.5em -> 4.3em
%
-% These params can have 4 digit page counts, 2 digit chapter counts,
+% These params can have 4 digit page counts, 3 digit chapter counts,
% section counts of 4 digits + 1 period (e.g., 18.10), and subsection counts
% of 5 digits + 2 periods (e.g., 18.7.13).
\makeatletter
@@ -37,7 +37,7 @@
\ifnum \c@tocdepth >\m@ne
\addpenalty{-\@highpenalty}%
\vskip 1.0em \@plus\p@
- \setlength\@tempdima{1.8em}%
+ \setlength\@tempdima{2.4em}%
\begingroup
\parindent \z@ \rightskip \@pnumwidth
\parfillskip -\@pnumwidth
@@ -51,8 +51,8 @@
\endgroup
\fi}
%% Redefine \l@section and \l@subsection
-\renewcommand*\l@section{\@dottedtocline{1}{1.8em}{3.2em}}
-\renewcommand*\l@subsection{\@dottedtocline{2}{5em}{4.3em}}
+\renewcommand*\l@section{\@dottedtocline{1}{2.4em}{3.2em}}
+\renewcommand*\l@subsection{\@dottedtocline{2}{5.6em}{4.3em}}
\makeatother
%% Sphinx < 1.8 doesn't have \sphinxtableofcontentshook
\providecommand{\sphinxtableofcontentshook}{}
diff --git a/Documentation/tools/rtla/common_appendix.rst b/Documentation/tools/rtla/common_appendix.rst
index b494084acccd..b5cf2dc223df 100644
--- a/Documentation/tools/rtla/common_appendix.rst
+++ b/Documentation/tools/rtla/common_appendix.rst
@@ -1,6 +1,7 @@
REPORTING BUGS
==============
-Report bugs to <lkml@vger.kernel.org>
+Report bugs to <linux-kernel@vger.kernel.org>
+and <linux-trace-devel@vger.kernel.org>
LICENSE
=======
diff --git a/Documentation/translations/ja_JP/SubmittingPatches b/Documentation/translations/ja_JP/SubmittingPatches
index 0d308edef781..66ce0d8b0526 100644
--- a/Documentation/translations/ja_JP/SubmittingPatches
+++ b/Documentation/translations/ja_JP/SubmittingPatches
@@ -81,9 +81,7 @@ Linux カーãƒãƒ«ã«å¯¾ã™ã‚‹å…¨ã¦ã®å¤‰æ›´ã¯ diff(1) コマンドã«ã‚ˆã‚‹ãƒ
dontdiff ファイルã«ã¯ Linux カーãƒãƒ«ã®ãƒ“ルドプロセスã®éŽç¨‹ã§ç”Ÿæˆã•ã‚ŒãŸ
ファイルã®ä¸€è¦§ãŒã®ã£ã¦ã„ã¾ã™ã€‚ãã—ã¦ã€ãれらã¯ãƒ‘ッãƒã‚’生æˆã™ã‚‹ diff(1)
コマンドã§ç„¡è¦–ã•ã‚Œã‚‹ã¹ãã§ã™ã€‚dontdiff ファイル㯠2.6.12 以後ã®ãƒãƒ¼ã‚¸ãƒ§
-ン㮠Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã«å«ã¾ã‚Œã¦ã„ã¾ã™ã€‚ãれよりå‰ã®ãƒãƒ¼ã‚¸ãƒ§ãƒ³
-ã® Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã«å¯¾ã™ã‚‹ dontdiff ファイルã¯ã€
-<http://www.xenotime.net/linux/doc/dontdiff>ã‹ã‚‰å–å¾—ã™ã‚‹ã“ã¨ãŒã§ãã¾ã™ã€‚
+ン㮠Linux カーãƒãƒ«ã‚½ãƒ¼ã‚¹ãƒ„リーã«å«ã¾ã‚Œã¦ã„ã¾ã™ã€‚
投稿ã™ã‚‹ãƒ‘ッãƒã®ä¸­ã«é–¢ä¿‚ã®ãªã„余分ãªãƒ•ã‚¡ã‚¤ãƒ«ãŒå«ã¾ã‚Œã¦ã„ãªã„ã“ã¨ã‚’確
èªã—ã¦ãã ã•ã„。diff(1) コマンドã§ç”Ÿæˆã—ãŸãƒ‘ッãƒãŒã‚ãªãŸã®æ„図ã—ãŸã¨ãŠ
@@ -125,6 +123,17 @@ http://savannah.nongnu.org/projects/quilt
登録済ã¿ã®ãƒã‚°ã‚¨ãƒ³ãƒˆãƒªã‚’修正ã™ã‚‹ãƒ‘ッãƒã§ã‚ã‚Œã°ã€ãã®ãƒã‚°ã‚¨ãƒ³ãƒˆãƒªã‚’示ã™ãƒã‚° ID
ã‚„ URL を明記ã—ã¦ãã ã•ã„。
+特定ã®ã‚³ãƒŸãƒƒãƒˆã‚’å‚ç…§ã—ãŸã„å ´åˆã¯ã€ãã® SHA-1 ID ã ã‘ã§ãªãã€ä¸€è¡Œã‚µãƒžãƒª
+ã‚‚å«ã‚ã¦ãã ã•ã„。ãã‚Œã«ã‚ˆã‚Šã€ãã‚ŒãŒä½•ã«é–¢ã™ã‚‹ã‚³ãƒŸãƒƒãƒˆãªã®ã‹ãŒãƒ¬ãƒ“ューã™ã‚‹
+人ã«ã‚ã‹ã‚Šã‚„ã™ããªã‚Šã¾ã™ã€‚
+例 (英文ã®ãƒžãƒž):
+
+ Commit e21d2170f36602ae2708 ("video: remove unnecessary
+ platform_set_drvdata()") removed the unnecessary
+ platform_set_drvdata(), but left the variable "dev" unused,
+ delete it.
+
+
3) パッãƒã®åˆ†å‰²
æ„味ã®ã‚ã‚‹ã²ã¨ã¾ã¨ã¾ã‚Šã”ã¨ã«å¤‰æ›´ã‚’個々ã®ãƒ‘ッãƒãƒ•ã‚¡ã‚¤ãƒ«ã«åˆ†ã‘ã¦ãã ã•ã„。
@@ -162,7 +171,8 @@ http://savannah.nongnu.org/projects/quilt
MAINTAINERS ファイルã¨ã‚½ãƒ¼ã‚¹ã‚³ãƒ¼ãƒ‰ã«ç›®ã‚’通ã—ã¦ãã ã•ã„。ãã—ã¦ã€ãã®å¤‰
æ›´ãŒãƒ¡ãƒ³ãƒ†ãƒŠã®ã„る特定ã®ã‚µãƒ–システムã«åŠ ãˆã‚‰ã‚Œã‚‹ã‚‚ã®ã§ã‚ã‚‹ã“ã¨ãŒåˆ†ã‹
-ã‚Œã°ã€ãã®äººã«é›»å­ãƒ¡ãƒ¼ãƒ«ã‚’é€ã£ã¦ãã ã•ã„。
+ã‚Œã°ã€ãã®äººã«é›»å­ãƒ¡ãƒ¼ãƒ«ã‚’é€ã£ã¦ãã ã•ã„。ãã®éš›
+./scripts/get_maintainer.pl のスクリプトが有用です。
ã‚‚ã—ã€ãƒ¡ãƒ³ãƒ†ãƒŠãŒè¼‰ã£ã¦ã„ãªã‹ã£ãŸã‚Šã€ãƒ¡ãƒ³ãƒ†ãƒŠã‹ã‚‰ã®å¿œç­”ãŒãªã„ãªã‚‰ã€
LKML ( linux-kernel@vger.kernel.org )ã¸ãƒ‘ッãƒã‚’é€ã£ã¦ãã ã•ã„。ã»ã¨ã‚“ã©
@@ -400,7 +410,7 @@ Acked-by: ãŒå¿…ãšã—もパッãƒå…¨ä½“ã®æ‰¿èªã‚’示ã—ã¦ã„ã‚‹ã‚ã‘ã§ã¯ã
ã“ã®ã‚¿ã‚°ã¯ãƒ‘ッãƒã«é–¢å¿ƒãŒã‚ã‚‹ã¨æ€ã‚れる人é”ãŒãã®ãƒ‘ッãƒã®è­°è«–ã«å«ã¾ã‚Œã¦ã„ãŸã“ã¨
を明文化ã—ã¾ã™ã€‚
-14) Reported-by 㨠Tested-by: 㨠Reviewed-by: ã®åˆ©ç”¨
+14) Reported-by:, Tested-by:, Reviewed-by: ãŠã‚ˆã³ Suggested-by: ã®åˆ©ç”¨
ä»–ã®èª°ã‹ã«ã‚ˆã£ã¦å ±å‘Šã•ã‚ŒãŸå•é¡Œã‚’修正ã™ã‚‹ãƒ‘ッãƒã§ã‚ã‚Œã°ã€å•é¡Œå ±å‘Šè€…ã¨ã„ã†å¯„与を
クレジットã™ã‚‹ãŸã‚ã«ã€Reported-by: タグを追加ã™ã‚‹ã“ã¨ã‚’検討ã—ã¦ãã ã•ã„。
@@ -449,6 +459,13 @@ Reviewd-by ã‚¿ã‚°ã¯ãã®ãƒ‘ッãƒãŒã‚«ãƒ¼ãƒãƒ«ã«å¯¾ã—ã¦é©åˆ‡ãªä¿®æ­£ã§
レビューを実施ã—ãŸãƒ¬ãƒ“ューアã«ã‚ˆã£ã¦æä¾›ã•ã‚Œã‚‹æ™‚ã€Reviewed-by: ã‚¿ã‚°ãŒã‚ãªãŸã®
パッãƒã‚’カーãƒãƒ«ã«ãƒžãƒ¼ã‚¸ã™ã‚‹å¯èƒ½æ€§ã‚’高ã‚ã‚‹ã§ã—ょã†ã€‚
+Suggested-by: ã‚¿ã‚°ã¯ã€ãƒ‘ッãƒã®ã‚¢ã‚¤ãƒ‡ã‚¢ãŒãã®äººã‹ã‚‰ã®æ案ã«åŸºã¥ãã‚‚ã®ã§ã‚ã‚‹
+ã“ã¨ã‚’示ã—ã€ã‚¢ã‚¤ãƒ‡ã‚¢ã®æ供をクレジットã™ã‚‹ã‚‚ã®ã§ã™ã€‚æ案者ã®æ˜Žç¤ºçš„ãªè¨±å¯ãŒ
+ãªã„å ´åˆã€ç‰¹ã«ãã®ã‚¢ã‚¤ãƒ‡ã‚¢ãŒå…¬é–‹ã®ãƒ•ã‚©ãƒ¼ãƒ©ãƒ ã§ç¤ºã•ã‚Œã¦ã„ãªã„å ´åˆã«ã¯ã€ã“ã®
+ã‚¿ã‚°ã‚’ã¤ã‘ãªã„よã†ã«æ³¨æ„ã—ã¦ãã ã•ã„。ã¨ã¯ã„ãˆã€ã‚¢ã‚¤ãƒ‡ã‚¢ã®æ供者をã“ã¤ã“ã¤
+クレジットã—ã¦ã„ã‘ã°ã€æœ›ã‚€ã‚‰ãã¯ãã®äººãŸã¡ãŒå°†æ¥åˆ¥ã®æ©Ÿä¼šã«å†åº¦åŠ›ã‚’貸ã™æ°—ã«
+ãªã£ã¦ãれるã‹ã‚‚ã—ã‚Œã¾ã›ã‚“。
+
15) 標準的ãªãƒ‘ッãƒã®ãƒ•ã‚©ãƒ¼ãƒžãƒƒãƒˆ
標準的ãªãƒ‘ッãƒã®ã‚µãƒ–ジェクトã¯ä»¥ä¸‹ã®ã¨ãŠã‚Šã§ã™ã€‚
@@ -681,10 +698,11 @@ Jeff Garzik, "Linux kernel patch submission format".
<https://web.archive.org/web/20180829112450/http://linux.yyz.us/patch-format.html>
Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer".
- <http://www.kroah.com/log/2005/03/31/>
- <http://www.kroah.com/log/2005/07/08/>
- <http://www.kroah.com/log/2005/10/19/>
- <http://www.kroah.com/log/2006/01/11/>
+ <http://www.kroah.com/log/linux/maintainer.html>
+ <http://www.kroah.com/log/linux/maintainer-02.html>
+ <http://www.kroah.com/log/linux/maintainer-03.html>
+ <http://www.kroah.com/log/linux/maintainer-04.html>
+ <http://www.kroah.com/log/linux/maintainer-05.html>
NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!
<https://lore.kernel.org/r/20050711.125305.08322243.davem@davemloft.net>
diff --git a/Documentation/translations/ja_JP/howto.rst b/Documentation/translations/ja_JP/howto.rst
index d667f9d8a02a..38fed6fe62fe 100644
--- a/Documentation/translations/ja_JP/howto.rst
+++ b/Documentation/translations/ja_JP/howto.rst
@@ -262,21 +262,21 @@ Linux カーãƒãƒ«ã®é–‹ç™ºãƒ—ロセスã¯ç¾åœ¨å¹¾ã¤ã‹ã®ç•°ãªã‚‹ãƒ¡ã‚¤ãƒ³ã‚
ãƒã€ã¨å¤šæ•°ã®ã‚µãƒ–システム毎ã®ã‚«ãƒ¼ãƒãƒ«ãƒ–ランãƒã‹ã‚‰æ§‹æˆã•ã‚Œã¾ã™ã€‚ã“れらã®
ブランãƒã¨ã¯ -
- - メイン㮠4.x カーãƒãƒ«ãƒ„リー
- - 4.x.y -stable カーãƒãƒ«ãƒ„リー
- - サブシステム毎ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リーã¨ãƒ‘ッãƒ
- - çµ±åˆãƒ†ã‚¹ãƒˆã®ãŸã‚ã® 4.x -next カーãƒãƒ«ãƒ„リー
+ - Linus ã®ãƒ¡ã‚¤ãƒ³ãƒ©ã‚¤ãƒ³ãƒ„リー
+ - メジャー番å·ã‚’ã¾ãŸã数本ã®å®‰å®šç‰ˆãƒ„リー
+ - サブシステム毎ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リー
+ - çµ±åˆãƒ†ã‚¹ãƒˆã®ãŸã‚ã® linux-next カーãƒãƒ«ãƒ„リー
-4.x カーãƒãƒ«ãƒ„リー
+メインラインツリー
~~~~~~~~~~~~~~~~~~
-4.x カーãƒãƒ«ã¯ Linus Torvalds ã«ã‚ˆã£ã¦ãƒ¡ãƒ³ãƒ†ãƒŠãƒ³ã‚¹ã•ã‚Œã€
-https://kernel.org ã® pub/linux/kernel/v4.x/ ディレクトリã«å­˜åœ¨ã—ã¾ã™ã€‚
+メインラインツリー㯠Linus Torvalds ã«ã‚ˆã£ã¦ãƒ¡ãƒ³ãƒ†ãƒŠãƒ³ã‚¹ã•ã‚Œã€
+https://kernel.org ã®ãƒªãƒã‚¸ãƒˆãƒªã«å­˜åœ¨ã—ã¾ã™ã€‚
ã“ã®é–‹ç™ºãƒ—ロセスã¯ä»¥ä¸‹ã®ã¨ãŠã‚Š -
- æ–°ã—ã„カーãƒãƒ«ãŒãƒªãƒªãƒ¼ã‚¹ã•ã‚ŒãŸç›´å¾Œã«ã€2週間ã®ç‰¹åˆ¥æœŸé–“ãŒè¨­ã‘られã€
ã“ã®æœŸé–“中ã«ã€ãƒ¡ãƒ³ãƒ†ãƒŠé”㯠Linus ã«å¤§ããªå·®åˆ†ã‚’é€ã‚‹ã“ã¨ãŒã§ãã¾ã™ã€‚
- ã“ã®ã‚ˆã†ãªå·®åˆ†ã¯é€šå¸¸ -next カーãƒãƒ«ã«æ•°é€±é–“å«ã¾ã‚Œã¦ããŸãƒ‘ッãƒã§ã™ã€‚
+ ã“ã®ã‚ˆã†ãªå·®åˆ†ã¯é€šå¸¸ linux-next カーãƒãƒ«ã«æ•°é€±é–“å«ã¾ã‚Œã¦ããŸãƒ‘ッãƒã§ã™ã€‚
大ããªå¤‰æ›´ã¯ git(カーãƒãƒ«ã®ã‚½ãƒ¼ã‚¹ç®¡ç†ãƒ„ールã€è©³ç´°ã¯
http://git-scm.com/ å‚ç…§) を使ã£ã¦é€ã‚‹ã®ãŒå¥½ã¾ã—ã„ã‚„ã‚Šæ–¹ã§ã™ãŒã€ãƒ‘ッ
ãƒãƒ•ã‚¡ã‚¤ãƒ«ã®å½¢å¼ã®ã¾ã¾é€ã‚‹ã®ã§ã‚‚å分ã§ã™ã€‚
@@ -303,20 +303,18 @@ Andrew Morton ㌠Linux-kernel メーリングリストã«ã‚«ãƒ¼ãƒãƒ«ãƒªãƒªãƒ¼ã
å‰ã‚‚ã£ã¦æ±ºã‚られãŸè¨ˆç”»ã«ã‚ˆã£ã¦ãƒªãƒªãƒ¼ã‚¹ã•ã‚Œã‚‹ã‚‚ã®ã§ã¯ãªã„ã‹ã‚‰
ã§ã™ã€‚ã€*
-4.x.y -stable カーãƒãƒ«ãƒ„リー
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+メジャー番å·ã‚’ã¾ãŸã数本ã®å®‰å®šç‰ˆãƒ„リー
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ãƒãƒ¼ã‚¸ãƒ§ãƒ³ç•ªå·ãŒ3ã¤ã®æ•°å­—ã«åˆ†ã‹ã‚Œã¦ã„るカーãƒãƒ«ã¯ -stable カーãƒãƒ«ã§ã™ã€‚
-ã“ã‚Œã«ã¯ã€4.x カーãƒãƒ«ã§è¦‹ã¤ã‹ã£ãŸã‚»ã‚­ãƒ¥ãƒªãƒ†ã‚£å•é¡Œã‚„é‡å¤§ãªå¾Œæˆ»ã‚Šã«å¯¾ã™
-る比較的å°ã•ã„é‡è¦ãªä¿®æ­£ãŒå«ã¾ã‚Œã¾ã™ã€‚
+ã“ã‚Œã«ã¯æœ€åˆã®2ã¤ã®ãƒãƒ¼ã‚¸ãƒ§ãƒ³ç•ªå·ã®æ•°å­—ã«å¯¾å¿œã—ãŸã€
+メインラインリリースã§è¦‹ã¤ã‹ã£ãŸã‚»ã‚­ãƒ¥ãƒªãƒ†ã‚£å•é¡Œã‚„
+é‡å¤§ãªå¾Œæˆ»ã‚Šã«å¯¾ã™ã‚‹æ¯”較的å°ã•ã„é‡è¦ãªä¿®æ­£ãŒå«ã¾ã‚Œã¾ã™ã€‚
ã“ã‚Œã¯ã€é–‹ç™º/実験的ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã®ãƒ†ã‚¹ãƒˆã«å”力ã™ã‚‹ã“ã¨ã«èˆˆå‘³ãŒç„¡ãã€æœ€æ–°
ã®å®‰å®šã—ãŸã‚«ãƒ¼ãƒãƒ«ã‚’使ã„ãŸã„ユーザã«æŽ¨å¥¨ã™ã‚‹ãƒ–ランãƒã§ã™ã€‚
-ã‚‚ã—ã€4.x.y カーãƒãƒ«ãŒå­˜åœ¨ã—ãªã„å ´åˆã«ã¯ã€ç•ªå·ãŒä¸€ç•ªå¤§ãã„ 4.x ãŒæœ€æ–°
-ã®å®‰å®šç‰ˆã‚«ãƒ¼ãƒãƒ«ã§ã™ã€‚
-
-4.x.y 㯠"stable" ãƒãƒ¼ãƒ  <stable@vger.kernel.org> ã§ãƒ¡ãƒ³ãƒ†ã•ã‚Œã¦ãŠã‚Šã€
+安定版ツリーã¯"stable" ãƒãƒ¼ãƒ  <stable@vger.kernel.org> ã§ãƒ¡ãƒ³ãƒ†ã•ã‚Œã¦ãŠã‚Šã€
å¿…è¦ã«å¿œã˜ã¦ãƒªãƒªãƒ¼ã‚¹ã•ã‚Œã¾ã™ã€‚通常ã®ãƒªãƒªãƒ¼ã‚¹æœŸé–“㯠2週間毎ã§ã™ãŒã€å·®
ã—è¿«ã£ãŸå•é¡ŒãŒãªã‘ã‚Œã°ã‚‚ã†å°‘ã—é•·ããªã‚‹ã“ã¨ã‚‚ã‚ã‚Šã¾ã™ã€‚セキュリティ関
連ã®å•é¡Œã®å ´åˆã¯ã“ã‚Œã«å¯¾ã—ã¦ã ã„ãŸã„ã®å ´åˆã€ã™ãã«ãƒªãƒªãƒ¼ã‚¹ãŒã•ã‚Œã¾ã™ã€‚
@@ -326,7 +324,7 @@ Documentation/process/stable-kernel-rules.rst ファイルã«ã¯ã©ã®ã‚ˆã†ãªç
é¡žã®å¤‰æ›´ãŒ -stable ツリーã«å—ã‘入れå¯èƒ½ã‹ã€ã¾ãŸãƒªãƒªãƒ¼ã‚¹ãƒ—ロセスãŒã©ã†
å‹•ãã‹ãŒè¨˜è¿°ã•ã‚Œã¦ã„ã¾ã™ã€‚
-サブシステム毎ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リーã¨ãƒ‘ッãƒ
+サブシステム毎ã®ã‚«ãƒ¼ãƒãƒ«ãƒ„リー
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ãã‚Œãžã‚Œã®ã‚«ãƒ¼ãƒãƒ«ã‚µãƒ–システムã®ãƒ¡ãƒ³ãƒ†ãƒŠé”㯠--- ãã—ã¦å¤šãã®ã‚«ãƒ¼ãƒãƒ«
@@ -351,19 +349,19 @@ quilt シリーズã¨ã—ã¦å…¬é–‹ã•ã‚Œã¦ã„るパッãƒã‚­ãƒ¥ãƒ¼ã‚‚使ã‚ã‚Œã
ã‘ã‚‹ã“ã¨ãŒã§ãã¾ã™ã€‚大部分ã®ã“れら㮠patchwork ã®ã‚µã‚¤ãƒˆã¯
https://patchwork.kernel.org/ ã§ãƒªã‚¹ãƒˆã•ã‚Œã¦ã„ã¾ã™ã€‚
-çµ±åˆãƒ†ã‚¹ãƒˆã®ãŸã‚ã® 4.x -next カーãƒãƒ«ãƒ„リー
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+çµ±åˆãƒ†ã‚¹ãƒˆã®ãŸã‚ã® linux-next カーãƒãƒ«ãƒ„リー
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-サブシステムツリーã®æ›´æ–°å†…容ãŒãƒ¡ã‚¤ãƒ³ãƒ©ã‚¤ãƒ³ã® 4.x ツリーã«ãƒžãƒ¼ã‚¸ã•ã‚Œã‚‹
+サブシステムツリーã®æ›´æ–°å†…容ãŒãƒ¡ã‚¤ãƒ³ãƒ©ã‚¤ãƒ³ãƒ„リーã«ãƒžãƒ¼ã‚¸ã•ã‚Œã‚‹
å‰ã«ã€ãれらã¯çµ±åˆãƒ†ã‚¹ãƒˆã•ã‚Œã‚‹å¿…è¦ãŒã‚ã‚Šã¾ã™ã€‚ã“ã®ç›®çš„ã®ãŸã‚ã€å®Ÿè³ªçš„ã«
全サブシステムツリーã‹ã‚‰ã»ã¼æ¯Žæ—¥ãƒ—ルã•ã‚Œã¦ã§ãる特別ãªãƒ†ã‚¹ãƒˆç”¨ã®ãƒªãƒã‚¸
トリãŒå­˜åœ¨ã—ã¾ã™-
https://git.kernel.org/?p=linux/kernel/git/next/linux-next.git
-ã“ã®ã‚„ã‚Šæ–¹ã«ã‚ˆã£ã¦ã€-next カーãƒãƒ«ã¯æ¬¡ã®ãƒžãƒ¼ã‚¸æ©Ÿä¼šã§ã©ã‚“ãªã‚‚ã®ãŒãƒ¡ã‚¤ãƒ³
-ラインカーãƒãƒ«ã«ãƒžãƒ¼ã‚¸ã•ã‚Œã‚‹ã‹ã€ãŠãŠã¾ã‹ãªã®å±•æœ›ã‚’æä¾›ã—ã¾ã™ã€‚-next カー
-ãƒãƒ«ã®å®Ÿè¡Œãƒ†ã‚¹ãƒˆã‚’è¡Œã†å†’険好ããªãƒ†ã‚¹ã‚¿ãƒ¼ã¯å¤§ã„ã«æ­“è¿Žã•ã‚Œã¾ã™ã€‚
+ã“ã®ã‚„ã‚Šæ–¹ã«ã‚ˆã£ã¦ã€linux-next ã¯æ¬¡ã®ãƒžãƒ¼ã‚¸æ©Ÿä¼šã§ã©ã‚“ãªã‚‚ã®ãŒãƒ¡ã‚¤ãƒ³
+ラインã«ãƒžãƒ¼ã‚¸ã•ã‚Œã‚‹ã‹ã€ãŠãŠã¾ã‹ãªå±•æœ›ã‚’æä¾›ã—ã¾ã™ã€‚
+linux-next ã®å®Ÿè¡Œãƒ†ã‚¹ãƒˆã‚’è¡Œã†å†’険好ããªãƒ†ã‚¹ã‚¿ãƒ¼ã¯å¤§ã„ã«æ­“è¿Žã•ã‚Œã¾ã™ã€‚
ãƒã‚°ãƒ¬ãƒãƒ¼ãƒˆ
-------------
diff --git a/Documentation/translations/ja_JP/index.rst b/Documentation/translations/ja_JP/index.rst
index 20738c931d02..43b9fb7246d3 100644
--- a/Documentation/translations/ja_JP/index.rst
+++ b/Documentation/translations/ja_JP/index.rst
@@ -5,7 +5,7 @@
\kerneldocCJKon
\kerneldocBeginJP{
-Japanese translations
+日本語訳
=====================
.. toctree::
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
index 9e541578f38d..1500bdbf338a 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst
@@ -53,8 +53,8 @@ DAMON_RECLAIM找到在特定时间内没有被访问的内存区域并分页。ä
下é¢æ˜¯æ¯ä¸ªå‚æ•°çš„æ述。
-enable
-------
+enabled
+-------
å¯ç”¨æˆ–ç¦ç”¨DAMON_RECLAIM。
diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
index 5d7533347216..eee0e8c5c368 100644
--- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
+++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst
@@ -13,7 +13,7 @@
详细用法
========
-DAMON 为ä¸åŒçš„用户æ供了下é¢ä¸‰ç§æŽ¥å£ã€‚
+DAMON 为ä¸åŒçš„用户æ供了下é¢è¿™äº›æŽ¥å£ã€‚
- *DAMON用户空间工具。*
`è¿™ <https://github.com/awslabs/damo>`_ 为有这特æƒçš„人, 如系统管ç†å‘˜ï¼Œå¸Œæœ›æœ‰ä¸€ä¸ªåˆšå¥½
@@ -21,19 +21,290 @@ DAMON 为ä¸åŒçš„用户æ供了下é¢ä¸‰ç§æŽ¥å£ã€‚
使用它,用户å¯ä»¥ä»¥äººæ€§åŒ–çš„æ–¹å¼ä½¿ç”¨DAMON的主è¦åŠŸèƒ½ã€‚ä¸è¿‡ï¼Œå®ƒå¯èƒ½ä¸ä¼šä¸ºç‰¹æ®Šæƒ…况进行高度调整。
它åŒæ—¶æ”¯æŒè™šæ‹Ÿå’Œç‰©ç†åœ°å€ç©ºé—´çš„监测。更多细节,请å‚考它的 `使用文档
<https://github.com/awslabs/damo/blob/next/USAGE.md>`_。
-- *debugfs接å£ã€‚*
- :ref:`è¿™ <debugfs_interface>` 是为那些希望更高级的使用DAMON的特æƒç”¨æˆ·ç©ºé—´ç¨‹åºå‘˜å‡†å¤‡çš„。
- 使用它,用户å¯ä»¥é€šè¿‡è¯»å–和写入特殊的debugfs文件æ¥ä½¿ç”¨DAMON的主è¦åŠŸèƒ½ã€‚因此,你å¯ä»¥ç¼–写和使
- 用你个性化的DAMON debugfs包装程åºï¼Œä»£æ›¿ä½ è¯»/写debugfs文件。 `DAMON用户空间工具
+- *sysfs接å£ã€‚*
+ :ref:`è¿™ <sysfs_interface>` 是为那些希望更高级的使用DAMON的特æƒç”¨æˆ·ç©ºé—´ç¨‹åºå‘˜å‡†å¤‡çš„。
+ 使用它,用户å¯ä»¥é€šè¿‡è¯»å–和写入特殊的sysfs文件æ¥ä½¿ç”¨DAMON的主è¦åŠŸèƒ½ã€‚因此,你å¯ä»¥ç¼–写和使
+ 用你个性化的DAMON sysfs包装程åºï¼Œä»£æ›¿ä½ è¯»/写sysfs文件。 `DAMON用户空间工具
<https://github.com/awslabs/damo>`_ 就是这ç§ç¨‹åºçš„ä¸€ä¸ªä¾‹å­ å®ƒåŒæ—¶æ”¯æŒè™šæ‹Ÿå’Œç‰©ç†åœ°å€
空间的监测。注æ„,这个界é¢åªæ供简å•çš„监测结果 :ref:`统计 <damos_stats>`。对于详细的监测
结果,DAMONæ供了一个:ref:`跟踪点 <tracepoint>`。
-
+- *debugfs接口。*
+ :ref:`这 <debugfs_interface>` 几乎与 :ref:`sysfs interface <sysfs_interface>` 接
+ å£ç›¸åŒã€‚这将在下一个LTS内核å‘布åŽè¢«ç§»é™¤ï¼Œæ‰€ä»¥ç”¨æˆ·åº”该转移到
+ :ref:`sysfs interface <sysfs_interface>`。
- *内核空间编程接å£ã€‚*
- :doc:`This </vm/damon/api>` 这是为内核空间程åºå‘˜å‡†å¤‡çš„。使用它,用户å¯ä»¥é€šè¿‡ä¸ºä½ ç¼–写内
+ :doc:`è¿™ </vm/damon/api>` 这是为内核空间程åºå‘˜å‡†å¤‡çš„。使用它,用户å¯ä»¥é€šè¿‡ä¸ºä½ ç¼–写内
核空间的DAMON应用程åºï¼Œæœ€çµæ´»æœ‰æ•ˆåœ°åˆ©ç”¨DAMONçš„æ¯ä¸€ä¸ªåŠŸèƒ½ã€‚你甚至å¯ä»¥ä¸ºå„ç§åœ°å€ç©ºé—´æ‰©å±•DAMON。
详细情况请å‚è€ƒæŽ¥å£ :doc:`文件 </vm/damon/api>`。
+sysfs接å£
+=========
+DAMONçš„sysfs接å£æ˜¯åœ¨å®šä¹‰ ``CONFIG_DAMON_SYSFS`` 时建立的。它在其sysfs目录下创建多
+个目录和文件, ``<sysfs>/kernel/mm/damon/`` 。你å¯ä»¥é€šè¿‡å¯¹è¯¥ç›®å½•ä¸‹çš„文件进行写入和
+读å–æ¥æŽ§åˆ¶DAMON。
+
+对于一个简短的例å­ï¼Œç”¨æˆ·å¯ä»¥ç›‘测一个给定工作负载的虚拟地å€ç©ºé—´ï¼Œå¦‚下所示::
+
+ # cd /sys/kernel/mm/damon/admin/
+ # echo 1 > kdamonds/nr_kdamonds && echo 1 > kdamonds/0/contexts/nr_contexts
+ # echo vaddr > kdamonds/0/contexts/0/operations
+ # echo 1 > kdamonds/0/contexts/0/targets/nr_targets
+ # echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid_target
+ # echo on > kdamonds/0/state
+
+文件层次结构
+------------
+
+DAMON sysfs接å£çš„文件层次结构如下图所示。在下图中,父å­å…³ç³»ç”¨ç¼©è¿›è¡¨ç¤ºï¼Œæ¯ä¸ªç›®å½•æœ‰
+``/`` åŽç¼€ï¼Œæ¯ä¸ªç›®å½•ä¸­çš„文件用逗å·ï¼ˆ",")分开。 ::
+
+ /sys/kernel/mm/damon/admin
+ │ kdamonds/nr_kdamonds
+ │ │ 0/state,pid
+ │ │ │ contexts/nr_contexts
+ │ │ │ │ 0/operations
+ │ │ │ │ │ monitoring_attrs/
+ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
+ │ │ │ │ │ │ nr_regions/min,max
+ │ │ │ │ │ targets/nr_targets
+ │ │ │ │ │ │ 0/pid_target
+ │ │ │ │ │ │ │ regions/nr_regions
+ │ │ │ │ │ │ │ │ 0/start,end
+ │ │ │ │ │ │ │ │ ...
+ │ │ │ │ │ │ ...
+ │ │ │ │ │ schemes/nr_schemes
+ │ │ │ │ │ │ 0/action
+ │ │ │ │ │ │ │ access_pattern/
+ │ │ │ │ │ │ │ │ sz/min,max
+ │ │ │ │ │ │ │ │ nr_accesses/min,max
+ │ │ │ │ │ │ │ │ age/min,max
+ │ │ │ │ │ │ │ quotas/ms,bytes,reset_interval_ms
+ │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
+ │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low
+ │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds
+ │ │ │ │ │ │ ...
+ │ │ │ │ ...
+ │ │ ...
+
+æ ¹
+--
+
+DAMON sysfs接å£çš„根是 ``<sysfs>/kernel/mm/damon/`` ,它有一个å为 ``admin`` çš„
+目录。该目录包å«ç‰¹æƒç”¨æˆ·ç©ºé—´ç¨‹åºæŽ§åˆ¶DAMON的文件。拥有根æƒé™çš„用户空间工具或deamonså¯ä»¥
+使用这个目录。
+
+kdamonds/
+---------
+
+与监测相关的信æ¯åŒ…括请求规格和结果被称为DAMON上下文。DAMON用一个å«åškdamond的内核线程
+执行æ¯ä¸ªä¸Šä¸‹æ–‡ï¼Œå¤šä¸ªkdamondså¯ä»¥å¹¶è¡Œè¿è¡Œã€‚
+
+在 ``admin`` 目录下,有一个目录,å³``kdamonds``,它有控制kdamonds的文件存在。在开始
+时,这个目录åªæœ‰ä¸€ä¸ªæ–‡ä»¶ï¼Œ``nr_kdamonds``。å‘该文件写入一个数字(``N``),就会创建å为
+``0`` 到 ``N-1`` çš„å­ç›®å½•æ•°é‡ã€‚æ¯ä¸ªç›®å½•ä»£è¡¨æ¯ä¸ªkdamond。
+
+kdamonds/<N>/
+-------------
+
+在æ¯ä¸ªkdamond目录中,存在两个文件(``state`` å’Œ ``pid`` )和一个目录( ``contexts`` )。
+
+è¯»å– ``state`` 时,如果kdamond当å‰æ­£åœ¨è¿è¡Œï¼Œåˆ™è¿”回 ``on`` ,如果没有è¿è¡Œåˆ™è¿”回 ``off`` 。
+写入 ``on`` 或 ``off`` 使kdamond处于状æ€ã€‚å‘ ``state`` 文件写 ``update_schemes_stats`` ,
+æ›´æ–°kdamondçš„æ¯ä¸ªåŸºäºŽDAMONçš„æ“作方案的统计文件的内容。关于统计信æ¯çš„细节,请å‚考
+:ref:`stats section <sysfs_schemes_stats>`.
+
+如果状æ€ä¸º ``on``ï¼Œè¯»å– ``pid`` 显示kdamond线程的pid。
+
+``contexts`` 目录包å«æŽ§åˆ¶è¿™ä¸ªkdamondè¦æ‰§è¡Œçš„监测上下文的文件。
+
+kdamonds/<N>/contexts/
+----------------------
+
+在开始时,这个目录åªæœ‰ä¸€ä¸ªæ–‡ä»¶ï¼Œå³ ``nr_contexts`` 。å‘该文件写入一个数字( ``N`` ),就会创
+建å为``0`` 到 ``N-1`` çš„å­ç›®å½•æ•°é‡ã€‚æ¯ä¸ªç›®å½•ä»£è¡¨æ¯ä¸ªç›‘测背景。目å‰ï¼Œæ¯ä¸ªkdamondåªæ”¯æŒ
+一个上下文,所以åªæœ‰ ``0`` 或 ``1`` å¯ä»¥è¢«å†™å…¥æ–‡ä»¶ã€‚
+
+contexts/<N>/
+-------------
+
+在æ¯ä¸ªä¸Šä¸‹æ–‡ç›®å½•ä¸­ï¼Œå­˜åœ¨ä¸€ä¸ªæ–‡ä»¶(``operations``)和三个目录(``monitoring_attrs``,
+``targets``, 和 ``schemes``)。
+
+DAMON支æŒå¤šç§ç±»åž‹çš„监测æ“作,包括对虚拟地å€ç©ºé—´å’Œç‰©ç†åœ°å€ç©ºé—´çš„监测。你å¯ä»¥é€šè¿‡å‘文件
+中写入以下关键è¯ä¹‹ä¸€ï¼Œå¹¶ä»Žæ–‡ä»¶ä¸­è¯»å–,æ¥è®¾ç½®å’ŒèŽ·å–DAMON将为上下文使用何ç§ç±»åž‹çš„监测æ“作。
+
+ - vaddr: 监测特定进程的虚拟地å€ç©ºé—´
+ - paddr: 监视系统的物ç†åœ°å€ç©ºé—´
+
+contexts/<N>/monitoring_attrs/
+------------------------------
+
+用于指定监测属性的文件,包括所需的监测质é‡å’Œæ•ˆçŽ‡ï¼Œéƒ½åœ¨ ``monitoring_attrs`` 目录中。
+具体æ¥è¯´ï¼Œè¿™ä¸ªç›®å½•ä¸‹æœ‰ä¸¤ä¸ªç›®å½•ï¼Œå³ ``intervals`` å’Œ ``nr_regions`` 。
+
+在 ``intervals`` 目录下,存在DAMON的采样间隔(``sample_us``)ã€èšé›†é—´éš”(``aggr_us``)
+和更新间隔(``update_us``)三个文件。你å¯ä»¥é€šè¿‡å†™å…¥å’Œè¯»å‡ºè¿™äº›æ–‡ä»¶æ¥è®¾ç½®å’ŒèŽ·å–微秒级的值。
+
+在 ``nr_regions`` 目录下,有两个文件分别用于DAMON监测区域的下é™å’Œä¸Šé™ï¼ˆ``min`` å’Œ ``max`` ),
+这两个文件控制ç€ç›‘测的开销。你å¯ä»¥é€šè¿‡å‘这些文件的写入和读出æ¥è®¾ç½®å’ŒèŽ·å–这些值。
+
+关于间隔和监测区域范围的更多细节,请å‚考设计文件 (:doc:`/vm/damon/design`)。
+
+contexts/<N>/targets/
+---------------------
+
+在开始时,这个目录åªæœ‰ä¸€ä¸ªæ–‡ä»¶ ``nr_targets`` 。å‘该文件写入一个数字(``N``),就å¯ä»¥åˆ›å»º
+å为 ``0`` 到 ``N-1`` çš„å­ç›®å½•çš„æ•°é‡ã€‚æ¯ä¸ªç›®å½•ä»£è¡¨æ¯ä¸ªç›‘测目标。
+
+targets/<N>/
+------------
+
+在æ¯ä¸ªç›®æ ‡ç›®å½•ä¸­ï¼Œå­˜åœ¨ä¸€ä¸ªæ–‡ä»¶(``pid_target``)和一个目录(``regions``)。
+
+如果你把 ``vaddr`` 写到 ``contexts/<N>/operations`` 中,æ¯ä¸ªç›®æ ‡åº”该是一个进程。你
+å¯ä»¥é€šè¿‡å°†è¿›ç¨‹çš„pid写到 ``pid_target`` 文件中æ¥æŒ‡å®šDAMON的进程。
+
+targets/<N>/regions
+-------------------
+
+当使用 ``vaddr`` 监测æ“作集时( ``vaddr`` 被写入 ``contexts/<N>/operations`` æ–‡
+件),DAMON自动设置和更新监测目标区域,这样就å¯ä»¥è¦†ç›–目标进程的整个内存映射。然而,用户å¯
+能希望将åˆå§‹ç›‘测区域设置为特定的地å€èŒƒå›´ã€‚
+
+相å,当使用 ``paddr`` 监测æ“作集时,DAMONä¸ä¼šè‡ªåŠ¨è®¾ç½®å’Œæ›´æ–°ç›‘测目标区域( ``paddr``
+被写入 ``contexts/<N>/operations`` 中)。因此,在这ç§æƒ…况下,用户应该自己设置监测目标
+区域。
+
+在这ç§æƒ…况下,用户å¯ä»¥æŒ‰ç…§è‡ªå·±çš„æ„愿明确设置åˆå§‹ç›‘测目标区域,将适当的值写入该目录下的文件。
+
+开始时,这个目录åªæœ‰ä¸€ä¸ªæ–‡ä»¶ï¼Œ ``nr_regions`` 。å‘该文件写入一个数字(``N``),就å¯ä»¥åˆ›
+建å为 ``0`` 到 ``N-1`` çš„å­ç›®å½•ã€‚æ¯ä¸ªç›®å½•ä»£è¡¨æ¯ä¸ªåˆå§‹ç›‘测目标区域。
+
+regions/<N>/
+------------
+
+在æ¯ä¸ªåŒºåŸŸç›®å½•ä¸­ï¼Œä½ ä¼šå‘现两个文件( ``start`` å’Œ ``end`` )。你å¯ä»¥é€šè¿‡å‘文件写入
+和从文件中读出,分别设置和获得åˆå§‹ç›‘测目标区域的起始和结æŸåœ°å€ã€‚
+
+contexts/<N>/schemes/
+---------------------
+
+对于一版的基于DAMONçš„æ•°æ®è®¿é—®æ„ŸçŸ¥çš„内存管ç†ä¼˜åŒ–,用户通常希望系统对特定访问模å¼çš„内存区
+域应用内存管ç†æ“作。DAMON从用户那里接收这ç§å½¢å¼åŒ–çš„æ“作方案,并将这些方案应用于目标内存
+区域。用户å¯ä»¥é€šè¿‡è¯»å–和写入这个目录下的文件æ¥èŽ·å¾—和设置这些方案。
+
+在开始时,这个目录åªæœ‰ä¸€ä¸ªæ–‡ä»¶ï¼Œ``nr_schemes``。å‘该文件写入一个数字(``N``),就å¯ä»¥
+创建å为``0``到``N-1``çš„å­ç›®å½•çš„æ•°é‡ã€‚æ¯ä¸ªç›®å½•ä»£è¡¨æ¯ä¸ªåŸºäºŽDAMONçš„æ“作方案。
+
+schemes/<N>/
+------------
+
+在æ¯ä¸ªæ–¹æ¡ˆç›®å½•ä¸­ï¼Œå­˜åœ¨å››ä¸ªç›®å½•(``access_pattern``, ``quotas``,``watermarks``,
+和 ``stats``)和一个文件(``action``)。
+
+``action`` 文件用于设置和获å–你想应用于具有特定访问模å¼çš„内存区域的动作。å¯ä»¥å†™å…¥æ–‡ä»¶
+和从文件中读å–的关键è¯åŠå…¶å«ä¹‰å¦‚下。
+
+ - ``willneed``: 对有 ``MADV_WILLNEED`` 的区域调用 ``madvise()`` 。
+ - ``cold``: 对具有 ``MADV_COLD`` 的区域调用 ``madvise()`` 。
+ - ``pageout``: 为具有 ``MADV_PAGEOUT`` 的区域调用 ``madvise()`` 。
+ - ``hugepage``: 为带有 ``MADV_HUGEPAGE`` 的区域调用 ``madvise()`` 。
+ - ``nohugepage``: 为带有 ``MADV_NOHUGEPAGE`` 的区域调用 ``madvise()``。
+ - ``stat``: 什么都ä¸åšï¼Œåªè®¡ç®—统计数æ®
+
+schemes/<N>/access_pattern/
+---------------------------
+
+æ¯ä¸ªåŸºäºŽDAMONçš„æ“作方案的目标访问模å¼ç”±ä¸‰ä¸ªèŒƒå›´æž„æˆï¼ŒåŒ…括以字节为å•ä½çš„区域大å°ã€æ¯ä¸ª
+èšåˆåŒºé—´çš„监测访问次数和区域年龄的èšåˆåŒºé—´æ•°ã€‚
+
+在 ``access_pattern`` 目录下,存在三个目录( ``sz``, ``nr_accesses``, 和 ``age`` ),
+æ¯ä¸ªç›®å½•æœ‰ä¸¤ä¸ªæ–‡ä»¶ï¼ˆ``min`` å’Œ ``max`` )。你å¯ä»¥é€šè¿‡å‘ ``sz``, ``nr_accesses``, å’Œ
+``age`` 目录下的 ``min`` å’Œ ``max`` 文件分别写入和读å–æ¥è®¾ç½®å’ŒèŽ·å–给定方案的访问模å¼ã€‚
+
+schemes/<N>/quotas/
+-------------------
+
+æ¯ä¸ª ``动作`` 的最佳 ``目标访问模å¼`` å–决于工作负载,所以ä¸å®¹æ˜“找到。更糟糕的是,将æŸäº›åŠ¨ä½œ
+的方案设置得过于激进会造æˆä¸¥é‡çš„开销。为了é¿å…è¿™ç§å¼€é”€ï¼Œç”¨æˆ·å¯ä»¥ä¸ºæ¯ä¸ªæ–¹æ¡ˆé™åˆ¶æ—¶é—´å’Œå¤§å°é…é¢ã€‚
+具体æ¥è¯´ï¼Œç”¨æˆ·å¯ä»¥è¦æ±‚DAMONå°½é‡åªä½¿ç”¨ç‰¹å®šçš„时间(``时间é…é¢``)æ¥åº”用行动,并且在给定的时间间
+隔(``é‡ç½®é—´éš”``)内,åªå¯¹å…·æœ‰ç›®æ ‡è®¿é—®æ¨¡å¼çš„内存区域应用行动,而ä¸ä½¿ç”¨ç‰¹å®šæ•°é‡ï¼ˆ``大å°é…é¢``)。
+
+当预计超过é…é¢é™åˆ¶æ—¶ï¼ŒDAMONä¼šæ ¹æ® ``目标访问模å¼`` 的大å°ã€è®¿é—®é¢‘率和年龄,对找到的内存区域
+进行优先排åºã€‚为了进行个性化的优先排åºï¼Œç”¨æˆ·å¯ä»¥ä¸ºè¿™ä¸‰ä¸ªå±žæ€§è®¾ç½®æƒé‡ã€‚
+
+在 ``quotas`` 目录下,存在三个文件(``ms``, ``bytes``, ``reset_interval_ms``)和一个
+目录(``weights``),其中有三个文件(``sz_permil``, ``nr_accesses_permil``, 和
+``age_permil``)。
+
+ä½ å¯ä»¥è®¾ç½®ä»¥æ¯«ç§’为å•ä½çš„ ``时间é…é¢`` ,以字节为å•ä½çš„ ``大å°é…é¢`` ,以åŠä»¥æ¯«ç§’为å•ä½çš„ ``é‡
+置间隔`` ,分别å‘这三个文件写入数值。你还å¯ä»¥é€šè¿‡å‘ ``weights`` 目录下的三个文件写入数值æ¥è®¾
+置大å°ã€è®¿é—®é¢‘率和年龄的优先æƒï¼Œå•ä½ä¸ºåƒåˆ†ä¹‹ä¸€ã€‚
+
+schemes/<N>/watermarks/
+-----------------------
+
+为了便于根æ®ç³»ç»ŸçŠ¶æ€æ¿€æ´»å’Œåœç”¨æ¯ä¸ªæ–¹æ¡ˆï¼ŒDAMONæ供了一个称为水ä½çš„功能。该功能接收五个值,称为
+``度é‡`` ã€``é—´éš”`` ã€``高`` ã€``中`` ã€``低`` 。``度é‡å€¼`` 是指å¯ä»¥æµ‹é‡çš„系统度é‡å€¼ï¼Œå¦‚
+自由内存比率。如果系统的度é‡å€¼ ``高`` 于memoent的高值或 ``低`` 于低值,则该方案被åœç”¨ã€‚如果
+该值低于 ``中`` ,则该方案被激活。
+
+在水ä½ç›®å½•ä¸‹ï¼Œå­˜åœ¨äº”个文件(``metric``, ``interval_us``,``high``, ``mid``, and ``low``)
+用于设置æ¯ä¸ªå€¼ã€‚ä½ å¯ä»¥é€šè¿‡å‘这些文件的写入æ¥åˆ†åˆ«è®¾ç½®å’ŒèŽ·å–这五个值。
+
+å¯ä»¥å†™å…¥ ``metric`` 文件的关键è¯å’Œå«ä¹‰å¦‚下。
+
+ - none: 忽略水ä½
+ - free_mem_rate: 系统的自由内存率(åƒåˆ†æ¯”)。
+
+``interval`` 应以微秒为å•ä½å†™å…¥ã€‚
+
+schemes/<N>/stats/
+------------------
+
+DAMON统计æ¯ä¸ªæ–¹æ¡ˆè¢«å°è¯•åº”用的区域的总数é‡å’Œå­—节数,æ¯ä¸ªæ–¹æ¡ˆè¢«æˆåŠŸåº”用的区域的两个数字,以åŠ
+超过é…é¢é™åˆ¶çš„总数é‡ã€‚这些统计数æ®å¯ç”¨äºŽåœ¨çº¿åˆ†æžæˆ–调整方案。
+
+å¯ä»¥é€šè¿‡è¯»å– ``stats`` 目录下的文件(``nr_tried``, ``sz_tried``, ``nr_applied``,
+``sz_applied``, å’Œ ``qt_exceeds``))分别检索这些统计数æ®ã€‚这些文件ä¸æ˜¯å®žæ—¶æ›´æ–°çš„,所以
+你应该è¦æ±‚DAMON sysfs接å£é€šè¿‡åœ¨ç›¸å…³çš„ ``kdamonds/<N>/state`` 文件中写入一个特殊的关键字
+``update_schemes_stats`` æ¥æ›´æ–°ç»Ÿè®¡ä¿¡æ¯çš„文件内容。
+
+用例
+~~~~
+
+下é¢çš„命令应用了一个方案:â€å¦‚果一个大å°ä¸º[4KiB, 8KiB]的内存区域在[10, 20]çš„èšåˆæ—¶é—´é—´éš”内
+显示出æ¯ä¸€ä¸ªèšåˆæ—¶é—´é—´éš”[0, 5]的访问é‡ï¼Œè¯·åˆ†é¡µè¯¥åŒºåŸŸã€‚对于分页,æ¯ç§’最多åªèƒ½ä½¿ç”¨10ms,而且æ¯
+秒分页ä¸èƒ½è¶…过1GiB。在这一é™åˆ¶ä¸‹ï¼Œé¦–先分页出具有较长年龄的内存区域。å¦å¤–,æ¯5秒钟检查一次系统
+çš„å¯ç”¨å†…存率,当å¯ç”¨å†…存率低于50%时开始监测和分页,但如果å¯ç”¨å†…存率大于60%,或低于30%,则åœ
+止监测。“ ::
+
+ # cd <sysfs>/kernel/mm/damon/admin
+ # # populate directories
+ # echo 1 > kdamonds/nr_kdamonds; echo 1 > kdamonds/0/contexts/nr_contexts;
+ # echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes
+ # cd kdamonds/0/contexts/0/schemes/0
+ # # set the basic access pattern and the action
+ # echo 4096 > access_pattern/sz/min
+ # echo 8192 > access_pattern/sz/max
+ # echo 0 > access_pattern/nr_accesses/min
+ # echo 5 > access_pattern/nr_accesses/max
+ # echo 10 > access_pattern/age/min
+ # echo 20 > access_pattern/age/max
+ # echo pageout > action
+ # # set quotas
+ # echo 10 > quotas/ms
+ # echo $((1024*1024*1024)) > quotas/bytes
+ # echo 1000 > quotas/reset_interval_ms
+ # # set watermark
+ # echo free_mem_rate > watermarks/metric
+ # echo 5000000 > watermarks/interval_us
+ # echo 600 > watermarks/high
+ # echo 500 > watermarks/mid
+ # echo 300 > watermarks/low
+
+请注æ„,我们强烈建议使用用户空间的工具,如 `damo <https://github.com/awslabs/damo>`_ ,
+而ä¸æ˜¯åƒä¸Šé¢é‚£æ ·æ‰‹åŠ¨è¯»å†™æ–‡ä»¶ã€‚以上åªæ˜¯ä¸€ä¸ªä¾‹å­ã€‚
debugfs接å£
===========
@@ -46,7 +317,7 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
属性
----
-用户å¯ä»¥é€šè¿‡è¯»å–和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 〠``èšé›†é—´éš”`` 〠``区域更新间隔``
+用户å¯ä»¥é€šè¿‡è¯»å–和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 〠``èšé›†é—´éš”`` 〠``æ›´æ–°é—´éš”``
以åŠç›‘测目标区域的最å°/最大数é‡ã€‚è¦è¯¦ç»†äº†è§£ç›‘测属性,请å‚考 `:doc:/vm/damon/design` 。例如,
下é¢çš„命令将这些值设置为5msã€100msã€1000msã€10å’Œ1000,然åŽå†æ¬¡æ£€æŸ¥::
@@ -108,8 +379,8 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``,
1 20 40
1 50 100" > init_regions
-请注æ„,这åªæ˜¯è®¾ç½®äº†åˆå§‹çš„监测目标区域。在虚拟内存监测的情况下,DAMON会在一个 ``区域更新间隔``
-åŽè‡ªåŠ¨æ›´æ–°åŒºåŸŸçš„边界。因此,在这ç§æƒ…况下,如果用户ä¸å¸Œæœ›æ›´æ–°çš„è¯ï¼Œåº”该把 ``区域的更新间隔`` 设
+请注æ„,这åªæ˜¯è®¾ç½®äº†åˆå§‹çš„监测目标区域。在虚拟内存监测的情况下,DAMON会在一个 ``æ›´æ–°é—´éš”``
+åŽè‡ªåŠ¨æ›´æ–°åŒºåŸŸçš„边界。因此,在这ç§æƒ…况下,如果用户ä¸å¸Œæœ›æ›´æ–°çš„è¯ï¼Œåº”该把 ``æ›´æ–°é—´éš”`` 设
置得足够大。
diff --git a/Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst b/Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst
new file mode 100644
index 000000000000..17b5ce85a90c
--- /dev/null
+++ b/Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst
@@ -0,0 +1,167 @@
+.. highlight:: none
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/dev-tools/gdb-kernel-debugging.rst
+:Translator: 高超 gao chao <gaochao49@huawei.com>
+
+通过gdb调试内核和模å—
+=====================
+
+Kgdb内核调试器ã€QEMU等虚拟机管ç†ç¨‹åºæˆ–基于JTAG的硬件接å£ï¼Œæ”¯æŒåœ¨è¿è¡Œæ—¶ä½¿ç”¨gdb
+调试Linux内核åŠå…¶æ¨¡å—。Gdbæ供了一个强大的python脚本接å£ï¼Œå†…核也æ供了一套
+辅助脚本以简化典型的内核调试步骤。本文档为如何å¯ç”¨å’Œä½¿ç”¨è¿™äº›è„šæœ¬æ供了一个简è¦çš„教程。
+此教程基于QEMU/KVM虚拟机,但文中示例也适用于其他gdb stub。
+
+
+环境é…ç½®è¦æ±‚
+------------
+
+- gdb 7.2+ (推è版本: 7.4+) 且开å¯pythonæ”¯æŒ (通常å‘行版上都已支æŒ)
+
+设置
+----
+
+- 创建一个QEMU/KVMçš„linux虚拟机(详情请å‚考 www.linux-kvm.org å’Œ www.qemu.org )。
+ 对于交å‰å¼€å‘,https://landley.net/aboriginal/bin æ供了一些镜åƒå’Œå·¥å…·é“¾ï¼Œ
+ å¯ä»¥å¸®åŠ©æ­å»ºäº¤å‰å¼€å‘环境。
+
+- 编译内核时开å¯CONFIG_GDB_SCRIPTS,关闭CONFIG_DEBUG_INFO_REDUCED。
+ 如果架构支æŒCONFIG_FRAME_POINTER,请ä¿æŒå¼€å¯ã€‚
+
+- 在guest环境上安装该内核。如有必è¦ï¼Œé€šè¿‡åœ¨å†…æ ¸command line中添加“nokaslrâ€æ¥å…³é—­KASLR。
+ 此外,QEMUå…许通过-kernelã€-appendã€-initrd这些命令行选项直接å¯åŠ¨å†…核。
+ 但这通常仅在ä¸ä¾èµ–内核模å—æ—¶æ‰æœ‰æ•ˆã€‚有关此模å¼çš„更多详细信æ¯ï¼Œè¯·å‚阅QEMU文档。
+ 在这ç§æƒ…况下,如果架构支æŒKASLR,应该在ç¦ç”¨CONFIG_RANDOMIZE_BASE的情况下构建内核。
+
+- å¯ç”¨QEMU/KVMçš„gdb stub,å¯ä»¥é€šè¿‡å¦‚下方å¼å®žçŽ°
+
+ - 在VMå¯åŠ¨æ—¶ï¼Œé€šè¿‡åœ¨QEMU命令行中添加“-sâ€å‚æ•°
+
+ 或
+
+ - 在è¿è¡Œæ—¶é€šè¿‡ä»ŽQEMU监视控制å°å‘é€â€œgdbserverâ€
+
+- 切æ¢åˆ°/path/to/linux-build(内核æºç ç¼–译)目录
+
+- å¯åŠ¨gdb:gdb vmlinux
+
+ 注æ„:æŸäº›å‘行版å¯èƒ½ä¼šå°†gdb脚本的自动加载é™åˆ¶åœ¨å·²çŸ¥çš„安全目录中。
+ 如果gdb报告拒ç»åŠ è½½vmlinux-gdb.py(相关命令找ä¸åˆ°ï¼‰ï¼Œè¯·å°†::
+
+ add-auto-load-safe-path /path/to/linux-build
+
+ 添加到~/.gdbinit。更多详细信æ¯ï¼Œè¯·å‚阅gdb帮助信æ¯ã€‚
+
+- 连接到已å¯åŠ¨çš„guest环境::
+
+ (gdb) target remote :1234
+
+
+使用Linuxæ供的gdb脚本的示例
+----------------------------
+
+- 加载模å—(以åŠä¸»å†…核)符å·::
+
+ (gdb) lx-symbols
+ loading vmlinux
+ scanning for modules in /home/user/linux/build
+ loading @0xffffffffa0020000: /home/user/linux/build/net/netfilter/xt_tcpudp.ko
+ loading @0xffffffffa0016000: /home/user/linux/build/net/netfilter/xt_pkttype.ko
+ loading @0xffffffffa0002000: /home/user/linux/build/net/netfilter/xt_limit.ko
+ loading @0xffffffffa00ca000: /home/user/linux/build/net/packet/af_packet.ko
+ loading @0xffffffffa003c000: /home/user/linux/build/fs/fuse/fuse.ko
+ ...
+ loading @0xffffffffa0000000: /home/user/linux/build/drivers/ata/ata_generic.ko
+
+- 对一些尚未加载的模å—中的函数函数设置断点,例如::
+
+ (gdb) b btrfs_init_sysfs
+ Function "btrfs_init_sysfs" not defined.
+ Make breakpoint pending on future shared library load? (y or [n]) y
+ Breakpoint 1 (btrfs_init_sysfs) pending.
+
+- 继续执行::
+
+ (gdb) c
+
+- 加载模å—并且能观察到正在加载的符å·ä»¥åŠæ–­ç‚¹å‘½ä¸­::
+
+ loading @0xffffffffa0034000: /home/user/linux/build/lib/libcrc32c.ko
+ loading @0xffffffffa0050000: /home/user/linux/build/lib/lzo/lzo_compress.ko
+ loading @0xffffffffa006e000: /home/user/linux/build/lib/zlib_deflate/zlib_deflate.ko
+ loading @0xffffffffa01b1000: /home/user/linux/build/fs/btrfs/btrfs.ko
+
+ Breakpoint 1, btrfs_init_sysfs () at /home/user/linux/fs/btrfs/sysfs.c:36
+ 36 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+
+- 查看内核的日志缓冲区::
+
+ (gdb) lx-dmesg
+ [ 0.000000] Initializing cgroup subsys cpuset
+ [ 0.000000] Initializing cgroup subsys cpu
+ [ 0.000000] Linux version 3.8.0-rc4-dbg+ (...
+ [ 0.000000] Command line: root=/dev/sda2 resume=/dev/sda1 vga=0x314
+ [ 0.000000] e820: BIOS-provided physical RAM map:
+ [ 0.000000] BIOS-e820: [mem 0x0000000000000000-0x000000000009fbff] usable
+ [ 0.000000] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved
+ ....
+
+- 查看当å‰task struct结构体的字段(仅x86å’Œarm64支æŒï¼‰::
+
+ (gdb) p $lx_current().pid
+ $1 = 4998
+ (gdb) p $lx_current().comm
+ $2 = "modprobe\000\000\000\000\000\000\000"
+
+- 对当å‰æˆ–指定的CPU使用per-cpu函数::
+
+ (gdb) p $lx_per_cpu("runqueues").nr_running
+ $3 = 1
+ (gdb) p $lx_per_cpu("runqueues", 2).nr_running
+ $4 = 0
+
+- 使用container_of查看更多hrtimersä¿¡æ¯::
+
+ (gdb) set $next = $lx_per_cpu("hrtimer_bases").clock_base[0].active.next
+ (gdb) p *$container_of($next, "struct hrtimer", "node")
+ $5 = {
+ node = {
+ node = {
+ __rb_parent_color = 18446612133355256072,
+ rb_right = 0x0 <irq_stack_union>,
+ rb_left = 0x0 <irq_stack_union>
+ },
+ expires = {
+ tv64 = 1835268000000
+ }
+ },
+ _softexpires = {
+ tv64 = 1835268000000
+ },
+ function = 0xffffffff81078232 <tick_sched_timer>,
+ base = 0xffff88003fd0d6f0,
+ state = 1,
+ start_pid = 0,
+ start_site = 0xffffffff81055c1f <hrtimer_start_range_ns+20>,
+ start_comm = "swapper/2\000\000\000\000\000\000"
+ }
+
+
+命令和辅助调试功能列表
+----------------------
+
+命令和辅助调试功能å¯èƒ½ä¼šéšç€æ—¶é—´çš„推移而改进,此文显示的是åˆå§‹ç‰ˆæœ¬çš„部分示例::
+
+ (gdb) apropos lx
+ function lx_current -- Return current task
+ function lx_module -- Find module by name and return the module variable
+ function lx_per_cpu -- Return per-cpu variable
+ function lx_task_by_pid -- Find Linux task by PID and return the task_struct variable
+ function lx_thread_info -- Calculate Linux thread_info from task variable
+ lx-dmesg -- Print Linux kernel log buffer
+ lx-lsmod -- List currently loaded modules
+ lx-symbols -- (Re-)load symbols of Linux kernel and currently loaded modules
+
+å¯ä»¥é€šè¿‡â€œhelp <command-name>â€æˆ–“help function <function-name>â€å‘½ä»¤
+获å–指定命令或指定调试功能的更多详细信æ¯ã€‚
diff --git a/Documentation/translations/zh_CN/dev-tools/index.rst b/Documentation/translations/zh_CN/dev-tools/index.rst
index 77a8c44cdf49..02577c379007 100644
--- a/Documentation/translations/zh_CN/dev-tools/index.rst
+++ b/Documentation/translations/zh_CN/dev-tools/index.rst
@@ -25,6 +25,7 @@ Documentation/translations/zh_CN/dev-tools/testing-overview.rst
sparse
gcov
kasan
+ gdb-kernel-debugging
Todolist:
@@ -34,7 +35,6 @@ Todolist:
- kmemleak
- kcsan
- kfence
- - gdb-kernel-debugging
- kgdb
- kselftest
- kunit/index
diff --git a/Documentation/translations/zh_CN/devicetree/usage-model.rst b/Documentation/translations/zh_CN/devicetree/usage-model.rst
index 318a3c6a0114..accdc33475a0 100644
--- a/Documentation/translations/zh_CN/devicetree/usage-model.rst
+++ b/Documentation/translations/zh_CN/devicetree/usage-model.rst
@@ -120,24 +120,24 @@ dt_compat列表(如果你好奇,该列表定义在arch/arm/include/asm/mach/
表示什么。在Documentation/devicetree/bindings中添加兼容字符串的文档。
åŒæ ·åœ¨ARM上,对于æ¯ä¸ªmachine_desc,内核会查看是å¦æœ‰ä»»ä½•dt_compat列表æ¡
-目出现在兼容属性中。如果有,那么该机器_desc就是驱动该机器的候选者。在æœç´¢
+目出现在兼容属性中。如果有,那么该machine_desc就是驱动该机器的候选者。在æœç´¢
了整个machine_descs表之åŽï¼Œsetup_machine_fdt()æ ¹æ®æ¯ä¸ªmachine_desc
在兼容属性中匹é…çš„æ¡ç›®ï¼Œè¿”回 “最兼容†的machine_desc。如果没有找到匹é…
的machine_desc,那么它将返回NULL。
这个方案背åŽçš„原因是观察到,在大多数情况下,如果它们都使用相åŒçš„SoC或相åŒ
-系列的SoC,一个机器_descå¯ä»¥æ”¯æŒå¤§é‡çš„电路æ¿ã€‚然而,ä¸å¯é¿å…地会有一些例
+系列的SoC,一个machine_descå¯ä»¥æ”¯æŒå¤§é‡çš„电路æ¿ã€‚然而,ä¸å¯é¿å…地会有一些例
外情况,å³ç‰¹å®šçš„æ¿å­éœ€è¦ç‰¹æ®Šçš„设置代ç ï¼Œè¿™åœ¨ä¸€èˆ¬æƒ…况下是没有用的。特殊情况
å¯ä»¥é€šè¿‡åœ¨é€šç”¨è®¾ç½®ä»£ç ä¸­æ˜Žç¡®æ£€æŸ¥æœ‰é—®é¢˜çš„æ¿å­æ¥å¤„ç†ï¼Œä½†å¦‚果超过几个情况下,
这样åšå¾ˆå¿«å°±ä¼šå˜å¾—很难看和/或无法维护。
-相å,兼容列表å…许通用机器_desc通过在dt_compat列表中指定“ä¸å¤ªå…¼å®¹â€çš„值
+相å,兼容列表å…许通用machine_desc通过在dt_compat列表中指定“ä¸å¤ªå…¼å®¹â€çš„值
æ¥æ供对广泛的通用æ¿çš„支æŒã€‚在上é¢çš„例å­ä¸­ï¼Œé€šç”¨æ¿æ”¯æŒå¯ä»¥å£°ç§°ä¸Žâ€œti,ompa3â€
或“ti,ompa3450â€å…¼å®¹ã€‚如果在最åˆçš„beagleboard上å‘现了一个bug,需è¦åœ¨
早期å¯åŠ¨æ—¶ä½¿ç”¨ç‰¹æ®Šçš„å˜é€šä»£ç ï¼Œé‚£ä¹ˆå¯ä»¥æ·»åŠ ä¸€ä¸ªæ–°çš„machine_desc,实现å˜é€šï¼Œ
并且åªåœ¨â€œti,omap3-beagleboardâ€ä¸ŠåŒ¹é…。
-PowerPC使用了一个ç¨å¾®ä¸åŒçš„方案,它从æ¯ä¸ªæœºå™¨_desc中调用.probe()é’©å­ï¼Œ
+PowerPC使用了一个ç¨å¾®ä¸åŒçš„方案,它从æ¯ä¸ªmachine_desc中调用.probe()é’©å­ï¼Œ
并使用第一个返回TRUEçš„é’©å­ã€‚然而,这ç§æ–¹æ³•æ²¡æœ‰è€ƒè™‘到兼容列表的优先级,对于
新的架构支æŒå¯èƒ½åº”该é¿å…。
diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst
index 88d8df957a78..ac32d8e306ac 100644
--- a/Documentation/translations/zh_CN/index.rst
+++ b/Documentation/translations/zh_CN/index.rst
@@ -108,6 +108,7 @@ TODOList:
:maxdepth: 2
core-api/index
+ locking/index
accounting/index
cpu-freq/index
iio/index
@@ -123,7 +124,6 @@ TODOList:
TODOList:
* driver-api/index
-* locking/index
* block/index
* cdrom/index
* ide/index
diff --git a/Documentation/translations/zh_CN/locking/index.rst b/Documentation/translations/zh_CN/locking/index.rst
new file mode 100644
index 000000000000..700df8a2bb70
--- /dev/null
+++ b/Documentation/translations/zh_CN/locking/index.rst
@@ -0,0 +1,42 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/locking/index.rst
+
+:翻译:
+
+ å”艺舟 Tang Yizhou <tangyeechou@gmail.com>
+
+==
+é”
+==
+
+.. toctree::
+ :maxdepth: 1
+
+TODOList:
+
+ * locktypes
+ * lockdep-design
+ * lockstat
+ * locktorture
+ * mutex-design
+ * rt-mutex-design
+ * rt-mutex
+ * seqlock
+ * spinlocks
+ * ww-mutex-design
+ * preempt-locking
+ * pi-futex
+ * futex-requeue-pi
+ * hwspinlock
+ * percpu-rw-semaphore
+ * robust-futexes
+ * robust-futex-ABI
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/translations/zh_CN/locking/spinlocks.rst b/Documentation/translations/zh_CN/locking/spinlocks.rst
new file mode 100644
index 000000000000..2017c01f0a4b
--- /dev/null
+++ b/Documentation/translations/zh_CN/locking/spinlocks.rst
@@ -0,0 +1,149 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/locking/spinlocks.rst
+
+:翻译:
+
+ å”艺舟 Tang Yizhou <tangyeechou@gmail.com>
+
+==========
+加é”的教训
+==========
+
+教训 1:自旋é”
+==============
+
+加é”最基本的原语是自旋é”(spinlock)::
+
+ static DEFINE_SPINLOCK(xxx_lock);
+
+ unsigned long flags;
+
+ spin_lock_irqsave(&xxx_lock, flags);
+ ... 这里是临界区 ..
+ spin_unlock_irqrestore(&xxx_lock, flags);
+
+上述代ç æ€»æ˜¯å®‰å…¨çš„。自旋é”将在 _本地_ ç¦ç”¨ä¸­æ–­ï¼Œä½†å®ƒæœ¬èº«å°†ä¿è¯å…¨å±€é”定。所以它
+å°†ä¿è¯åœ¨è¯¥é”ä¿æŠ¤çš„区域内åªæœ‰ä¸€ä¸ªæŽ§åˆ¶çº¿ç¨‹ã€‚å³ä½¿åœ¨å•å¤„ç†å™¨ï¼ˆUP)下也能很好的工作,
+æ‰€ä»¥ä»£ç  _ä¸_ 需è¦æ‹…心UP还是SMP的问题:自旋é”在两ç§æƒ…况下都能正常工作。
+
+ 注æ„ï¼è‡ªæ—‹é”对内存的潜在影å“由下述文档进一步æ述:
+
+ Documentation/memory-barriers.txt
+
+ (5) ACQUIRE operations.
+
+ (6) RELEASE operations.
+
+上述代ç é€šå¸¸éžå¸¸ç®€å•ï¼ˆå¯¹å¤§éƒ¨åˆ†æƒ…况,你通常需è¦å¹¶ä¸”åªå¸Œæœ›æœ‰ä¸€ä¸ªè‡ªæ—‹é”——使用多个
+自旋é”会使事情å˜å¾—æ›´å¤æ‚,甚至更慢,而且通常仅仅在你 **ç†è§£çš„** åºåˆ—有被拆分的
+需求时æ‰å€¼å¾—这么åšï¼šå¦‚果你ä¸ç¡®å®šçš„è¯ï¼Œè¯·ä¸æƒœä¸€åˆ‡ä»£ä»·é¿å…这样åšï¼‰ã€‚
+
+这是关于自旋é”的唯一真正困难的部分:一旦你开始使用自旋é”,它们往往会扩展到你以å‰
+å¯èƒ½æ²¡æœ‰æ³¨æ„到的领域,因为你必须确ä¿è‡ªæ—‹é”正确地ä¿æŠ¤å…±äº«æ•°æ®ç»“æž„ **æ¯ä¸€å¤„** 被
+使用的地方。自旋é”是最容易被添加到完全独立于其它代ç çš„地方(例如,没有人访问的
+内部驱动数æ®ç»“构)的。
+
+ 注æ„ï¼ä»…当你在跨CPU核访问时使用 **åŒä¸€æŠŠ** 自旋é”,对它的使用æ‰æ˜¯å®‰å…¨çš„。
+ è¿™æ„味ç€æ‰€æœ‰è®¿é—®å…±äº«å˜é‡çš„代ç å¿…须对它们想使用的自旋é”è¾¾æˆä¸€è‡´ã€‚
+
+----
+
+教训 2:读-写自旋é”
+===================
+
+如果你的数æ®è®¿é—®æœ‰ä¸€ä¸ªéžå¸¸è‡ªç„¶çš„模å¼ï¼Œå€¾å‘于从共享å˜é‡ä¸­è¯»å–æ•°æ®ï¼Œè¯»-写自旋é”
+(rw_lock)有时是有用的。它们å…许多个读者åŒæ—¶å‡ºçŽ°åœ¨åŒä¸€ä¸ªä¸´ç•ŒåŒºï¼Œä½†æ˜¯å¦‚果有人想
+改å˜å˜é‡ï¼Œå®ƒå¿…须获得一个独å çš„写é”。
+
+ 注æ„ï¼è¯»-写自旋é”比原始自旋é”需è¦æ›´å¤šçš„原å­å†…å­˜æ“作。除éžè¯»è€…的临界区很长,
+ å¦åˆ™ä½ æœ€å¥½åªä½¿ç”¨åŽŸå§‹è‡ªæ—‹é”。
+
+例程看起æ¥å’Œä¸Šé¢ä¸€æ ·::
+
+ rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
+
+ unsigned long flags;
+
+ read_lock_irqsave(&xxx_lock, flags);
+ .. 仅读å–ä¿¡æ¯çš„临界区 ...
+ read_unlock_irqrestore(&xxx_lock, flags);
+
+ write_lock_irqsave(&xxx_lock, flags);
+ .. 读å–和独å å†™ä¿¡æ¯ ...
+ write_unlock_irqrestore(&xxx_lock, flags);
+
+上é¢è¿™ç§é”对于å¤æ‚çš„æ•°æ®ç»“构如链表å¯èƒ½ä¼šæœ‰ç”¨ï¼Œç‰¹åˆ«æ˜¯åœ¨ä¸æ”¹å˜é“¾è¡¨çš„情况下æœç´¢å…¶ä¸­
+çš„æ¡ç›®ã€‚读é”å…许许多并å‘的读者。任何希望 **修改** 链表的代ç å°†å¿…须先获å–写é”。
+
+ 注æ„ï¼RCUé”更适åˆé历链表,但需è¦ä»”细注æ„设计细节(è§Documentation/RCU/listRCU.rst)。
+
+å¦å¤–,你ä¸èƒ½æŠŠè¯»é”“å‡çº§â€ä¸ºå†™é”,所以如果你在 _任何_ 时候需è¦åšä»»ä½•ä¿®æ”¹
+(å³ä½¿ä½ ä¸æ˜¯æ¯æ¬¡éƒ½è¿™æ ·åšï¼‰ï¼Œä½ å¿…须在一开始就获得写é”。
+
+ 注æ„ï¼æˆ‘们正在努力消除大多数情况下的读-写自旋é”的使用,所以请ä¸è¦åœ¨æ²¡æœ‰è¾¾æˆ
+ 共识的情况下增加一个新的(相å,请å‚阅Documentation/RCU/rcu.rst以获得完整
+ ä¿¡æ¯ï¼‰ã€‚
+
+----
+
+教训 3:é‡æ–°å®¡è§†è‡ªæ—‹é”
+======================
+
+上述的自旋é”原语ç»ä¸æ˜¯å”¯ä¸€çš„。它们是最安全的,在所有情况下都能正常工作,但部分
+**因为** 它们是安全的,它们也是相当慢的。它们比原本需è¦çš„更慢,因为它们必须è¦
+ç¦ç”¨ä¸­æ–­ï¼ˆåœ¨X86上åªæ˜¯ä¸€æ¡æŒ‡ä»¤ï¼Œä½†å´æ˜¯ä¸€æ¡æ˜‚贵的指令——而在其他体系结构上,情况
+å¯èƒ½æ›´ç³Ÿï¼‰ã€‚
+
+如果你有必须ä¿æŠ¤è·¨CPU访问的数æ®ç»“构且你想使用自旋é”的场景,你有å¯èƒ½ä½¿ç”¨ä»£ä»·å°çš„
+自旋é”版本。当且仅当你知é“æŸè‡ªæ—‹é”永远ä¸ä¼šåœ¨ä¸­æ–­å¤„ç†ç¨‹åºä¸­ä½¿ç”¨ï¼Œä½ å¯ä»¥ä½¿ç”¨éžä¸­æ–­
+的版本::
+
+ spin_lock(&lock);
+ ...
+ spin_unlock(&lock);
+
+(当然,也å¯ä»¥ä½¿ç”¨ç›¸åº”的读-写é”版本)。这ç§è‡ªæ—‹é”å°†åŒæ ·å¯ä»¥ä¿è¯ç‹¬å è®¿é—®ï¼Œè€Œä¸”
+速度会快很多。如果你知é“有关的数æ®åªåœ¨â€œè¿›ç¨‹ä¸Šä¸‹æ–‡â€ä¸­è¢«å­˜å–,å³ï¼Œä¸æ¶‰åŠä¸­æ–­ï¼Œ
+è¿™ç§è‡ªæ—‹é”就有用了。
+
+当这些版本的自旋é”涉åŠä¸­æ–­æ—¶ï¼Œä½ ä¸èƒ½ä½¿ç”¨çš„原因是会陷入死é”::
+
+ spin_lock(&lock);
+ ...
+ <- 中断æ¥ä¸´ï¼š
+ spin_lock(&lock);
+
+一个中断试图对一个已ç»é”定的å˜é‡ä¸Šé”。如果中断å‘生在å¦ä¸€ä¸ªCPU上,ä¸ä¼šæœ‰é—®é¢˜ï¼›
+但如果中断å‘生在已ç»æŒæœ‰è‡ªæ—‹é”çš„åŒä¸€ä¸ªCPU上,将 _会_ 有问题,因为该é”显然永远
+ä¸ä¼šè¢«é‡Šæ”¾ï¼ˆå› ä¸ºä¸­æ–­æ­£åœ¨ç­‰å¾…该é”,而é”çš„æŒæœ‰è€…被中断打断,并且无法继续执行,
+直到中断处ç†ç»“æŸï¼‰ã€‚
+
+(这也是自旋é”的中断版本åªéœ€è¦ç¦ç”¨ _本地_ 中断的原因——在å‘生于其它CPU的中断中
+使用åŒä¸€æŠŠè‡ªæ—‹é”是没问题的,因为å‘生于其它CPU的中断ä¸ä¼šæ‰“æ–­å·²ç»æŒé”çš„CPU,所以
+é”çš„æŒæœ‰è€…å¯ä»¥ç»§ç»­æ‰§è¡Œå¹¶æœ€ç»ˆé‡Šæ”¾é”)。
+
+ Linus
+
+----
+
+å‚考信æ¯
+========
+
+对于动æ€åˆå§‹åŒ–,使用spin_lock_init()或rwlock_init()是åˆé€‚çš„::
+
+ spinlock_t xxx_lock;
+ rwlock_t xxx_rw_lock;
+
+ static int __init xxx_init(void)
+ {
+ spin_lock_init(&xxx_lock);
+ rwlock_init(&xxx_rw_lock);
+ ...
+ }
+
+ module_init(xxx_init);
+
+对于é™æ€åˆå§‹åŒ–,使用DEFINE_SPINLOCK() / DEFINE_RWLOCK()或
+__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED()是åˆé€‚的。
diff --git a/Documentation/translations/zh_CN/process/howto.rst b/Documentation/translations/zh_CN/process/howto.rst
index 2903d7161bc8..1334cdb32a3c 100644
--- a/Documentation/translations/zh_CN/process/howto.rst
+++ b/Documentation/translations/zh_CN/process/howto.rst
@@ -252,7 +252,7 @@ Linux-next 集æˆæµ‹è¯•æ ‘
在将å­ç³»ç»Ÿæ ‘çš„æ›´æ–°åˆå¹¶åˆ°ä¸»çº¿æ ‘之å‰ï¼Œéœ€è¦å¯¹å®ƒä»¬è¿›è¡Œé›†æˆæµ‹è¯•ã€‚为此,存在一个
特殊的测试存储库,其中几乎æ¯å¤©éƒ½ä¼šæå–所有å­ç³»ç»Ÿæ ‘:
- https://git.kernel.org/?p=linux/kernel/git/next/linux-next.git
+ https://git.kernel.org/?p=linux/kernel/git/next/linux-next.git
通过这ç§æ–¹å¼ï¼ŒLinux-next 对下一个åˆå¹¶é˜¶æ®µå°†è¿›å…¥ä¸»çº¿å†…核的内容给出了一个概è¦
展望。éžå¸¸æ¬¢å†’险的测试者è¿è¡Œæµ‹è¯•Linux-next。
diff --git a/Documentation/translations/zh_CN/scheduler/index.rst b/Documentation/translations/zh_CN/scheduler/index.rst
index 12bf3bd02ccf..a8eaa7325f54 100644
--- a/Documentation/translations/zh_CN/scheduler/index.rst
+++ b/Documentation/translations/zh_CN/scheduler/index.rst
@@ -25,8 +25,10 @@ Linux调度器
sched-domains
sched-capacity
sched-energy
+ schedutil
sched-nice-design
sched-stats
+ sched-debug
TODOList:
diff --git a/Documentation/translations/zh_CN/scheduler/sched-debug.rst b/Documentation/translations/zh_CN/scheduler/sched-debug.rst
new file mode 100644
index 000000000000..5e17740c2bf3
--- /dev/null
+++ b/Documentation/translations/zh_CN/scheduler/sched-debug.rst
@@ -0,0 +1,51 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/scheduler/sched-debug.rst
+
+:翻译:
+
+ å”艺舟 Tang Yizhou <tangyeechou@gmail.com>
+
+=============
+调度器debugfs
+=============
+
+用é…置项CONFIG_SCHED_DEBUG=yå¯åŠ¨å†…æ ¸åŽï¼Œå°†å¯ä»¥è®¿é—®/sys/kernel/debug/sched
+下的调度器专用调试文件。其中一些文件æ述如下。
+
+numa_balancing
+==============
+
+`numa_balancing` 目录用æ¥å­˜æ”¾æŽ§åˆ¶éžç»Ÿä¸€å†…存访问(NUMA)平衡特性的相关文件。
+如果该特性导致系统负载太高,那么å¯ä»¥é€šè¿‡ `scan_period_min_ms, scan_delay_ms,
+scan_period_max_ms, scan_size_mb` 文件控制NUMA缺页的内核采样速率。
+
+
+scan_period_min_ms, scan_delay_ms, scan_period_max_ms, scan_size_mb
+-------------------------------------------------------------------
+
+自动NUMA平衡会扫æ任务地å€ç©ºé—´ï¼Œæ£€æµ‹é¡µé¢æ˜¯å¦è¢«æ­£ç¡®æ”¾ç½®ï¼Œæˆ–者数æ®æ˜¯å¦åº”该被
+è¿ç§»åˆ°ä»»åŠ¡æ­£åœ¨è¿è¡Œçš„本地内存结点,此时需解映射页é¢ã€‚æ¯ä¸ªâ€œæ‰«æ延迟â€ï¼ˆscan delay)
+时间之åŽï¼Œä»»åŠ¡æ‰«æ其地å€ç©ºé—´ä¸­ä¸‹ä¸€æ‰¹â€œæ‰«æ大å°â€ï¼ˆscan size)个页é¢ã€‚若抵达
+内存地å€ç©ºé—´æœ«å°¾ï¼Œæ‰«æ器将从头开始é‡æ–°æ‰«æ。
+
+结åˆæ¥çœ‹ï¼Œâ€œæ‰«æ延迟â€å’Œâ€œæ‰«æ大å°â€å†³å®šæ‰«æ速率。当“扫æ延迟â€å‡å°æ—¶ï¼Œæ‰«æ速率
+增加。“扫æ延迟â€å’Œæ¯ä¸ªä»»åŠ¡çš„扫æ速率都是自适应的,且ä¾èµ–历å²è¡Œä¸ºã€‚如果页é¢è¢«
+正确放置,那么扫æ延迟就会增加;å¦åˆ™æ‰«æ延迟就会å‡å°‘。“扫æ大å°â€ä¸æ˜¯è‡ªé€‚应的,
+“扫æ大å°â€è¶Šå¤§ï¼Œæ‰«æ速率越高。
+
+更高的扫æ速率会产生更高的系统开销,因为必须æ•èŽ·ç¼ºé¡µå¼‚常,并且潜在地必须è¿ç§»
+æ•°æ®ã€‚然而,当扫æ速率越高,若工作负载模å¼å‘生å˜åŒ–,任务的内存将越快地è¿ç§»åˆ°
+本地结点,由于远程内存访问而产生的性能影å“å°†é™åˆ°æœ€ä½Žã€‚下é¢è¿™äº›æ–‡ä»¶æŽ§åˆ¶æ‰«æ延迟
+的阈值和被扫æ的页é¢æ•°é‡ã€‚
+
+``scan_period_min_ms`` 是扫æ一个任务虚拟内存的最å°æ—¶é—´ï¼Œå•ä½æ˜¯æ¯«ç§’。它有效地
+控制了æ¯ä¸ªä»»åŠ¡çš„最大扫æ速率。
+
+``scan_delay_ms`` 是一个任务åˆå§‹åŒ–创建(fork)时,第一次使用的“扫æ延迟â€ã€‚
+
+``scan_period_max_ms`` 是扫æ一个任务虚拟内存的最大时间,å•ä½æ˜¯æ¯«ç§’。它有效地
+控制了æ¯ä¸ªä»»åŠ¡çš„最å°æ‰«æ速率。
+
+``scan_size_mb`` 是一次特定的扫æ中,è¦æ‰«æ多少兆字节(MB)对应的页é¢æ•°ã€‚
diff --git a/Documentation/translations/zh_CN/scheduler/schedutil.rst b/Documentation/translations/zh_CN/scheduler/schedutil.rst
new file mode 100644
index 000000000000..d1ea68007520
--- /dev/null
+++ b/Documentation/translations/zh_CN/scheduler/schedutil.rst
@@ -0,0 +1,165 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/scheduler/schedutil.rst
+
+:翻译:
+
+ å”艺舟 Tang Yizhou <tangyeechou@gmail.com>
+
+=========
+Schedutil
+=========
+
+.. note::
+
+ 本文所有内容都å‡è®¾é¢‘率和工作算力之间存在线性关系。我们知é“这是有瑕疵的,
+ 但这是最å¯è¡Œçš„近似处ç†ã€‚
+
+PELT(实体负载跟踪,Per Entity Load Tracking)
+==============================================
+
+通过PELT,我们跟踪了å„ç§è°ƒåº¦å™¨å®žä½“的一些指标,从å•ä¸ªä»»åŠ¡åˆ°ä»»åŠ¡ç»„分片到CPU
+è¿è¡Œé˜Ÿåˆ—。我们使用指数加æƒç§»åŠ¨å¹³å‡æ•°ï¼ˆExponentially Weighted Moving Average,
+EWMA)作为其基础,æ¯ä¸ªå‘¨æœŸï¼ˆ1024us)都会衰å‡ï¼Œè¡°å‡é€ŸçŽ‡æ»¡è¶³y^32 = 0.5。
+也就是说,最近的32ms贡献负载的一åŠï¼Œè€ŒåŽ†å²ä¸Šçš„其它时间则贡献å¦ä¸€åŠã€‚
+
+具体而言:
+
+ ewma_sum(u) := u_0 + u_1*y + u_2*y^2 + ...
+
+ ewma(u) = ewma_sum(u) / ewma_sum(1)
+
+由于这本质上是一个无é™å‡ ä½•çº§æ•°çš„累加,结果是å¯ç»„åˆçš„,å³ewma(A) + ewma(B) = ewma(A+B)。
+这个属性是关键,因为它æ供了在任务è¿ç§»æ—¶é‡æ–°ç»„åˆå¹³å‡æ•°çš„能力。
+
+请注æ„,阻塞æ€çš„任务ä»ç„¶å¯¹ç´¯åŠ å€¼ï¼ˆä»»åŠ¡ç»„分片和CPUè¿è¡Œé˜Ÿåˆ—)有贡献,这å映了
+它们在æ¢å¤è¿è¡ŒåŽçš„预期贡献。
+
+利用这一点,我们跟踪2个关键指标:“è¿è¡Œâ€å’Œâ€œå¯è¿è¡Œâ€ã€‚“è¿è¡Œâ€å映了一个调度实体
+在CPU上花费的时间,而“å¯è¿è¡Œâ€å映了一个调度实体在è¿è¡Œé˜Ÿåˆ—中花费的时间。当åªæœ‰
+一个任务时,这两个指标是相åŒçš„,但一旦出现对CPU的争用,“è¿è¡Œâ€å°†å‡å°‘以å映æ¯ä¸ª
+任务在CPU上花费的时间,而“å¯è¿è¡Œâ€å°†å¢žåŠ ä»¥å映争用的激烈程度。
+
+更多细节è§ï¼škernel/sched/pelt.c
+
+
+频率 / CPUä¸å˜æ€§
+================
+
+因为CPU频率在1GHz时利用率为50%å’ŒCPU频率在2GHz时利用率为50%是ä¸ä¸€æ ·çš„,åŒæ ·
+在å°æ ¸ä¸Šè¿è¡Œæ—¶åˆ©ç”¨çŽ‡ä¸º50%和在大核上è¿è¡Œæ—¶åˆ©ç”¨çŽ‡ä¸º50%是ä¸ä¸€æ ·çš„,我们å…许架构
+以两个比率æ¥ä¼¸ç¼©æ—¶é—´å·®ï¼Œå…¶ä¸­ä¸€ä¸ªæ˜¯åŠ¨æ€ç”µåŽ‹é¢‘率å‡é™ï¼ˆDynamic Voltage and
+Frequency Scaling,DVFS)比率,å¦ä¸€ä¸ªæ˜¯å¾®æž¶æž„比率。
+
+对于简å•çš„DVFS架构(软件有完全控制能力),我们å¯ä»¥å¾ˆå®¹æ˜“地计算该比率为::
+
+ f_cur
+ r_dvfs := -----
+ f_max
+
+对于由硬件控制DVFS的更多动æ€ç³»ç»Ÿï¼Œæˆ‘们使用硬件计数器(Intel APERF/MPERF,
+ARMv8.4-AMU)æ¥è®¡ç®—这一比率。具体到Intel,我们使用::
+
+ APERF
+ f_cur := ----- * P0
+ MPERF
+
+ 4C-turbo; 如果å¯ç”¨å¹¶ä¸”使能了turbo
+ f_max := { 1C-turbo; 如果使能了turbo
+ P0; 其它情况
+
+ f_cur
+ r_dvfs := min( 1, ----- )
+ f_max
+
+我们选择4C turbo而ä¸æ˜¯1C turbo,以使其更æŒä¹…性略微更强。
+
+r_cpu被定义为当å‰CPU的最高性能水平与系统中任何其它CPU的最高性能水平的比率。
+
+ r_tot = r_dvfs * r_cpu
+
+其结果是,上述“è¿è¡Œâ€å’Œâ€œå¯è¿è¡Œâ€çš„指标å˜æˆDVFS无关和CPUåž‹å·æ— å…³äº†ã€‚也就是说,
+我们å¯ä»¥åœ¨CPU之间转移和比较它们。
+
+更多细节è§:
+
+ - kernel/sched/pelt.h:update_rq_clock_pelt()
+ - arch/x86/kernel/smpboot.c:"APERF/MPERF frequency ratio computation."
+ - Documentation/translations/zh_CN/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization"
+
+
+UTIL_EST / UTIL_EST_FASTUP
+==========================
+
+由于周期性任务的平å‡æ•°åœ¨ç¡çœ æ—¶ä¼šè¡°å‡ï¼Œè€Œåœ¨è¿è¡Œæ—¶å…¶é¢„期利用率会和ç¡çœ å‰ç›¸åŒï¼Œ
+因此它们在å†æ¬¡è¿è¡ŒåŽä¼šé¢ä¸´ï¼ˆDVFS)的上涨。
+
+为了缓解这个问题,(一个默认使能的编译选项)UTIL_EST驱动一个无é™è„‰å†²å“应
+(Infinite Impulse Response,IIR)的EWMA,“è¿è¡Œâ€å€¼åœ¨å‡ºé˜Ÿæ—¶æ˜¯æœ€é«˜çš„。
+å¦ä¸€ä¸ªé»˜è®¤ä½¿èƒ½çš„编译选项UTIL_EST_FASTUP修改了IIR滤波器,使其å…许立å³å¢žåŠ ï¼Œ
+仅在利用率下é™æ—¶è¡°å‡ã€‚
+
+进一步,è¿è¡Œé˜Ÿåˆ—的(å¯è¿è¡Œä»»åŠ¡çš„)利用率之和由下å¼è®¡ç®—:
+
+ util_est := \Sum_t max( t_running, t_util_est_ewma )
+
+更多细节è§: kernel/sched/fair.c:util_est_dequeue()
+
+
+UCLAMP
+======
+
+å¯ä»¥åœ¨æ¯ä¸ªCFS或RT任务上设置有效的u_minå’Œu_max clamp值(译注:clampå¯ä»¥ç†è§£
+为类似滤波器的能力,它定义了有效å–值范围的最大值和最å°å€¼ï¼‰ï¼›è¿è¡Œé˜Ÿåˆ—为所有正在
+è¿è¡Œçš„任务ä¿æŒè¿™äº›clamp的最大èšåˆå€¼ã€‚
+
+更多细节è§: include/uapi/linux/sched/types.h
+
+
+Schedutil / DVFS
+================
+
+æ¯å½“调度器的负载跟踪被更新时(任务唤醒ã€ä»»åŠ¡è¿ç§»ã€æ—¶é—´æµé€ï¼‰ï¼Œæˆ‘们都会调用
+schedutilæ¥æ›´æ–°ç¡¬ä»¶DVFS状æ€ã€‚
+
+其基础是CPUè¿è¡Œé˜Ÿåˆ—的“è¿è¡Œâ€æŒ‡æ ‡ï¼Œæ ¹æ®ä¸Šé¢çš„内容,它是CPU的频率ä¸å˜çš„利用率
+估计值。由此我们计算出一个期望的频率,如下::
+
+ max( running, util_est ); 如果使能UTIL_EST
+ u_cfs := { running; 其它情况
+
+ clamp( u_cfs + u_rt, u_min, u_max ); 如果使能UCLAMP_TASK
+ u_clamp := { u_cfs + u_rt; 其它情况
+
+ u := u_clamp + u_irq + u_dl; [估计值。更多细节è§æºä»£ç ]
+
+ f_des := min( f_max, 1.25 u * f_max )
+
+关于IO-wait的说明:当å‘生更新是因为任务从IO完æˆä¸­å”¤é†’时,我们æå‡ä¸Šé¢çš„“uâ€ã€‚
+
+然åŽï¼Œè¿™ä¸ªé¢‘率被用æ¥é€‰æ‹©ä¸€ä¸ªP-state或OPP,或者直接混入一个å‘给硬件的CPPCå¼
+请求。
+
+关于截止期é™è°ƒåº¦å™¨çš„说明: 截止期é™ä»»åŠ¡ï¼ˆå¶å‘任务模型)使我们能够计算出满足
+工作负è·æ‰€éœ€çš„硬f_min值。
+
+因为这些回调函数是直接æ¥è‡ªè°ƒåº¦å™¨çš„,所以DVFS的硬件交互应该是“快速â€å’Œéžé˜»å¡žçš„。
+在硬件交互缓慢和昂贵的时候,schedutil支æŒDVFS请求é™é€Ÿï¼Œä¸è¿‡ä¼šé™ä½Žæ•ˆçŽ‡ã€‚
+
+更多信æ¯è§: kernel/sched/cpufreq_schedutil.c
+
+
+注æ„
+====
+
+ - 在低负载场景下,DVFS是最相关的,“è¿è¡Œâ€çš„值将密切å映利用率。
+
+ - 在负载饱和的场景下,任务è¿ç§»ä¼šå¯¼è‡´ä¸€äº›çž¬æ—¶æ€§çš„使用率下é™ã€‚å‡è®¾æˆ‘们有一个
+ CPU,有4个任务å ç”¨å¯¼è‡´å…¶é¥±å’Œï¼ŒæŽ¥ä¸‹æ¥æˆ‘们将一个任务è¿ç§»åˆ°å¦ä¸€ä¸ªç©ºé—²CPU上,
+ 旧的CPU的“è¿è¡Œâ€å€¼å°†ä¸º0.75,而新的CPU将获得0.25。这是ä¸å¯é¿å…的,而且éšç€
+ 时间æµé€å°†è‡ªåŠ¨ä¿®æ­£ã€‚å¦æ³¨ï¼Œç”±äºŽæ²¡æœ‰ç©ºé—²æ—¶é—´ï¼Œæˆ‘们还能ä¿è¯f_max值å—?
+
+ - 上述大部分内容是关于é¿å…DVFS下滑,以åŠç‹¬ç«‹çš„DVFS域å‘生负载è¿ç§»æ—¶ä¸å¾—ä¸
+ é‡æ–°å­¦ä¹ /æå‡é¢‘率。
+
diff --git a/Documentation/translations/zh_CN/vm/damon/design.rst b/Documentation/translations/zh_CN/vm/damon/design.rst
index 05f66c02740a..46128b77c2b3 100644
--- a/Documentation/translations/zh_CN/vm/damon/design.rst
+++ b/Documentation/translations/zh_CN/vm/damon/design.rst
@@ -77,7 +77,7 @@ DAMONç›®å‰ä¸ºç‰©ç†å’Œè™šæ‹Ÿåœ°å€ç©ºé—´æ供了基元的实现。下é¢ä¸¤ä¸ª
========================
下é¢å››ä¸ªéƒ¨åˆ†åˆ†åˆ«æ述了DAMONçš„æ ¸å¿ƒæœºåˆ¶å’Œäº”ä¸ªç›‘æµ‹å±žæ€§ï¼Œå³ ``采样间隔`` 〠``èšé›†é—´éš”`` ã€
-``区域更新间隔`` 〠``最å°åŒºåŸŸæ•°`` å’Œ ``最大区域数`` 。
+``æ›´æ–°é—´éš”`` 〠``最å°åŒºåŸŸæ•°`` å’Œ ``最大区域数`` 。
访问频率监测
@@ -135,5 +135,6 @@ DAMON的输出显示了在给定的时间内哪些页é¢çš„访问频率是多少
监测目标地å€èŒƒå›´å¯ä»¥åŠ¨æ€æ”¹å˜ã€‚例如,虚拟内存å¯ä»¥åŠ¨æ€åœ°è¢«æ˜ å°„和解映射。物ç†å†…å­˜å¯ä»¥è¢«
热æ’拔。
-由于在æŸäº›æƒ…况下å˜åŒ–å¯èƒ½ç›¸å½“频ç¹ï¼ŒDAMON检查动æ€å†…存映射的å˜åŒ–,并仅在用户指定的时间
-间隔( ``区域更新间隔`` )内将其应用于抽象的目标区域。
+由于在æŸäº›æƒ…况下å˜åŒ–å¯èƒ½ç›¸å½“频ç¹ï¼ŒDAMONå…许监控æ“作检查动æ€å˜åŒ–,包括内存映射å˜åŒ–,
+并仅在用户指定的时间间隔( ``æ›´æ–°é—´éš”`` )中的æ¯ä¸ªæ—¶é—´æ®µï¼Œå°†å…¶åº”用于监控æ“作相关的
+数据结构,如抽象的监控目标内存区。
\ No newline at end of file
diff --git a/Documentation/translations/zh_CN/vm/frontswap.rst b/Documentation/translations/zh_CN/vm/frontswap.rst
new file mode 100644
index 000000000000..3eb07870e2ef
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/frontswap.rst
@@ -0,0 +1,196 @@
+:Original: Documentation/vm/frontswap.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+=========
+Frontswap
+=========
+
+Frontswap为交æ¢é¡µæ供了一个 “transcendent memory†的接å£ã€‚在一些环境中,由
+于交æ¢é¡µè¢«ä¿å­˜åœ¨RAM(或类似RAM的设备)中,而ä¸æ˜¯äº¤æ¢ç£ç›˜ï¼Œå› æ­¤å¯ä»¥èŽ·å¾—巨大的性能
+节çœï¼ˆæ高)。
+
+.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/
+
+Frontswap之所以这么命å,是因为它å¯ä»¥è¢«è®¤ä¸ºæ˜¯ä¸Žswap设备的“backâ€å­˜å‚¨ç›¸å。存
+储器被认为是一个åŒæ­¥å¹¶å‘安全的é¢å‘页é¢çš„“伪RAM设备â€ï¼Œç¬¦åˆtranscendent memory
+(如Xen的“tmemâ€ï¼Œæˆ–内核内压缩内存,åˆç§°â€œzcacheâ€ï¼Œæˆ–未æ¥çš„类似RAM的设备)的è¦
+求;这个伪RAM设备ä¸èƒ½è¢«å†…核直接访问或寻å€ï¼Œå…¶å¤§å°æœªçŸ¥ä¸”å¯èƒ½éšæ—¶é—´å˜åŒ–。驱动程åºé€šè¿‡
+调用frontswap_register_ops将自己与frontswap链接起æ¥ï¼Œä»¥é€‚当地设置frontswap_ops
+的功能,它æ供的功能必须符åˆæŸäº›ç­–略,如下所示:
+
+一个 “init†将设备准备好接收与指定的交æ¢è®¾å¤‡ç¼–å·ï¼ˆåˆç§°â€œç±»åž‹â€ï¼‰ç›¸å…³çš„frontswap
+交æ¢é¡µã€‚一个 “store†将把该页å¤åˆ¶åˆ°transcendent memory,并与该页的类型和å移
+é‡ç›¸å…³è”。一个 “load†将把该页,如果找到的è¯ï¼Œä»Žtranscendent memoryå¤åˆ¶åˆ°å†…æ ¸
+内存,但ä¸ä¼šä»Žtranscendent memory中删除该页。一个 “invalidate_page†将从
+transcendent memory中删除该页,一个 “invalidate_area†将删除所有与交æ¢ç±»åž‹
+相关的页(例如,åƒswapoff)并通知 “device†拒ç»è¿›ä¸€æ­¥å­˜å‚¨è¯¥äº¤æ¢ç±»åž‹ã€‚
+
+一旦一个页é¢è¢«æˆåŠŸå­˜å‚¨ï¼Œåœ¨è¯¥é¡µé¢ä¸Šçš„匹é…加载通常会æˆåŠŸã€‚因此,当内核å‘现自己处于需
+è¦äº¤æ¢é¡µé¢çš„情况时,它首先å°è¯•ä½¿ç”¨frontswap。如果存储的结果是æˆåŠŸçš„,那么数æ®å°±å·²
+ç»æˆåŠŸçš„ä¿å­˜åˆ°äº†transcendent memory中,并且é¿å…了ç£ç›˜å†™å…¥ï¼Œå¦‚æžœåŽæ¥å†è¯»å›žæ•°æ®ï¼Œ
+也é¿å…了ç£ç›˜è¯»å–。如果存储返回失败,transcendent memoryå·²ç»æ‹’ç»äº†è¯¥æ•°æ®ï¼Œä¸”该页
+å¯ä»¥åƒå¾€å¸¸ä¸€æ ·è¢«å†™å…¥äº¤æ¢ç©ºé—´ã€‚
+
+请注æ„,如果一个页é¢è¢«å­˜å‚¨ï¼Œè€Œè¯¥é¡µé¢å·²ç»å­˜åœ¨äºŽtranscendent memory中(一个 “é‡å¤â€
+的存储),è¦ä¹ˆå­˜å‚¨æˆåŠŸï¼Œæ•°æ®è¢«è¦†ç›–,è¦ä¹ˆå­˜å‚¨å¤±è´¥ï¼Œè¯¥é¡µé¢è¢«åºŸæ­¢ã€‚这确ä¿äº†æ—§çš„æ•°æ®æ°¸è¿œ
+ä¸ä¼šä»Žfrontswap中获得。
+
+如果é…置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的
+debugfs完æˆçš„。frontswap的有效性å¯ä»¥é€šè¿‡ä»¥ä¸‹æ–¹å¼æµ‹é‡ï¼ˆåœ¨æ‰€æœ‰äº¤æ¢è®¾å¤‡ä¸­ï¼‰:
+
+``failed_stores``
+ 有多少次存储的å°è¯•æ˜¯å¤±è´¥çš„
+
+``loads``
+ å°è¯•äº†å¤šå°‘次加载(应该全部æˆåŠŸï¼‰
+
+``succ_stores``
+ 有多少次存储的å°è¯•æ˜¯æˆåŠŸçš„
+
+``invalidates``
+ å°è¯•äº†å¤šå°‘次作废
+
+åŽå°å®žçŽ°å¯ä»¥æä¾›é¢å¤–的指标。
+
+ç»å¸¸é—®åˆ°çš„问题
+==============
+
+* 价值在哪里?
+
+当一个工作负载开始交æ¢æ—¶ï¼Œæ€§èƒ½å°±ä¼šä¸‹é™ã€‚Frontswap通过æ供一个干净的ã€åŠ¨æ€çš„接å£æ¥
+读å–和写入交æ¢é¡µåˆ° “transcendent memoryâ€ï¼Œä»Žè€Œå¤§å¤§å¢žåŠ äº†è®¸å¤šè¿™æ ·çš„工作负载的性
+能,å¦åˆ™å†…核是无法直接寻å€çš„。当数æ®è¢«è½¬æ¢ä¸ºä¸åŒçš„å½¢å¼å’Œå¤§å°ï¼ˆæ¯”如压缩)或者被秘密
+移动(对于一些类似RAM的设备æ¥è¯´ï¼Œè¿™å¯èƒ½å¯¹å†™å¹³è¡¡å¾ˆæœ‰ç”¨ï¼‰æ—¶ï¼Œè¿™ä¸ªæŽ¥å£æ˜¯ç†æƒ³çš„。交æ¢
+页(和被驱é€çš„页é¢ç¼“存页)是这ç§æ¯”RAM慢但比ç£ç›˜å¿«å¾—多的“伪RAM设备â€çš„一大用途。
+
+Frontswap对内核的影å“相当å°ï¼Œä¸ºå„ç§ç³»ç»Ÿé…置中更动æ€ã€æ›´çµæ´»çš„RAM利用æ供了巨大的
+çµæ´»æ€§ï¼š
+
+在å•ä¸€å†…核的情况下,åˆç§°â€œzcacheâ€ï¼Œé¡µé¢è¢«åŽ‹ç¼©å¹¶å­˜å‚¨åœ¨æœ¬åœ°å†…存中,从而增加了å¯ä»¥å®‰
+å…¨ä¿å­˜åœ¨RAM中的匿å页é¢æ€»æ•°ã€‚Zcache本质上是用压缩/解压缩的CPU周期æ¢å–更好的内存利
+用率。Benchmarks测试显示,当内存压力较低时,几乎没有影å“,而在高内存压力下的一些
+工作负载上,则有明显的性能改善(25%以上)。
+
+“RAMster†在zcache的基础上增加了对集群系统的 “peer-to-peer†transcendent memory
+的支æŒã€‚Frontswap页é¢åƒzcache一样被本地压缩,但éšåŽè¢«â€œremotified†到å¦ä¸€ä¸ªç³»
+统的RAM。这使得RAMå¯ä»¥æ ¹æ®éœ€è¦åŠ¨æ€åœ°æ¥å›žè´Ÿè½½å¹³è¡¡ï¼Œä¹Ÿå°±æ˜¯è¯´ï¼Œå½“系统A超载时,它å¯ä»¥
+交æ¢åˆ°ç³»ç»ŸB,å之亦然。RAMster也å¯ä»¥è¢«é…ç½®æˆä¸€ä¸ªå†…å­˜æœåŠ¡å™¨ï¼Œå› æ­¤é›†ç¾¤ä¸­çš„许多æœåŠ¡å™¨
+å¯ä»¥æ ¹æ®éœ€è¦åŠ¨æ€åœ°äº¤æ¢åˆ°é…置有大é‡å†…存的å•ä¸€æœåŠ¡å™¨ä¸Š......而ä¸éœ€è¦é¢„å…ˆé…ç½®æ¯ä¸ªå®¢æˆ·
+有多少内存å¯ç”¨
+
+在虚拟情况下,虚拟化的全部æ„义在于统计地将物ç†èµ„æºåœ¨å¤šä¸ªè™šæ‹Ÿæœºçš„ä¸åŒéœ€æ±‚之间进行å¤
+用。对于RAMæ¥è¯´ï¼Œè¿™çœŸçš„很难åšåˆ°ï¼Œè€Œä¸”在ä¸æ”¹å˜å†…核的情况下,è¦åšå¥½è¿™ä¸€ç‚¹çš„努力基本上
+是失败的(除了一些广为人知的特殊情况下的工作负载)。具体æ¥è¯´ï¼ŒXen Transcendent Memory
+åŽç«¯å…许管ç†å™¨æ‹¥æœ‰çš„RAM “fallowâ€ï¼Œä¸ä»…å¯ä»¥åœ¨å¤šä¸ªè™šæ‹Ÿæœºä¹‹é—´è¿›è¡Œâ€œtime-sharedâ€ï¼Œ
+而且页é¢å¯ä»¥è¢«åŽ‹ç¼©å’Œé‡å¤åˆ©ç”¨ï¼Œä»¥ä¼˜åŒ–RAM的利用率。当客户æ“作系统被诱导交出未充分利用
+çš„RAM时(如 “selfballooningâ€ï¼‰ï¼Œçªç„¶å‡ºçŽ°çš„æ„外内存压力å¯èƒ½ä¼šå¯¼è‡´äº¤æ¢ï¼›frontswap
+å…许这些页é¢è¢«äº¤æ¢åˆ°ç®¡ç†å™¨RAM中或从管ç†å™¨RAM中交æ¢ï¼ˆå¦‚果整体主机系统内存æ¡ä»¶å…许),
+从而å‡è½»è®¡åˆ’外交æ¢å¯èƒ½å¸¦æ¥çš„å¯æ€•çš„性能影å“。
+
+一个KVM的实现正在进行中,并且已ç»è¢«RFC'ed到lkml。而且,利用frontswap,对NVM作为
+内存扩展技术的调查也在进行中。
+
+* 当然,在æŸäº›æƒ…况下å¯èƒ½æœ‰æ€§èƒ½ä¸Šçš„优势,但frontswap的空间/时间开销是多少?
+
+如果 CONFIG_FRONTSWAP 被ç¦ç”¨ï¼Œæ¯ä¸ª frontswap é’©å­éƒ½ä¼šç¼–译æˆç©ºï¼Œå”¯ä¸€çš„开销是æ¯
+个 swapon'ed swap 设备的几个é¢å¤–字节。如果 CONFIG_FRONTSWAP 被å¯ç”¨ï¼Œä½†æ²¡æœ‰
+frontswap的 “backend” 注册,每读或写一个交换页就会多一次全局变量与零的比
+较。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend注册,并且
+åŽç«¯æ¯æ¬¡ “store†请求都失败(å³å°½ç®¡å£°ç§°å¯èƒ½ï¼Œä½†æ²¡æœ‰æ供内存),CPU 的开销ä»ç„¶å¯ä»¥
+忽略ä¸è®¡ - 因为æ¯æ¬¡frontswap失败都是在交æ¢é¡µå†™åˆ°ç£ç›˜ä¹‹å‰ï¼Œç³»ç»Ÿå¾ˆå¯èƒ½æ˜¯ I/O 绑定
+的,无论如何使用一å°éƒ¨åˆ†çš„ CPU 都是ä¸ç›¸å…³çš„。
+
+至于空间,如果CONFIG_FRONTSWAP被å¯ç”¨ï¼Œå¹¶ä¸”有一个frontswapçš„backend注册,那么
+æ¯ä¸ªäº¤æ¢è®¾å¤‡çš„æ¯ä¸ªäº¤æ¢é¡µéƒ½ä¼šè¢«åˆ†é…一个比特。这是在内核已ç»ä¸ºæ¯ä¸ªäº¤æ¢è®¾å¤‡çš„æ¯ä¸ªäº¤æ¢
+页分é…çš„8ä½ï¼ˆåœ¨2.6.34之å‰æ˜¯16ä½ï¼‰ä¸Šå¢žåŠ çš„。(Hugh Dickins观察到,frontswapå¯èƒ½
+会å·å–现有的8个比特,但是我们以åŽå†æ¥æ‹…心这个å°çš„优化问题)。对于标准的4K页é¢å¤§å°çš„
+éžå¸¸å¤§çš„交æ¢ç›˜ï¼ˆè¿™å¾ˆç½•è§ï¼‰ï¼Œè¿™æ˜¯æ¯32GB交æ¢ç›˜1MB开销。
+
+当交æ¢é¡µå­˜å‚¨åœ¨transcendent memory中而ä¸æ˜¯å†™åˆ°ç£ç›˜ä¸Šæ—¶ï¼Œæœ‰ä¸€ä¸ªå‰¯ä½œç”¨ï¼Œå³è¿™å¯èƒ½ä¼š
+产生更多的内存压力,有å¯èƒ½è¶…过其他的优点。一个backend,比如zcache,必须实现策略
+æ¥ä»”细(但动æ€åœ°ï¼‰ç®¡ç†å†…å­˜é™åˆ¶ï¼Œä»¥ç¡®ä¿è¿™ç§æƒ…况ä¸ä¼šå‘生。
+
+* 好å§ï¼Œé‚£å°±ç”¨å†…核骇客能ç†è§£çš„术语æ¥å¿«é€Ÿæ¦‚述一下这个frontswapè¡¥ä¸çš„作用如何?
+
+我们å‡è®¾åœ¨å†…æ ¸åˆå§‹åŒ–过程中,一个frontswap çš„ “backend†已ç»æ³¨å†Œäº†ï¼›è¿™ä¸ªæ³¨å†Œè¡¨
+明这个frontswap çš„ “backend†å¯ä»¥è®¿é—®ä¸€äº›ä¸è¢«å†…核直接访问的“内存â€ã€‚它到底æ
+供了多少内存是完全动æ€å’Œéšæœºçš„。
+
+每当一个交换设备被启用(swapon)时,就会调用frontswap_init(),把交换设备的编号(又称“类
+åž‹â€ï¼‰ä½œä¸ºä¸€ä¸ªå‚数传给它。这就通知了frontswap,以期待 “store†与该å·ç ç›¸å…³çš„交
+æ¢é¡µçš„å°è¯•ã€‚
+
+æ¯å½“交æ¢å­ç³»ç»Ÿå‡†å¤‡å°†ä¸€ä¸ªé¡µé¢å†™å…¥äº¤æ¢è®¾å¤‡æ—¶ï¼ˆå‚è§swap_writepage()),就会调用
+frontswap_store。Frontswap与frontswap backendå商,如果backend说它没有空
+间,frontswap_store返回-1,内核就会照常把页æ¢åˆ°äº¤æ¢è®¾å¤‡ä¸Šã€‚注æ„,æ¥è‡ªfrontswap
+backendçš„å“应对内核æ¥è¯´æ˜¯ä¸å¯é¢„测的;它å¯èƒ½é€‰æ‹©ä»Žä¸æŽ¥å—一个页é¢ï¼Œå¯èƒ½æŽ¥å—æ¯ä¹ä¸ª
+页é¢ï¼Œä¹Ÿå¯èƒ½æŽ¥å—æ¯ä¸€ä¸ªé¡µé¢ã€‚但是如果backend确实接å—了一个页é¢ï¼Œé‚£ä¹ˆè¿™ä¸ªé¡µé¢çš„æ•°
+æ®å·²ç»è¢«å¤åˆ¶å¹¶ä¸Žç±»åž‹å’Œå移é‡ç›¸å…³è”了,而且backendä¿è¯äº†æ•°æ®çš„æŒä¹…性。在这ç§æƒ…况
+下,frontswap在交æ¢è®¾å¤‡çš„“frontswap_map†中设置了一个ä½ï¼Œå¯¹åº”于交æ¢è®¾å¤‡ä¸Šçš„
+页é¢å移é‡ï¼Œå¦åˆ™å®ƒå°±ä¼šå°†æ•°æ®å†™å…¥è¯¥è®¾å¤‡ã€‚
+
+当交æ¢å­ç³»ç»Ÿéœ€è¦äº¤æ¢ä¸€ä¸ªé¡µé¢æ—¶ï¼ˆswap_readpage()),它首先调用frontswap_load(),
+检查frontswap_map,看这个页é¢æ˜¯å¦æ—©å…ˆè¢«frontswap backend接å—。如果是,该页
+çš„æ•°æ®å°±ä¼šä»ŽfrontswapåŽç«¯å¡«å……,æ¢å…¥å°±å®Œæˆäº†ã€‚如果ä¸æ˜¯ï¼Œæ­£å¸¸çš„交æ¢ä»£ç å°†è¢«æ‰§è¡Œï¼Œ
+以便从真正的交æ¢è®¾å¤‡ä¸ŠèŽ·å¾—这一页的数æ®ã€‚
+
+所以æ¯æ¬¡frontswap backend接å—一个页é¢æ—¶ï¼Œäº¤æ¢è®¾å¤‡çš„读å–和(å¯èƒ½ï¼‰äº¤æ¢è®¾å¤‡çš„写
+入都被 “frontswap backend store†和(å¯èƒ½ï¼‰â€œfrontswap backend loadsâ€
+所å–代,这å¯èƒ½ä¼šå¿«å¾—多。
+
+* frontswapä¸èƒ½è¢«é…置为一个 “特殊的†交æ¢è®¾å¤‡ï¼Œå®ƒçš„优先级è¦é«˜äºŽä»»ä½•çœŸæ­£çš„交æ¢
+ 设备(例如åƒzswap,或者å¯èƒ½æ˜¯swap-over-nbd/NFS)?
+
+首先,现有的交æ¢å­ç³»ç»Ÿä¸å…许有任何ç§ç±»çš„交æ¢å±‚次结构。也许它å¯ä»¥è¢«é‡å†™ä»¥é€‚应层次
+结构,但这将需è¦ç›¸å½“大的改å˜ã€‚å³ä½¿å®ƒè¢«é‡å†™ï¼ŒçŽ°æœ‰çš„交æ¢å­ç³»ç»Ÿä¹Ÿä½¿ç”¨äº†å—I/O层,它
+å‡å®šäº¤æ¢è®¾å¤‡æ˜¯å›ºå®šå¤§å°çš„,其中的任何页é¢éƒ½æ˜¯å¯çº¿æ€§å¯»å€çš„。Frontswap几乎没有触
+åŠçŽ°æœ‰çš„交æ¢å­ç³»ç»Ÿï¼Œè€Œæ˜¯å›´ç»•ç€å—I/Oå­ç³»ç»Ÿçš„é™åˆ¶ï¼Œæ供了大é‡çš„çµæ´»æ€§å’ŒåŠ¨æ€æ€§ã€‚
+
+例如,frontswap backend对任何交æ¢é¡µçš„接å—是完全ä¸å¯é¢„测的。这对frontswap backend
+的定义至关é‡è¦ï¼Œå› ä¸ºå®ƒèµ‹äºˆäº†backend完全动æ€çš„决定æƒã€‚在zcache中,人们无法预
+先知é“一个页é¢çš„å¯åŽ‹ç¼©æ€§å¦‚何。å¯åŽ‹ç¼©æ€§ “差†的页é¢ä¼šè¢«æ‹’ç»ï¼Œè€Œ “差†本身也å¯
+以根æ®å½“å‰çš„内存é™åˆ¶åŠ¨æ€åœ°å®šä¹‰ã€‚
+
+此外,frontswap是完全åŒæ­¥çš„,而真正的交æ¢è®¾å¤‡ï¼Œæ ¹æ®å®šä¹‰ï¼Œæ˜¯å¼‚步的,并且使用
+å—I/O。å—I/O层ä¸ä»…是ä¸å¿…è¦çš„,而且å¯èƒ½è¿›è¡Œ “优化â€ï¼Œè¿™å¯¹é¢å‘RAM的设备æ¥è¯´æ˜¯
+ä¸åˆé€‚的,包括将一些页é¢çš„写入延迟相当长的时间。åŒæ­¥æ˜¯å¿…须的,以确ä¿åŽç«¯çš„动
+æ€æ€§ï¼Œå¹¶é¿å…棘手的竞争æ¡ä»¶ï¼Œè¿™å°†ä¸å¿…è¦åœ°å¤§å¤§å¢žåŠ frontswapå’Œ/或å—I/Oå­ç³»ç»Ÿçš„
+å¤æ‚性。也就是说,åªæœ‰æœ€åˆçš„ “store†和 “load†æ“作是需è¦åŒæ­¥çš„。一个独立
+的异步线程å¯ä»¥è‡ªç”±åœ°æ“作由frontswap存储的页é¢ã€‚例如,RAMster中的 “remotificationâ€
+线程使用标准的异步内核套接字,将压缩的frontswap页é¢ç§»åŠ¨åˆ°è¿œç¨‹æœºå™¨ã€‚åŒæ ·ï¼Œ
+KVM的客户方实现å¯ä»¥è¿›è¡Œå®¢æˆ·å†…压缩,并使用 “batched†hypercalls。
+
+在虚拟化环境中,动æ€æ€§å…许管ç†ç¨‹åºï¼ˆæˆ–主机æ“作系统)åšâ€œintelligent overcommitâ€ã€‚
+例如,它å¯ä»¥é€‰æ‹©åªæŽ¥å—页é¢ï¼Œç›´åˆ°ä¸»æœºäº¤æ¢å¯èƒ½å³å°†å‘生,然åŽå¼ºè¿«å®¢æˆ·æœºåšä»–们
+自己的交æ¢ã€‚
+
+transcendent memory规格的frontswap有一个å处。因为任何 “store†都å¯
+能失败,所以必须在一个真正的交æ¢è®¾å¤‡ä¸Šæœ‰ä¸€ä¸ªçœŸæ­£çš„æ’槽æ¥äº¤æ¢é¡µé¢ã€‚因此,
+frontswap必须作为æ¯ä¸ªäº¤æ¢è®¾å¤‡çš„ “影å­â€ æ¥å®žçŽ°ï¼Œå®ƒæœ‰å¯èƒ½å®¹çº³äº¤æ¢è®¾å¤‡å¯èƒ½
+容纳的æ¯ä¸€ä¸ªé¡µé¢ï¼Œä¹Ÿæœ‰å¯èƒ½æ ¹æœ¬ä¸å®¹çº³ä»»ä½•é¡µé¢ã€‚è¿™æ„味ç€frontswapä¸èƒ½åŒ…å«æ¯”
+swap设备总数更多的页é¢ã€‚例如,如果在æŸäº›å®‰è£…上没有é…置交æ¢è®¾å¤‡ï¼Œfrontswap
+就没有用。无交æ¢è®¾å¤‡çš„便æºå¼è®¾å¤‡ä»ç„¶å¯ä»¥ä½¿ç”¨frontswap,但是这ç§è®¾å¤‡çš„
+backendå¿…é¡»é…ç½®æŸç§ “ghost†交æ¢è®¾å¤‡ï¼Œå¹¶ç¡®ä¿å®ƒæ°¸è¿œä¸ä¼šè¢«ä½¿ç”¨ã€‚
+
+
+* 为什么会有这ç§å…³äºŽ “é‡å¤å­˜å‚¨â€ 的奇怪定义?如果一个页é¢ä»¥å‰è¢«æˆåŠŸåœ°å­˜å‚¨è¿‡ï¼Œ
+ éš¾é“它ä¸èƒ½æ€»æ˜¯è¢«æˆåŠŸåœ°è¦†ç›–å—?
+
+几乎总是å¯ä»¥çš„,ä¸ï¼Œæœ‰æ—¶ä¸èƒ½ã€‚考虑一个例å­ï¼Œæ•°æ®è¢«åŽ‹ç¼©äº†ï¼ŒåŽŸæ¥çš„4K页é¢è¢«åŽ‹
+缩到了1K。现在,有人试图用ä¸å¯åŽ‹ç¼©çš„æ•°æ®è¦†ç›–该页,因此会å ç”¨æ•´ä¸ª4K。但是
+backend没有更多的空间了。在这ç§æƒ…况下,这个存储必须被拒ç»ã€‚æ¯å½“frontswap
+æ‹’ç»ä¸€ä¸ªä¼šè¦†ç›–的存储时,它也必须使旧的数æ®ä½œåºŸï¼Œå¹¶ç¡®ä¿å®ƒä¸å†è¢«è®¿é—®ã€‚因为交
+换子系统会把新的数据写到真正的交换设备上,这是确保一致性的正确做法。
+
+* 为什么frontswapè¡¥ä¸ä¼šåˆ›å»ºæ–°çš„头文件swapfile.h?
+
+frontswap代ç ä¾èµ–于一些swapå­ç³»ç»Ÿå†…部的数æ®ç»“构,这些数æ®ç»“构多年æ¥ä¸€ç›´
+在é™æ€å’Œå…¨å±€ä¹‹é—´æ¥å›žç§»åŠ¨ã€‚这似乎是一个åˆç†çš„妥å:将它们定义为全局,但在一
+个新的包å«æ–‡ä»¶ä¸­å£°æ˜Žå®ƒä»¬ï¼Œè¯¥æ–‡ä»¶ä¸è¢«åŒ…å«swap.h的大é‡æºæ–‡ä»¶æ‰€åŒ…å«ã€‚
+
+Dan Magenheimer,最åŽæ›´æ–°äºŽ2012å¹´4月9æ—¥
diff --git a/Documentation/translations/zh_CN/vm/hmm.rst b/Documentation/translations/zh_CN/vm/hmm.rst
new file mode 100644
index 000000000000..2379df95aa58
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/hmm.rst
@@ -0,0 +1,361 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/vm/hmm.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==================
+å¼‚æž„å†…å­˜ç®¡ç† (HMM)
+==================
+
+æ供基础设施和帮助程åºä»¥å°†éžå¸¸è§„内存(设备内存,如æ¿ä¸Š GPU 内存)集æˆåˆ°å¸¸è§„内核路径中,其
+基石是此类内存的专用struct page(请å‚阅本文档的第 5 至 7 节)。
+
+HMM 还为 SVM(共享虚拟内存)æ供了å¯é€‰çš„帮助程åºï¼Œå³å…许设备é€æ˜Žåœ°è®¿é—®ä¸Ž CPU 一致的程åº
+地å€ï¼Œè¿™æ„å‘³ç€ CPU 上的任何有效指针也是该设备的有效指针。这对于简化高级异构计算的使用å˜å¾—
+å¿…ä¸å¯å°‘,其中 GPUã€DSP 或 FPGA 用于代表进程执行å„ç§è®¡ç®—。
+
+本文档分为以下部分:在第一部分中,我æ­ç¤ºäº†ä¸Žä½¿ç”¨ç‰¹å®šäºŽè®¾å¤‡çš„内存分é…器相关的问题。在第二
+部分中,我æ­ç¤ºäº†è®¸å¤šå¹³å°å›ºæœ‰çš„硬件é™åˆ¶ã€‚第三部分概述了 HMM 设计。第四部分解释了 CPU 页
+表镜åƒçš„工作原ç†ä»¥åŠ HMM 在这ç§æƒ…况下的目的。第五部分处ç†å†…核中如何表示设备内存。最åŽï¼Œ
+最åŽä¸€èŠ‚介ç»äº†ä¸€ä¸ªæ–°çš„è¿ç§»åŠ©æ‰‹ï¼Œå®ƒå…许利用设备 DMA 引擎。
+
+.. contents:: :local:
+
+使用特定于设备的内存分é…器的问题
+================================
+
+具有大é‡æ¿è½½å†…存(几 GB)的设备(如 GPU)历æ¥é€šè¿‡ä¸“用驱动程åºç‰¹å®š API 管ç†å…¶å†…存。这会
+造æˆè®¾å¤‡é©±åŠ¨ç¨‹åºåˆ†é…和管ç†çš„内存与常规应用程åºå†…存(ç§æœ‰åŒ¿åã€å…±äº«å†…存或常规文件支æŒå†…存)
+之间的隔断。从这里开始,我将把这个方é¢ç§°ä¸ºåˆ†å‰²çš„地å€ç©ºé—´ã€‚我使用共享地å€ç©ºé—´æ¥æŒ‡ä»£ç›¸å的情况:
+å³ï¼Œè®¾å¤‡å¯ä»¥é€æ˜Žåœ°ä½¿ç”¨ä»»ä½•åº”用程åºå†…存区域。
+
+分割的地å€ç©ºé—´çš„å‘生是因为设备åªèƒ½è®¿é—®é€šè¿‡è®¾å¤‡ç‰¹å®š API 分é…的内存。这æ„味ç€ä»Žè®¾å¤‡çš„角度æ¥
+看,程åºä¸­çš„所有内存对象并ä¸å¹³ç­‰ï¼Œè¿™ä½¿å¾—ä¾èµ–于广泛的库的大型程åºå˜å¾—å¤æ‚。
+
+具体æ¥è¯´ï¼Œè¿™æ„味ç€æƒ³è¦åˆ©ç”¨åƒ GPU 这样的设备的代ç éœ€è¦åœ¨é€šç”¨åˆ†é…的内存(mallocã€mmap
+ç§æœ‰ã€mmap å…±äº«ï¼‰å’Œé€šè¿‡è®¾å¤‡é©±åŠ¨ç¨‹åº API 分é…的内存之间å¤åˆ¶å¯¹è±¡ï¼ˆè¿™ä»ç„¶ä»¥ mmap 结æŸï¼Œ
+但是是设备文件)。
+
+对于平é¢æ•°æ®é›†ï¼ˆæ•°ç»„ã€ç½‘æ ¼ã€å›¾åƒâ€¦â€¦ï¼‰ï¼Œè¿™å¹¶ä¸éš¾å®žçŽ°ï¼Œä½†å¯¹äºŽå¤æ‚æ•°æ®é›†ï¼ˆåˆ—表ã€æ ‘……),
+很难åšåˆ°æ­£ç¡®ã€‚å¤åˆ¶ä¸€ä¸ªå¤æ‚çš„æ•°æ®é›†éœ€è¦é‡æ–°æ˜ å°„å…¶æ¯ä¸ªå…ƒç´ ä¹‹é—´çš„所有指针关系。这很容易出错,
+而且由于数æ®é›†å’Œåœ°å€çš„é‡å¤ï¼Œç¨‹åºæ›´éš¾è°ƒè¯•ã€‚
+
+分割地å€ç©ºé—´ä¹Ÿæ„味ç€åº“ä¸èƒ½é€æ˜Žåœ°ä½¿ç”¨å®ƒä»¬ä»Žæ ¸å¿ƒç¨‹åºæˆ–å¦ä¸€ä¸ªåº“中获得的数æ®ï¼Œå› æ­¤æ¯ä¸ªåº“å¯èƒ½
+ä¸å¾—ä¸ä½¿ç”¨è®¾å¤‡ç‰¹å®šçš„内存分é…器æ¥é‡å¤å…¶è¾“入数æ®é›†ã€‚大型项目会因此å—到影å“,并因为å„ç§å†…å­˜
+æ‹·è´è€Œæµªè´¹èµ„æºã€‚
+
+å¤åˆ¶æ¯ä¸ªåº“çš„API以接å—æ¯ä¸ªè®¾å¤‡ç‰¹å®šåˆ†é…器分é…的内存作为输入或输出,并ä¸æ˜¯ä¸€ä¸ªå¯è¡Œçš„选择。
+这将导致库入å£ç‚¹çš„组åˆçˆ†ç‚¸ã€‚
+
+最åŽï¼Œéšç€é«˜çº§è¯­è¨€ç»“构(在 C++ 中,当然也在其他语言中)的进步,编译器现在有å¯èƒ½åœ¨æ²¡æœ‰ç¨‹
+åºå‘˜å¹²é¢„的情况下利用 GPU 和其他设备。æŸäº›ç¼–译器识别的模å¼ä»…适用于共享地å€ç©ºé—´ã€‚对所有
+其他模å¼ï¼Œä½¿ç”¨å…±äº«åœ°å€ç©ºé—´ä¹Ÿæ›´åˆç†ã€‚
+
+
+I/O 总线ã€è®¾å¤‡å†…存特性
+======================
+
+由于一些é™åˆ¶ï¼ŒI/O 总线削弱了共享地å€ç©ºé—´ã€‚大多数 I/O 总线åªå…许从设备到主内存的基本
+内存访问;甚至缓存一致性通常是å¯é€‰çš„。从 CPU 访问设备内存甚至更加有é™ã€‚通常情况下,它
+ä¸æ˜¯ç¼“存一致的。
+
+如果我们åªè€ƒè™‘ PCIE 总线,那么设备å¯ä»¥è®¿é—®ä¸»å†…存(通常通过 IOMMU)并与 CPU 缓存一
+致。但是,它åªå…许设备对主存储器进行一组有é™çš„原å­æ“作。这在å¦ä¸€ä¸ªæ–¹å‘上更糟:CPU
+åªèƒ½è®¿é—®æœ‰é™èŒƒå›´çš„设备内存,而ä¸èƒ½å¯¹å…¶æ‰§è¡ŒåŽŸå­æ“作。因此,从内核的角度æ¥çœ‹ï¼Œè®¾å¤‡å†…å­˜ä¸
+能被视为与常规内存等åŒã€‚
+
+å¦ä¸€ä¸ªä¸¥é‡çš„因素是带宽有é™ï¼ˆçº¦ 32GBytes/s,PCIE 4.0 å’Œ 16 通é“)。这比最快的 GPU
+内存 (1 TBytes/s) æ…¢ 33 å€ã€‚最åŽä¸€ä¸ªé™åˆ¶æ˜¯å»¶è¿Ÿã€‚从设备访问主内存的延迟比设备访问自
+己的内存时高一个数é‡çº§ã€‚
+
+一些平å°æ­£åœ¨å¼€å‘æ–°çš„ I/O 总线或对 PCIE 的添加/修改以解决其中一些é™åˆ¶
+(OpenCAPIã€CCIX)。它们主è¦å…许 CPU 和设备之间的åŒå‘缓存一致性,并å…许架构支æŒçš„所
+有原å­æ“作。é—憾的是,并éžæ‰€æœ‰å¹³å°éƒ½éµå¾ªè¿™ä¸€è¶‹åŠ¿ï¼Œå¹¶ä¸”一些主è¦æž¶æž„没有针对这些问题的硬
+件解决方案。
+
+因此,为了使共享地å€ç©ºé—´æœ‰æ„义,我们ä¸ä»…å¿…é¡»å…许设备访问任何内存,而且还必须å…许任何内
+存在设备使用时è¿ç§»åˆ°è®¾å¤‡å†…存(在è¿ç§»æ—¶é˜»æ­¢ CPU 访问)。
+
+
+共享地å€ç©ºé—´å’Œè¿ç§»
+==================
+
+HMM 打算æ供两个主è¦åŠŸèƒ½ã€‚第一个是通过å¤åˆ¶cpu页表到设备页表中æ¥å…±äº«åœ°å€ç©ºé—´ï¼Œå› æ­¤å¯¹
+于进程地å€ç©ºé—´ä¸­çš„任何有效主内存地å€ï¼Œç›¸åŒçš„地å€æŒ‡å‘相åŒçš„物ç†å†…存。
+
+为了实现这一点,HMM æ供了一组帮助程åºæ¥å¡«å……设备页表,åŒæ—¶è·Ÿè¸ª CPU 页表更新。设备页表
+æ›´æ–°ä¸åƒ CPU 页表更新那么容易。è¦æ›´æ–°è®¾å¤‡é¡µè¡¨ï¼Œæ‚¨å¿…须分é…一个缓冲区(或使用预先分é…çš„
+缓冲区池)并在其中写入 GPU 特定命令以执行更新(å–消映射ã€ç¼“存失效和刷新等)。这ä¸èƒ½é€š
+过所有设备的通用代ç æ¥å®Œæˆã€‚因此,为什么HMMæ供了帮助器,在把硬件的具体细节留给设备驱
+动程åºçš„åŒæ—¶ï¼ŒæŠŠä¸€åˆ‡å¯ä»¥è€ƒè™‘的因素都考虑进去了。
+
+HMM æ供的第二ç§æœºåˆ¶æ˜¯ä¸€ç§æ–°çš„ ZONE_DEVICE 内存,它å…许为设备内存的æ¯ä¸ªé¡µé¢åˆ†é…一个
+struct page。这些页é¢å¾ˆç‰¹æ®Šï¼Œå› ä¸º CPU 无法映射它们。然而,它们å…许使用现有的è¿ç§»æœº
+制将主内存è¿ç§»åˆ°è®¾å¤‡å†…存,从 CPU 的角度æ¥çœ‹ï¼Œä¸€åˆ‡çœ‹èµ·æ¥éƒ½åƒæ˜¯æ¢å‡ºåˆ°ç£ç›˜çš„页é¢ã€‚使用
+struct pageå¯ä»¥ä¸ŽçŽ°æœ‰çš„ mm 机制进行最简å•ã€æœ€å¹²å‡€çš„集æˆã€‚å†æ¬¡ï¼ŒHMM ä»…æ供帮助程åºï¼Œ
+首先为设备内存热æ’拔新的 ZONE_DEVICE 内存,然åŽæ‰§è¡Œè¿ç§»ã€‚è¿ç§»å†…容和时间的策略决定留
+给设备驱动程åºã€‚
+
+请注æ„,任何 CPU 对设备页é¢çš„访问都会触å‘缺页异常并è¿ç§»å›žä¸»å†…存。例如,当支æŒç»™å®šCPU
+åœ°å€ A 的页é¢ä»Žä¸»å†…存页é¢è¿ç§»åˆ°è®¾å¤‡é¡µé¢æ—¶ï¼Œå¯¹åœ°å€ A 的任何 CPU 访问都会触å‘缺页异常
+并å¯åŠ¨å‘主内存的è¿ç§»ã€‚
+
+凭借这两个特性,HMM ä¸ä»…å…许设备镜åƒè¿›ç¨‹åœ°å€ç©ºé—´å¹¶ä¿æŒ CPU 和设备页表åŒæ­¥ï¼Œè€Œä¸”还通
+过è¿ç§»è®¾å¤‡æ­£åœ¨ä½¿ç”¨çš„æ•°æ®é›†éƒ¨åˆ†æ¥åˆ©ç”¨è®¾å¤‡å†…存。
+
+
+地å€ç©ºé—´é•œåƒå®žçŽ°å’ŒAPI
+=====================
+
+地å€ç©ºé—´é•œåƒçš„主è¦ç›®æ ‡æ˜¯å…许将一定范围的 CPU 页表å¤åˆ¶åˆ°ä¸€ä¸ªè®¾å¤‡é¡µè¡¨ä¸­ï¼›HMM 有助于
+ä¿æŒä¸¤è€…åŒæ­¥ã€‚想è¦é•œåƒè¿›ç¨‹åœ°å€ç©ºé—´çš„设备驱动程åºå¿…须从注册 mmu_interval_notifier
+开始::
+
+ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops *ops);
+
+在 ops->invalidate() 回调期间,设备驱动程åºå¿…须对范围执行更新æ“作(将范围标记为åª
+读,或完全å–消映射等)。设备必须在驱动程åºå›žè°ƒè¿”回之å‰å®Œæˆæ›´æ–°ã€‚
+
+当设备驱动程åºæƒ³è¦å¡«å……一个虚拟地å€èŒƒå›´æ—¶ï¼Œå®ƒå¯ä»¥ä½¿ç”¨::
+
+ int hmm_range_fault(struct hmm_range *range);
+
+如果请求写访问,它将在丢失或åªè¯»æ¡ç›®ä¸Šè§¦å‘缺页异常(è§ä¸‹æ–‡ï¼‰ã€‚缺页异常使用通用的 mm 缺
+页异常代ç è·¯å¾„ï¼Œå°±åƒ CPU 缺页异常一样。
+
+这两个函数都将 CPU 页表æ¡ç›®å¤åˆ¶åˆ°å®ƒä»¬çš„ pfns 数组å‚数中。该数组中的æ¯ä¸ªæ¡ç›®å¯¹åº”于虚拟
+范围中的一个地å€ã€‚HMM æ供了一组标志æ¥å¸®åŠ©é©±åŠ¨ç¨‹åºè¯†åˆ«ç‰¹æ®Šçš„ CPU 页表项。
+
+在 sync_cpu_device_pagetables() 回调中é”定是驱动程åºå¿…须尊é‡çš„最é‡è¦çš„æ–¹é¢ï¼Œä»¥ä¿
+æŒäº‹ç‰©æ­£ç¡®åŒæ­¥ã€‚使用模å¼æ˜¯::
+
+ int driver_populate_range(...)
+ {
+ struct hmm_range range;
+ ...
+
+ range.notifier = &interval_sub;
+ range.start = ...;
+ range.end = ...;
+ range.hmm_pfns = ...;
+
+ if (!mmget_not_zero(interval_sub->notifier.mm))
+ return -EFAULT;
+
+ again:
+ range.notifier_seq = mmu_interval_read_begin(&interval_sub);
+ mmap_read_lock(mm);
+ ret = hmm_range_fault(&range);
+ if (ret) {
+ mmap_read_unlock(mm);
+ if (ret == -EBUSY)
+ goto again;
+ return ret;
+ }
+ mmap_read_unlock(mm);
+
+ take_lock(driver->update);
+ if (mmu_interval_read_retry(&interval_sub, range.notifier_seq)) {
+ release_lock(driver->update);
+ goto again;
+ }
+
+ /* Use pfns array content to update device page table,
+ * under the update lock */
+
+ release_lock(driver->update);
+ return 0;
+ }
+
+driver->update é”与驱动程åºåœ¨å…¶ invalidate() 回调中使用的é”相åŒã€‚该é”必须在调用
+mmu_interval_read_retry() 之å‰ä¿æŒï¼Œä»¥é¿å…ä¸Žå¹¶å‘ CPU 页表更新å‘生任何竞争。
+
+利用 default_flags 和 pfn_flags_mask
+====================================
+
+hmm_range 结构有 2 个字段,default_flags 和 pfn_flags_mask,它们指定整个范围
+的故障或快照策略,而ä¸å¿…为 pfns 数组中的æ¯ä¸ªæ¡ç›®è®¾ç½®å®ƒä»¬ã€‚
+
+例如,如果设备驱动程åºéœ€è¦è‡³å°‘具有读å–æƒé™çš„范围的页é¢ï¼Œå®ƒä¼šè®¾ç½®::
+
+ range->default_flags = HMM_PFN_REQ_FAULT;
+ range->pfn_flags_mask = 0;
+
+并如上所述调用 hmm_range_fault()。这将填充至少具有读å–æƒé™çš„范围内的所有页é¢ã€‚
+
+现在å‡è®¾é©±åŠ¨ç¨‹åºæƒ³è¦åšåŒæ ·çš„事情,除了它想è¦æ‹¥æœ‰å†™æƒé™çš„范围内的一页。现在驱动程åºè®¾
+ç½®::
+
+ range->default_flags = HMM_PFN_REQ_FAULT;
+ range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
+ range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
+
+有了这个,HMM 将在至少读å–(å³æœ‰æ•ˆï¼‰çš„所有页é¢ä¸­å¼‚常,并且对于地å€
+== range->start + (index_of_write << PAGE_SHIFT) 它将异常写入æƒé™ï¼Œå³ï¼Œå¦‚æžœ
+CPU pte 没有设置写æƒé™ï¼Œé‚£ä¹ˆHMM将调用handle_mm_fault()。
+
+hmm_range_fault 完æˆåŽï¼Œæ ‡å¿—ä½è¢«è®¾ç½®ä¸ºé¡µè¡¨çš„当å‰çŠ¶æ€ï¼Œå³ HMM_PFN_VALID | 如果页
+é¢å¯å†™ï¼Œå°†è®¾ç½® HMM_PFN_WRITE。
+
+
+从核心内核的角度表示和管ç†è®¾å¤‡å†…å­˜
+==================================
+
+å°è¯•äº†å‡ ç§ä¸åŒçš„设计æ¥æ”¯æŒè®¾å¤‡å†…存。第一个使用特定于设备的数æ®ç»“æž„æ¥ä¿å­˜æœ‰å…³è¿ç§»å†…å­˜
+çš„ä¿¡æ¯ï¼ŒHMM 将自身挂接到 mm 代ç çš„å„个ä½ç½®ï¼Œä»¥å¤„ç†å¯¹è®¾å¤‡å†…存支æŒçš„地å€çš„任何访问。
+事实è¯æ˜Žï¼Œè¿™æœ€ç»ˆå¤åˆ¶äº† struct page 的大部分字段,并且还需è¦æ›´æ–°è®¸å¤šå†…核代ç è·¯å¾„æ‰
+能ç†è§£è¿™ç§æ–°çš„内存类型。
+
+大多数内核代ç è·¯å¾„从ä¸å°è¯•è®¿é—®é¡µé¢åŽé¢çš„内存,而åªå…³å¿ƒstruct page的内容。正因为如此,
+HMM 切æ¢åˆ°ç›´æŽ¥ä½¿ç”¨ struct page 用于设备内存,这使得大多数内核代ç è·¯å¾„ä¸çŸ¥é“差异。
+我们åªéœ€è¦ç¡®ä¿æ²¡æœ‰äººè¯•å›¾ä»Ž CPU 端映射这些页é¢ã€‚
+
+移入和移出设备内存
+==================
+
+由于 CPU 无法直接访问设备内存,因此设备驱动程åºå¿…须使用硬件 DMA 或设备特定的加载/å­˜
+储指令æ¥è¿ç§»æ•°æ®ã€‚migrate_vma_setup()ã€migrate_vma_pages() å’Œ
+migrate_vma_finalize() 函数旨在使驱动程åºæ›´æ˜“于编写并集中跨驱动程åºçš„通用代ç ã€‚
+
+在将页é¢è¿ç§»åˆ°è®¾å¤‡ç§æœ‰å†…存之å‰ï¼Œéœ€è¦åˆ›å»ºç‰¹æ®Šçš„设备ç§æœ‰ ``struct page`` 。这些将用
+作特殊的“交æ¢â€é¡µè¡¨æ¡ç›®ï¼Œä»¥ä¾¿ CPU 进程在å°è¯•è®¿é—®å·²è¿ç§»åˆ°è®¾å¤‡ä¸“用内存的页é¢æ—¶ä¼šå‘生异常。
+
+这些å¯ä»¥é€šè¿‡ä»¥ä¸‹æ–¹å¼åˆ†é…和释放::
+
+ struct resource *res;
+ struct dev_pagemap pagemap;
+
+ res = request_free_mem_region(&iomem_resource, /* number of bytes */,
+ "name of driver resource");
+ pagemap.type = MEMORY_DEVICE_PRIVATE;
+ pagemap.range.start = res->start;
+ pagemap.range.end = res->end;
+ pagemap.nr_range = 1;
+ pagemap.ops = &device_devmem_ops;
+ memremap_pages(&pagemap, numa_node_id());
+
+ memunmap_pages(&pagemap);
+ release_mem_region(pagemap.range.start, range_len(&pagemap.range));
+
+还有devm_request_free_mem_region(), devm_memremap_pages(),
+devm_memunmap_pages() å’Œ devm_release_mem_region() 当资æºå¯ä»¥ç»‘定到 ``struct device``.
+
+整体è¿ç§»æ­¥éª¤ç±»ä¼¼äºŽåœ¨ç³»ç»Ÿå†…存中è¿ç§» NUMA 页é¢(see :ref:`Page migration <page_migration>`) ,
+但这些步骤分为设备驱动程åºç‰¹å®šä»£ç å’Œå…±äº«å…¬å…±ä»£ç :
+
+1. ``mmap_read_lock()``
+
+ 设备驱动程åºå¿…须将 ``struct vm_area_struct`` 传递给migrate_vma_setup(),
+ 因此需è¦åœ¨è¿ç§»æœŸé—´ä¿ç•™ mmap_read_lock() 或 mmap_write_lock()。
+
+2. ``migrate_vma_setup(struct migrate_vma *args)``
+
+ 设备驱动åˆå§‹åŒ–了 ``struct migrate_vma`` 的字段,并将该指针传递给
+ migrate_vma_setup()。``args->flags`` 字段是用æ¥è¿‡æ»¤å“ªäº›æºé¡µé¢åº”该被è¿ç§»ã€‚
+ 例如,设置 ``MIGRATE_VMA_SELECT_SYSTEM`` å°†åªè¿ç§»ç³»ç»Ÿå†…存,设置
+ ``MIGRATE_VMA_SELECT_DEVICE_PRIVATE`` å°†åªè¿ç§»é©»ç•™åœ¨è®¾å¤‡ç§æœ‰å†…存中的页
+ é¢ã€‚如果åŽè€…被设置, ``args->pgmap_owner`` 字段被用æ¥è¯†åˆ«é©±åŠ¨æ‰€æ‹¥æœ‰çš„设备
+ ç§æœ‰é¡µã€‚这就é¿å…了试图è¿ç§»é©»ç•™åœ¨å…¶ä»–设备中的设备ç§æœ‰é¡µã€‚ç›®å‰ï¼Œåªæœ‰åŒ¿åçš„ç§æœ‰VMA
+ 范围å¯ä»¥è¢«è¿ç§»åˆ°ç³»ç»Ÿå†…存和设备ç§æœ‰å†…存。
+
+ migrate_vma_setup()所åšçš„第一步是用 ``mmu_notifier_invalidate_range_start()``
+ å’Œ ``mmu_notifier_invalidate_range_end()`` 调用æ¥é历设备周围的页表,使
+ 其他设备的MMU无效,以便在 ``args->src`` 数组中填写è¦è¿ç§»çš„PFN。
+ ``invalidate_range_start()`` 回调传递给一个``struct mmu_notifier_range`` ,
+ 其 ``event`` 字段设置为MMU_NOTIFY_MIGRATE, ``owner`` 字段设置为传递给
+ migrate_vma_setup()çš„ ``args->pgmap_owner`` 字段。这å…许设备驱动跳过无
+ 效化回调,åªæ— æ•ˆåŒ–那些实际正在è¿ç§»çš„设备ç§æœ‰MMU映射。这一点将在下一节详细解释。
+
+
+ 在é历页表时,一个 ``pte_none()`` 或 ``is_zero_pfn()`` æ¡ç›®å¯¼è‡´ä¸€ä¸ªæœ‰æ•ˆ
+ çš„ “zero†PFN 存储在 ``args->src`` 阵列中。这让驱动分é…设备ç§æœ‰å†…存并清
+ 除它,而ä¸æ˜¯å¤åˆ¶ä¸€ä¸ªé›¶é¡µã€‚到系统内存或设备ç§æœ‰ç»“构页的有效PTEæ¡ç›®å°†è¢«
+ ``lock_page()``é”定,与LRU隔离(如果系统内存和设备ç§æœ‰é¡µä¸åœ¨LRU上),从进
+ 程中å–消映射,并æ’入一个特殊的è¿ç§»PTEæ¥ä»£æ›¿åŽŸæ¥çš„PTE。 migrate_vma_setup()
+ 还清除了 ``args->dst`` 数组。
+
+3. 设备驱动程åºåˆ†é…目标页é¢å¹¶å°†æºé¡µé¢å¤åˆ¶åˆ°ç›®æ ‡é¡µé¢ã€‚
+
+ 驱动程åºæ£€æŸ¥æ¯ä¸ª ``src`` æ¡ç›®ä»¥æŸ¥çœ‹è¯¥ ``MIGRATE_PFN_MIGRATE`` ä½æ˜¯å¦å·²
+ 设置并跳过未è¿ç§»çš„æ¡ç›®ã€‚设备驱动程åºè¿˜å¯ä»¥é€šè¿‡ä¸å¡«å……页é¢çš„ ``dst`` 数组æ¥é€‰
+ 择跳过页é¢è¿ç§»ã€‚
+
+ 然åŽï¼Œé©±åŠ¨ç¨‹åºåˆ†é…一个设备ç§æœ‰ struct page 或一个系统内存页,用 ``lock_page()``
+ é”定该页,并将 ``dst`` 数组æ¡ç›®å¡«å…¥::
+
+ dst[i] = migrate_pfn(page_to_pfn(dpage));
+
+ 现在驱动程åºçŸ¥é“这个页é¢æ­£åœ¨è¢«è¿ç§»ï¼Œå®ƒå¯ä»¥ä½¿è®¾å¤‡ç§æœ‰ MMU 映射无效并将设备ç§æœ‰
+ 内存å¤åˆ¶åˆ°ç³»ç»Ÿå†…存或å¦ä¸€ä¸ªè®¾å¤‡ç§æœ‰é¡µé¢ã€‚由于核心 Linux å†…æ ¸ä¼šå¤„ç† CPU 页表失
+ 效,因此设备驱动程åºåªéœ€ä½¿å…¶è‡ªå·±çš„ MMU 映射失效。
+
+ 驱动程åºå¯ä»¥ä½¿ç”¨ ``migrate_pfn_to_page(src[i])`` æ¥èŽ·å–æºè®¾å¤‡çš„
+ ``struct page`` é¢ï¼Œå¹¶å°†æºé¡µé¢å¤åˆ¶åˆ°ç›®æ ‡è®¾å¤‡ä¸Šï¼Œå¦‚果指针为 ``NULL`` ,æ„
+ 味ç€æºé¡µé¢æ²¡æœ‰è¢«å¡«å……到系统内存中,则清除目标设备的ç§æœ‰å†…存。
+
+4. ``migrate_vma_pages()``
+
+ 这一步是实际“æ交â€è¿ç§»çš„地方。
+
+ 如果æºé¡µæ˜¯ ``pte_none()`` 或 ``is_zero_pfn()`` 页,这时新分é…的页会被æ’
+ 入到CPU的页表中。如果一个CPU线程在åŒä¸€é¡µé¢ä¸Šå‘生异常,这å¯èƒ½ä¼šå¤±è´¥ã€‚然而,页
+ 表被é”定,åªæœ‰ä¸€ä¸ªæ–°é¡µä¼šè¢«æ’入。如果它失去了竞争,设备驱动将看到
+ ``MIGRATE_PFN_MIGRATE`` ä½è¢«æ¸…除。
+
+ 如果æºé¡µè¢«é”定ã€éš”ç¦»ç­‰ï¼Œæº ``struct page`` ä¿¡æ¯çŽ°åœ¨è¢«å¤åˆ¶åˆ°ç›®æ ‡
+ ``struct page`` ,最终完æˆCPU端的è¿ç§»ã€‚
+
+5. 设备驱动为ä»åœ¨è¿ç§»çš„页é¢æ›´æ–°è®¾å¤‡MMU页表,回滚未è¿ç§»çš„页é¢ã€‚
+
+ 如果 ``src`` æ¡ç›®ä»ç„¶æœ‰ ``MIGRATE_PFN_MIGRATE`` ä½è¢«è®¾ç½®ï¼Œè®¾å¤‡é©±åŠ¨å¯ä»¥
+ 更新设备MMU,如果 ``MIGRATE_PFN_WRITE`` ä½è¢«è®¾ç½®ï¼Œåˆ™è®¾ç½®å†™å¯ç”¨ä½ã€‚
+
+6. ``migrate_vma_finalize()``
+
+ 这一步用新页的页表项替æ¢ç‰¹æ®Šçš„è¿ç§»é¡µè¡¨é¡¹ï¼Œå¹¶é‡Šæ”¾å¯¹æºå’Œç›®çš„ ``struct page``
+ 的引用。
+
+7. ``mmap_read_unlock()``
+
+ 现在å¯ä»¥é‡Šæ”¾é”了。
+
+独å è®¿é—®å­˜å‚¨å™¨
+==============
+
+一些设备具有诸如原å­PTEä½çš„功能,å¯ä»¥ç”¨æ¥å®žçŽ°å¯¹ç³»ç»Ÿå†…存的原å­è®¿é—®ã€‚为了支æŒå¯¹ä¸€
+个共享的虚拟内存页的原å­æ“作,这样的设备需è¦å¯¹è¯¥é¡µçš„访问是排他的,而ä¸æ˜¯æ¥è‡ªCPU
+的任何用户空间访问。 ``make_device_exclusive_range()`` 函数å¯ä»¥ç”¨æ¥ä½¿ä¸€
+个内存范围ä¸èƒ½ä»Žç”¨æˆ·ç©ºé—´è®¿é—®ã€‚
+
+这将用特殊的交æ¢æ¡ç›®æ›¿æ¢ç»™å®šèŒƒå›´å†…的所有页的映射。任何试图访问交æ¢æ¡ç›®çš„行为都会
+导致一个异常,该异常会通过用原始映射替æ¢è¯¥æ¡ç›®è€Œå¾—到æ¢å¤ã€‚驱动程åºä¼šè¢«é€šçŸ¥æ˜ å°„å·²
+ç»è¢«MMU通知器改å˜ï¼Œä¹‹åŽå®ƒå°†ä¸å†æœ‰å¯¹è¯¥é¡µçš„独å è®¿é—®ã€‚独å è®¿é—®è¢«ä¿è¯æŒç»­åˆ°é©±åŠ¨ç¨‹åº
+放弃页é¢é”和页é¢å¼•ç”¨ä¸ºæ­¢ï¼Œè¿™æ—¶é¡µé¢ä¸Šçš„任何CPU异常都å¯ä»¥æŒ‰æ‰€è¿°è¿›è¡Œã€‚
+
+内存 cgroup (memcg) 和 rss 统计
+===============================
+
+ç›®å‰ï¼Œè®¾å¤‡å†…存被视为 rss 计数器中的任何常规页é¢ï¼ˆå¦‚果设备页é¢ç”¨äºŽåŒ¿å,则为匿å,
+如果设备页é¢ç”¨äºŽæ–‡ä»¶æ”¯æŒé¡µé¢ï¼Œåˆ™ä¸ºæ–‡ä»¶ï¼Œå¦‚果设备页é¢ç”¨äºŽå…±äº«å†…存,则为 shmem)。
+这是为了ä¿æŒçŽ°æœ‰åº”用程åºçš„æ•…æ„选择,这些应用程åºå¯èƒ½åœ¨ä¸çŸ¥æƒ…的情况下开始使用设备
+内存,è¿è¡Œä¸å—å½±å“。
+
+一个缺点是 OOM æ€æ‰‹å¯èƒ½ä¼šæ€æ­»ä½¿ç”¨å¤§é‡è®¾å¤‡å†…存而ä¸æ˜¯å¤§é‡å¸¸è§„系统内存的应用程åºï¼Œ
+å› æ­¤ä¸ä¼šé‡Šæ”¾å¤ªå¤šç³»ç»Ÿå†…存。在决定以ä¸åŒæ–¹å¼è®¡ç®—设备内存之å‰ï¼Œæˆ‘们希望收集更多关
+于应用程åºå’Œç³»ç»Ÿåœ¨å­˜åœ¨è®¾å¤‡å†…存的情况下在内存压力下如何å应的实际ç»éªŒã€‚
+
+对内存 cgroup åšå‡ºäº†ç›¸åŒçš„决定。设备内存页é¢æ ¹æ®ç›¸åŒçš„内存 cgroup 计算,常规
+页é¢å°†è¢«è®¡ç®—在内。这确实简化了进出设备内存的è¿ç§»ã€‚这也æ„味ç€ä»Žè®¾å¤‡å†…å­˜è¿ç§»å›žå¸¸è§„
+内存ä¸ä¼šå¤±è´¥ï¼Œå› ä¸ºå®ƒä¼šè¶…过内存 cgroup é™åˆ¶ã€‚一旦我们对设备内存的使用方å¼åŠå…¶å¯¹
+内存资æºæŽ§åˆ¶çš„å½±å“有了更多的了解,我们å¯èƒ½ä¼šåœ¨åŽé¢é‡æ–°è€ƒè™‘这个选择。
+
+请注æ„,设备内存永远ä¸èƒ½ç”±è®¾å¤‡é©±åŠ¨ç¨‹åºæˆ–通过 GUP 固定,因此此类内存在进程退出时
+总是被释放的。或者在共享内存或文件支æŒå†…存的情况下,当删除最åŽä¸€ä¸ªå¼•ç”¨æ—¶ã€‚
diff --git a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
new file mode 100644
index 000000000000..c6d471ce2131
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst
@@ -0,0 +1,436 @@
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/vm/hugetlbfs_reserv.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==============
+Hugetlbfs 预留
+==============
+
+概述
+====
+
+:ref:`hugetlbpage` 中æ述的巨页通常是预先分é…给应用程åºä½¿ç”¨çš„。如果VMA指
+示è¦ä½¿ç”¨å·¨é¡µï¼Œè¿™äº›å·¨é¡µä¼šåœ¨ç¼ºé¡µå¼‚常时被实例化到任务的地å€ç©ºé—´ã€‚如果在缺页异常
+时没有巨页存在,任务就会被å‘é€ä¸€ä¸ªSIGBUS,并ç»å¸¸ä¸é«˜å…´åœ°æ­»åŽ»ã€‚在加入巨页支
+æŒåŽä¸ä¹…,人们决定,在mmap()时检测巨页的短缺情况会更好。这个想法是,如果
+没有足够的巨页æ¥è¦†ç›–映射,mmap()将失败。这首先是在mmap()时在代ç ä¸­åšä¸€ä¸ª
+简å•çš„检查,以确定是å¦æœ‰è¶³å¤Ÿçš„空闲巨页æ¥è¦†ç›–映射。就åƒå†…核中的大多数东西一
+样,代ç éšç€æ—¶é—´çš„推移而ä¸æ–­å‘展。然而,基本的想法是在mmap()æ—¶ “预留â€
+巨页,以确ä¿å·¨é¡µå¯ä»¥ç”¨äºŽè¯¥æ˜ å°„中的缺页异常。下é¢çš„æ述试图æ述在v4.10内核
+中是如何进行巨页预留处ç†çš„。
+
+
+读者
+====
+这个æ述主è¦æ˜¯é’ˆå¯¹æ­£åœ¨ä¿®æ”¹hugetlbfs代ç çš„内核开å‘者。
+
+
+æ•°æ®ç»“æž„
+========
+
+resv_huge_pages
+ 这是一个全局的(per-hstate)预留的巨页的计数。预留的巨页åªå¯¹é¢„留它们的任
+ 务å¯ç”¨ã€‚因此,一般å¯ç”¨çš„巨页的数é‡è¢«è®¡ç®—为(``free_huge_pages - resv_huge_pages``)。
+Reserve Map
+ 预留映射由以下结构体æè¿°::
+
+ struct resv_map {
+ struct kref refs;
+ spinlock_t lock;
+ struct list_head regions;
+ long adds_in_progress;
+ struct list_head region_cache;
+ long region_cache_count;
+ };
+
+ 系统中æ¯ä¸ªå·¨é¡µæ˜ å°„都有一个预留映射。resv_map中的regions列表æ述了映射中的
+ 区域。一个区域被æ述为::
+
+ struct file_region {
+ struct list_head link;
+ long from;
+ long to;
+ };
+
+ file_region结构体的 ‘from’ å’Œ ‘to’ 字段是进入映射的巨页索引。根æ®æ˜ å°„的类型,在
+ reserv_map 中的一个区域å¯èƒ½è¡¨ç¤ºè¯¥èŒƒå›´å­˜åœ¨é¢„留,或预留ä¸å­˜åœ¨ã€‚
+Flags for MAP_PRIVATE Reservations
+ 这些被存储在预留的映射指针的底部。
+
+ ``#define HPAGE_RESV_OWNER (1UL << 0)``
+ 表示该任务是与该映射相关的预留的所有者。
+ ``#define HPAGE_RESV_UNMAPPED (1UL << 1)``
+ 表示最åˆæ˜ å°„此范围(并创建储备)的任务由于COW失败而从该任务(å­ä»»åŠ¡ï¼‰ä¸­å–消映
+ 射了一个页é¢ã€‚
+Page Flags
+ PagePrivate页é¢æ ‡å¿—是用æ¥æŒ‡ç¤ºåœ¨é‡Šæ”¾å·¨é¡µæ—¶å¿…é¡»æ¢å¤å·¨é¡µçš„预留。更多细节将在
+ “释放巨页†一节中讨论。
+
+
+预留映射ä½ç½®ï¼ˆç§æœ‰æˆ–共享)
+==========================
+
+一个巨页映射或段è¦ä¹ˆæ˜¯ç§æœ‰çš„,è¦ä¹ˆæ˜¯å…±äº«çš„。如果是ç§æœ‰çš„,它通常åªå¯¹ä¸€ä¸ªåœ°å€ç©ºé—´
+(任务)å¯ç”¨ã€‚如果是共享的,它å¯ä»¥è¢«æ˜ å°„到多个地å€ç©ºé—´ï¼ˆä»»åŠ¡ï¼‰ã€‚对于这两ç§ç±»åž‹çš„映射,
+预留映射的ä½ç½®å’Œè¯­ä¹‰æ˜¯æ˜Žæ˜¾ä¸åŒçš„。ä½ç½®çš„差异是:
+
+- 对于ç§æœ‰æ˜ å°„,预留映射挂在VMA结构体上。具体æ¥è¯´ï¼Œå°±æ˜¯vma->vm_private_data。这个ä¿
+ 留映射是在创建映射(mmap(MAP_PRIVATE))时创建的。
+- 对于共享映射,预留映射挂在inode上。具体æ¥è¯´ï¼Œå°±æ˜¯inode->i_mapping->private_data。
+ 由于共享映射总是由hugetlbfs文件系统中的文件支æŒï¼Œhugetlbfs代ç ç¡®ä¿æ¯ä¸ªèŠ‚点包å«ä¸€ä¸ªé¢„
+ 留映射。因此,预留映射在创建节点时被分é…。
+
+
+创建预留
+========
+当创建一个巨大的有页é¢æ”¯æŒçš„共享内存段(shmget(SHM_HUGETLB))或通过mmap(MAP_HUGETLB)
+创建一个映射时,就会创建预留。这些æ“作会导致对函数hugetlb_reserve_pages()的调用::
+
+ int hugetlb_reserve_pages(struct inode *inode,
+ long from, long to,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+
+hugetlb_reserve_pages()åšçš„第一件事是检查在调用shmget()或mmap()时是å¦æŒ‡å®šäº†NORESERVE
+标志。如果指定了NORESERVE,那么这个函数立å³è¿”回,因为ä¸éœ€è¦é¢„留。
+
+å‚æ•°'from'å’Œ'to'是映射或基础文件的巨页索引。对于shmget(),'from'总是0,'to'对应于段/映射
+的长度。对于mmap(),offsetå‚æ•°å¯ä»¥ç”¨æ¥æŒ‡å®šè¿›å…¥åº•å±‚文件的å移é‡ã€‚在这ç§æƒ…况下,'from'å’Œ'to'
+å‚æ•°å·²ç»è¢«è¿™ä¸ªå移é‡æ‰€è°ƒæ•´ã€‚
+
+PRIVATEå’ŒSHARED映射之间的一个很大的区别是预留在预留映射中的表示方å¼ã€‚
+
+- 对于共享映射,预留映射中的æ¡ç›®è¡¨ç¤ºå¯¹åº”页é¢çš„预留存在或曾ç»å­˜åœ¨ã€‚当预留被消耗时,预留映射ä¸è¢«
+ 修改。
+- 对于ç§æœ‰æ˜ å°„,预留映射中没有æ¡ç›®è¡¨ç¤ºç›¸åº”页é¢å­˜åœ¨é¢„留。éšç€é¢„留被消耗,æ¡ç›®è¢«æ·»åŠ åˆ°é¢„留映射中。
+ 因此,预留映射也å¯ç”¨äºŽç¡®å®šå“ªäº›é¢„留已被消耗。
+
+对于ç§æœ‰æ˜ å°„,hugetlb_reserve_pages()创建预留映射并将其挂在VMA结构体上。此外,
+HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。
+
+预留映射被查阅以确定当å‰æ˜ å°„/段需è¦å¤šå°‘巨页预留。对于ç§æœ‰æ˜ å°„,这始终是一个值(to - from)。
+然而,对于共享映射æ¥è¯´ï¼Œä¸€äº›é¢„ç•™å¯èƒ½å·²ç»å­˜åœ¨äºŽ(to - from)的范围内。关于如何实现这一点的细节,
+请å‚è§ :ref:`预留映射的修改 <resv_map_modifications>` 一节。
+
+该映射å¯èƒ½ä¸Žä¸€ä¸ªå­æ± ï¼ˆsubpool)相关è”。如果是这样,将查询å­æ± ä»¥ç¡®ä¿æœ‰è¶³å¤Ÿçš„空间用于映射。å­æ± 
+有可能已经预留了可用于映射的预留空间。更多细节请参见 :ref:`子池预留 <sub_pool_resv>`
+一节。
+
+在咨询了预留映射和å­æ± ä¹‹åŽï¼Œå°±çŸ¥é“了需è¦çš„新预留数é‡ã€‚hugetlb_acct_memory()函数被调用以检查
+并获å–所è¦æ±‚的预留数é‡ã€‚hugetlb_acct_memory()调用到å¯èƒ½åˆ†é…和调整剩余页数的函数。然而,在这
+些函数中,代ç åªæ˜¯æ£€æŸ¥ä»¥ç¡®ä¿æœ‰è¶³å¤Ÿçš„空闲的巨页æ¥å®¹çº³é¢„留。如果有的è¯ï¼Œå…¨å±€é¢„留计数resv_huge_pages
+会被调整,如下所示::
+
+ if (resv_needed <= (free_huge_pages - resv_huge_pages))
+ resv_huge_pages += resv_needed;
+
+注意,在检查和调整这些计数器时,全局锁hugetlb_lock会被持有。
+
+如果有足够的空闲的巨页,并且全局计数resv_huge_pages被调整,那么与映射相关的预留映射被修改以
+å映预留。在共享映射的情况下,将存在一个file_region,包括'from'-'to'范围。对于ç§æœ‰æ˜ å°„,
+ä¸å¯¹é¢„留映射进行修改,因为没有æ¡ç›®è¡¨ç¤ºå­˜åœ¨é¢„留。
+
+如果hugetlb_reserve_pages()æˆåŠŸï¼Œå…¨å±€é¢„留数和与映射相关的预留映射将根æ®éœ€è¦è¢«ä¿®æ”¹ï¼Œä»¥ç¡®ä¿
+在'from'-'to'范围内存在预留。
+
+消耗预留/分é…一个巨页
+===========================
+
+当与预留相关的巨页在相应的映射中被分é…和实例化时,预留就被消耗了。该分é…是在函数alloc_huge_page()
+中进行的::
+
+ struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve)
+
+alloc_huge_page被传递给一个VMA指针和一个虚拟地å€ï¼Œå› æ­¤å®ƒå¯ä»¥æŸ¥é˜…预留映射以确定是å¦å­˜åœ¨é¢„留。
+此外,alloc_huge_page需è¦ä¸€ä¸ªå‚æ•°avoid_reserve,该å‚数表示å³ä½¿çœ‹èµ·æ¥å·²ç»ä¸ºæŒ‡å®šçš„地å€é¢„留了
+预留,也ä¸åº”该使用预留。avoid_reserveå‚数最常被用于写时拷è´å’Œé¡µé¢è¿ç§»çš„情况下,å³çŽ°æœ‰é¡µé¢çš„é¢
+外拷è´è¢«åˆ†é…。
+
+
+调用辅助函数vma_needs_reservation()æ¥ç¡®å®šæ˜¯å¦å­˜åœ¨å¯¹æ˜ å°„(vma)中地å€çš„预留。关于这个函数的详
+细内容,请å‚è§ :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。从
+vma_needs_reservation()返回的值通常为0或1。如果该地å€å­˜åœ¨é¢„留,则为0,如果ä¸å­˜åœ¨é¢„留,则为1。
+如果ä¸å­˜åœ¨é¢„留,并且有一个与映射相关è”çš„å­æ± ï¼Œåˆ™æŸ¥è¯¢å­æ± ä»¥ç¡®å®šå®ƒæ˜¯å¦åŒ…å«é¢„留。如果å­æ± åŒ…å«é¢„留,
+则å¯å°†å…¶ä¸­ä¸€ä¸ªç”¨äºŽè¯¥åˆ†é…。然而,在任何情况下,avoid_reserveå‚数都会优先考虑为分é…使用预留。在
+确定预留是å¦å­˜åœ¨å¹¶å¯ç”¨äºŽåˆ†é…åŽï¼Œè°ƒç”¨dequeue_huge_page_vma()函数。这个函数需è¦ä¸¤ä¸ªä¸Žé¢„留有关
+çš„å‚数:
+
+- avoid_reserve,这是传递给alloc_huge_page()çš„åŒä¸€ä¸ªå€¼/å‚数。
+- chg,尽管这个å‚数的类型是long,但åªæœ‰0或1的值被传递给dequeue_huge_page_vma。如果该值为0,
+ 则表明存在预留(关于å¯èƒ½çš„问题,请å‚è§ â€œé¢„ç•™å’Œå†…å­˜ç­–ç•¥â€ ä¸€èŠ‚ï¼‰ã€‚å¦‚æžœå€¼
+ 为1,则表示ä¸å­˜åœ¨é¢„留,如果å¯èƒ½çš„è¯ï¼Œå¿…须从全局空闲池中å–出该页。
+
+与VMA的内存策略相关的空闲列表被æœç´¢åˆ°ä¸€ä¸ªç©ºé—²é¡µã€‚如果找到了一个页é¢ï¼Œå½“该页é¢ä»Žç©ºé—²åˆ—表中移除时,
+free_huge_pages的值被递å‡ã€‚如果有一个与该页相关的预留,将进行以下调整::
+
+ SetPagePrivate(page); /* 表示分é…这个页é¢æ¶ˆè€—了一个预留,
+ * 如果é‡åˆ°é”™è¯¯ï¼Œä»¥è‡³äºŽå¿…须释放这个页é¢ï¼Œé¢„留将被
+ * æ¢å¤ã€‚ */
+ resv_huge_pages--; /* å‡å°‘全局预留计数 */
+
+注æ„,如果找ä¸åˆ°æ»¡è¶³VMA内存策略的巨页,将å°è¯•ä½¿ç”¨ä¼™ä¼´åˆ†é…器分é…一个。这就带æ¥äº†è¶…出预留范围
+的剩余巨页和超é¢åˆ†é…的问题。å³ä½¿åˆ†é…了一个多余的页é¢ï¼Œä¹Ÿä¼šè¿›è¡Œä¸Žä¸Šé¢ä¸€æ ·çš„基于预留的调整:
+SetPagePrivate(page) 和 resv_huge_pages--.
+
+在获得一个新的巨页åŽï¼Œ(page)->private被设置为与该页é¢ç›¸å…³çš„å­æ± çš„值,如果它存在的è¯ã€‚当页
+é¢è¢«é‡Šæ”¾æ—¶ï¼Œè¿™å°†è¢«ç”¨äºŽå­æ± çš„计数。
+
+然åŽè°ƒç”¨å‡½æ•°vma_commit_reservation(),根æ®é¢„留的消耗情况调整预留映射。一般æ¥è¯´ï¼Œè¿™æ¶‰åŠ
+到确ä¿é¡µé¢åœ¨åŒºåŸŸæ˜ å°„çš„file_region结构体中被表示。对于预留存在的共享映射,预留映射中的æ¡ç›®
+å·²ç»å­˜åœ¨ï¼Œæ‰€ä»¥ä¸åšä»»ä½•æ”¹å˜ã€‚然而,如果共享映射中没有预留,或者这是一个ç§æœ‰æ˜ å°„,则必须创建一
+个新的æ¡ç›®ã€‚
+
+注æ„,如果找ä¸åˆ°æ»¡è¶³VMA内存策略的巨页,将å°è¯•ä½¿ç”¨ä¼™ä¼´åˆ†é…器分é…一个。这就带æ¥äº†è¶…出预留范围
+的剩余巨页和过度分é…的问题。å³ä½¿åˆ†é…了一个多余的页é¢ï¼Œä¹Ÿä¼šè¿›è¡Œä¸Žä¸Šé¢ä¸€æ ·çš„基于预留的调整。
+SetPagePrivate(page)和resv_huge_pages--。
+
+在获得一个新的巨页åŽï¼Œ(page)->private被设置为与该页é¢ç›¸å…³çš„å­æ± çš„值,如果它存在的è¯ã€‚当页
+é¢è¢«é‡Šæ”¾æ—¶ï¼Œè¿™å°†è¢«ç”¨äºŽå­æ± çš„计数。
+
+然åŽè°ƒç”¨å‡½æ•°vma_commit_reservation(),根æ®é¢„留的消耗情况调整预留映射。一般æ¥è¯´ï¼Œè¿™æ¶‰åŠ
+到确ä¿é¡µé¢åœ¨åŒºåŸŸæ˜ å°„çš„file_region结构体中被表示。对于预留存在的共享映射,预留映射中的æ¡ç›®
+å·²ç»å­˜åœ¨ï¼Œæ‰€ä»¥ä¸åšä»»ä½•æ”¹å˜ã€‚然而,如果共享映射中没有预留,或者这是一个ç§æœ‰æ˜ å°„,则必须创建
+一个新的æ¡ç›®ã€‚
+
+在alloc_huge_page()开始调用vma_needs_reservation()和页é¢åˆ†é…åŽè°ƒç”¨
+vma_commit_reservation()之间,预留映射有å¯èƒ½è¢«æ”¹å˜ã€‚如果hugetlb_reserve_pages在共
+享映射中为åŒä¸€é¡µé¢è¢«è°ƒç”¨ï¼Œè¿™å°†æ˜¯å¯èƒ½çš„。在这ç§æƒ…况下,预留计数和å­æ± ç©ºé—²é¡µè®¡æ•°ä¼šæœ‰ä¸€ä¸ªå差。
+è¿™ç§ç½•è§çš„情况å¯ä»¥é€šè¿‡æ¯”较vma_needs_reservationå’Œvma_commit_reservation的返回值æ¥
+识别。如果检测到这ç§ç«žäº‰ï¼Œå­æ± å’Œå…¨å±€é¢„留计数将被调整以进行补å¿ã€‚关于这些函数的更多信æ¯ï¼Œè¯·
+å‚è§ :ref:`预留映射帮助函数 <resv_map_helpers>` 一节。
+
+
+实例化巨页
+==========
+
+在巨页分é…之åŽï¼Œé¡µé¢é€šå¸¸è¢«æ·»åŠ åˆ°åˆ†é…任务的页表中。在此之å‰ï¼Œå…±äº«æ˜ å°„中的页é¢è¢«æ·»åŠ åˆ°é¡µé¢ç¼“
+存中,ç§æœ‰æ˜ å°„中的页é¢è¢«æ·»åŠ åˆ°åŒ¿ååå‘映射中。在这两ç§æƒ…况下,PagePrivate标志被清除。因此,
+当一个已ç»å®žä¾‹åŒ–的巨页被释放时,ä¸ä¼šå¯¹å…¨å±€é¢„留计数(resv_huge_pages)进行调整。
+
+
+释放巨页
+========
+
+巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfså¤åˆé¡µçš„æžæž„器。因此,它åªä¼ 
+递一个指å‘页é¢ç»“构体的指针。当一个巨页被释放时,å¯èƒ½éœ€è¦è¿›è¡Œé¢„留计算。如果该页与包å«ä¿
+留的å­æ± ç›¸å…³è”,或者该页在错误路径上被释放,必须æ¢å¤å…¨å±€é¢„留计数,就会出现这ç§æƒ…况。
+
+page->private字段指å‘与该页相关的任何å­æ± ã€‚如果PagePrivate标志被设置,它表明全局预留计数
+应该被调整(关于如何设置这些标志的信æ¯ï¼Œè¯·å‚è§
+:ref:`消耗预留/分配一个巨页 <consume_resv>` )。
+
+
+该函数首先调用hugepage_subpool_put_pages()æ¥å¤„ç†è¯¥é¡µã€‚如果这个函数返回一个0的值(ä¸ç­‰äºŽ
+传递的1的值),它表明预留与å­æ± ç›¸å…³è”,这个新释放的页é¢å¿…须被用æ¥ä¿æŒå­æ± é¢„留的数é‡è¶…过最å°å€¼ã€‚
+因此,在这ç§æƒ…况下,全局resv_huge_pages计数器被递增。
+
+如果页é¢ä¸­è®¾ç½®äº†PagePrivate标志,那么全局resv_huge_pages计数器将永远被递增。
+
+å­æ± é¢„ç•™
+========
+
+有一个结构体hstate与æ¯ä¸ªå·¨é¡µå°ºå¯¸ç›¸å…³è”。hstate跟踪所有指定大å°çš„巨页。一个å­æ± ä»£è¡¨ä¸€
+个hstate中的页面子集,它与一个已挂载的hugetlbfs文件系统相关联。
+
+当一个hugetlbfs文件系统被挂载时,å¯ä»¥æŒ‡å®šmin_size选项,它表示文件系统所需的最å°çš„巨页数é‡ã€‚
+如果指定了这个选项,与min_size相对应的巨页的数é‡å°†è¢«é¢„留给文件系统使用。这个数字在结构体
+hugepage_subpool的min_hpages字段中被跟踪。在挂载时,hugetlb_acct_memory(min_hpages)
+被调用以预留指定数é‡çš„巨页。如果它们ä¸èƒ½è¢«é¢„留,挂载就会失败。
+
+当从å­æ± ä¸­èŽ·å–或释放页é¢æ—¶ï¼Œä¼šè°ƒç”¨hugepage_subpool_get/put_pages()函数。
+hugepage_subpool_get/put_pages被传递给巨页数é‡ï¼Œä»¥æ­¤æ¥è°ƒæ•´å­æ± çš„ “已用页é¢â€ 计数
+(get为下é™ï¼Œput为上å‡ï¼‰ã€‚通常情况下,如果å­æ± ä¸­æ²¡æœ‰è¶³å¤Ÿçš„页é¢ï¼Œå®ƒä»¬ä¼šè¿”回与传递的相åŒçš„值或
+一个错误。
+
+然而,如果预留与å­æ± ç›¸å…³è”,å¯èƒ½ä¼šè¿”回一个å°äºŽä¼ é€’值的返回值。这个返回值表示必须进行的é¢å¤–全局
+池调整的数é‡ã€‚例如,å‡è®¾ä¸€ä¸ªå­æ± åŒ…å«3个预留的巨页,有人è¦æ±‚5个。与å­æ± ç›¸å…³çš„3个预留页å¯ä»¥ç”¨æ¥
+满足部分请求。但是,必须从全局池中获得2个页é¢ã€‚为了å‘调用者转达这一信æ¯ï¼Œå°†è¿”回值2。然åŽï¼Œè°ƒç”¨
+者è¦è´Ÿè´£ä»Žå…¨å±€æ± ä¸­èŽ·å–å¦å¤–两个页é¢ã€‚
+
+
+COW和预留
+==========
+
+由于共享映射都指å‘并使用相åŒçš„底层页é¢ï¼ŒCOW最大的预留问题是ç§æœ‰æ˜ å°„。在这ç§æƒ…况下,两个任务å¯
+以指å‘åŒä¸€ä¸ªå…ˆå‰åˆ†é…的页é¢ã€‚一个任务试图写到该页,所以必须分é…一个新的页,以便æ¯ä¸ªä»»åŠ¡éƒ½æŒ‡å‘它
+自己的页。
+
+当该页最åˆè¢«åˆ†é…时,该页的预留被消耗了。当由于COW而试图分é…一个新的页é¢æ—¶ï¼Œæœ‰å¯èƒ½æ²¡æœ‰ç©ºé—²çš„å·¨
+页,分é…会失败。
+
+当最åˆåˆ›å»ºç§æœ‰æ˜ å°„时,通过设置所有者的预留映射指针中的HPAGE_RESV_OWNERä½æ¥æ ‡è®°æ˜ å°„的所有者。
+由于所有者创建了映射,所有者拥有与映射相关的所有预留。因此,当一个写异常å‘生并且没有å¯ç”¨çš„页é¢
+时,对预留的所有者和éžæ‰€æœ‰è€…采å–ä¸åŒçš„行动。
+
+在å‘生异常的任务ä¸æ˜¯æ‰€æœ‰è€…的情况下,异常将失败,该任务通常会收到一个SIGBUS。
+
+如果所有者是å‘生异常的任务,我们希望它能够æˆåŠŸï¼Œå› ä¸ºå®ƒæ‹¥æœ‰åŽŸå§‹çš„预留。为了达到这个目的,该页被
+从éžæ‰€æœ‰è€…任务中解映射出æ¥ã€‚这样一æ¥ï¼Œå”¯ä¸€çš„引用就是æ¥è‡ªæ‹¥æœ‰è€…的任务。此外,HPAGE_RESV_UNMAPPED
+ä½è¢«è®¾ç½®åœ¨éžæ‹¥æœ‰ä»»åŠ¡çš„预留映射指针中。如果éžæ‹¥æœ‰è€…任务åŽæ¥åœ¨ä¸€ä¸ªä¸å­˜åœ¨çš„页é¢ä¸Šå‘生异常,它å¯èƒ½
+会收到一个SIGBUS。但是,映射/预留的原始拥有者的行为将与预期一致。
+
+预留映射的修改
+==============
+
+以下低级函数用于对预留映射进行修改。通常情况下,这些函数ä¸ä¼šè¢«ç›´æŽ¥è°ƒç”¨ã€‚而是调用一个预留映射辅
+助函数,该函数调用这些低级函数中的一个。这些低级函数在æºä»£ç ï¼ˆmm/hugetlb.c)中得到了相当好的
+记录。这些函数是::
+
+ long region_chg(struct resv_map *resv, long f, long t);
+ long region_add(struct resv_map *resv, long f, long t);
+ void region_abort(struct resv_map *resv, long f, long t);
+ long region_count(struct resv_map *resv, long f, long t);
+
+在预留映射上的æ“作通常涉åŠä¸¤ä¸ªæ“作:
+
+1) region_chg()被调用æ¥æ£€æŸ¥é¢„留映射,并确定在指定的范围[f, t]内有多少页目å‰æ²¡æœ‰è¢«ä»£è¡¨ã€‚
+
+ 调用代ç æ‰§è¡Œå…¨å±€æ£€æŸ¥å’Œåˆ†é…,以确定是å¦æœ‰è¶³å¤Ÿçš„巨页使æ“作æˆåŠŸã€‚
+
+2)
+ a) 如果操作能够成功,region_add()将被调用,以实际修改先前传递给region_chg()的相同范围
+ [f, t]的预留映射。
+ b) 如果æ“作ä¸èƒ½æˆåŠŸï¼Œregion_abort被调用,在相åŒçš„范围[f, t]内中止æ“作。
+
+注æ„,这是一个两步的过程, region_add()å’Œ region_abort()在事先调用 region_chg()åŽä¿è¯
+æˆåŠŸã€‚ region_chg()负责预先分é…任何必è¦çš„æ•°æ®ç»“构以确ä¿åŽç»­æ“作(特别是 region_add())的
+æˆåŠŸã€‚
+
+如上所述,region_chg()确定该范围内当å‰æ²¡æœ‰åœ¨æ˜ å°„中表示的页é¢çš„æ•°é‡ã€‚region_add()返回添加
+到映射中的范围内的页数。在大多数情况下, region_add() 的返回值与 region_chg() 的返回值相
+åŒã€‚然而,在共享映射的情况下,有å¯èƒ½åœ¨è°ƒç”¨ region_chg() å’Œ region_add() 之间对预留映射进
+行更改。在这种情况下,region_add()的返回值将与region_chg()的返回值不符。在这种情况下,全局计数
+å’Œå­æ± è®¡æ•°å¾ˆå¯èƒ½æ˜¯ä¸æ­£ç¡®çš„,需è¦è°ƒæ•´ã€‚检查这ç§æƒ…况并进行适当的调整是调用者的责任。
+
+函数region_del()被调用以从预留映射中移除区域。
+它通常在以下情况下被调用:
+
+- 当hugetlbfs文件系统中的一个文件被删除时,该节点将被释放,预留映射也被释放。在释放预留映射
+ 之å‰ï¼Œæ‰€æœ‰å•ç‹¬çš„file_region结构体必须被释放。在这ç§æƒ…况下,region_del的范围是[0, LONG_MAX]。
+- 当一个hugetlbfs文件正在被截断时。在这ç§æƒ…况下,所有在新文件大å°ä¹‹åŽåˆ†é…的页é¢å¿…须被释放。
+ 此外,预留映射中任何超过新文件大å°çš„file_regionæ¡ç›®å¿…须被删除。在这ç§æƒ…况下,region_del
+ 的范围是[new_end_of_file, LONG_MAX]。
+- 当在一个hugetlbfs文件中打洞时。在这ç§æƒ…况下,巨页被一次次从文件的中间移除。当这些页被移除
+ 时,region_del()被调用以从预留映射中移除相应的æ¡ç›®ã€‚在这ç§æƒ…况下,region_del被传递的范
+ 围是[page_idx, page_idx + 1]。
+
+在任何情况下,region_del()都会返回从预留映射中删除的页é¢æ•°é‡ã€‚在éžå¸¸ç½•è§çš„情况下,region_del()
+会失败。这åªèƒ½å‘生在打洞的情况下,å³å®ƒå¿…须分割一个现有的file_regionæ¡ç›®ï¼Œè€Œä¸èƒ½åˆ†é…一个新的
+结构体。在这ç§é”™è¯¯æƒ…况下,region_del()将返回-ENOMEM。这里的问题是,预留映射将显示对该页有
+预留。然而,å­æ± å’Œå…¨å±€é¢„留计数将ä¸å映该预留。为了处ç†è¿™ç§æƒ…况,调用函数hugetlb_fix_reserve_counts()
+æ¥è°ƒæ•´è®¡æ•°å™¨ï¼Œä½¿å…¶ä¸Žä¸èƒ½è¢«åˆ é™¤çš„预留映射æ¡ç›®ç›¸å¯¹åº”。
+
+region_count()在解除ç§æœ‰å·¨é¡µæ˜ å°„时被调用。在ç§æœ‰æ˜ å°„中,预留映射中没有æ¡ç›®è¡¨æ˜Žå­˜åœ¨ä¸€ä¸ªé¢„留。
+因此,通过计算预留映射中的æ¡ç›®æ•°ï¼Œæˆ‘们知é“有多少预留被消耗了,有多少预留是未完æˆçš„
+(Outstanding = (end - start) - region_count(resv, start, end))。由于映射正在消
+失,å­æ± å’Œå…¨å±€é¢„留计数被未完æˆçš„预留数é‡æ‰€å‡åŽ»ã€‚
+
+预留映射帮助函数
+================
+
+有几个辅助函数å¯ä»¥æŸ¥è¯¢å’Œä¿®æ”¹é¢„留映射。这些函数åªå¯¹ç‰¹å®šçš„巨页的预留感兴趣,所以它们åªæ˜¯ä¼ å…¥ä¸€ä¸ª
+地å€è€Œä¸æ˜¯ä¸€ä¸ªèŒƒå›´ã€‚此外,它们还传入相关的VMA。从VMA中,å¯ä»¥ç¡®å®šæ˜ å°„的类型(ç§æœ‰æˆ–共享)和预留
+映射的ä½ç½®ï¼ˆinode或VMA)。这些函数åªæ˜¯è°ƒç”¨ “预留映射的修改†一节中æ述的基础函数。然而,
+它们确实考虑到了ç§æœ‰å’Œå…±äº«æ˜ å°„的预留映射æ¡ç›®çš„ “相å†å«ä¹‰ï¼Œå¹¶å‘调用者éšè—了这个细节::
+
+ long vma_needs_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+
+该函数为指定的页é¢è°ƒç”¨ region_chg()。如果ä¸å­˜åœ¨é¢„留,则返回1。如果存在预留,则返回0::
+
+ long vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+
+这将调用 region_add(),用于指定的页é¢ã€‚与region_chgå’Œregion_add的情况一样,该函数应在
+å…ˆå‰è°ƒç”¨çš„vma_needs_reservationåŽè°ƒç”¨ã€‚它将为该页添加一个预留æ¡ç›®ã€‚如果预留被添加,它将
+返回1,如果没有则返回0。返回值应与之å‰è°ƒç”¨vma_needs_reservation的返回值进行比较。如果出
+现æ„外的差异,说明在两次调用之间修改了预留映射::
+
+ void vma_end_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+
+这将调用指定页é¢çš„ region_abort()。与region_chgå’Œregion_abort的情况一样,该函数应在
+å…ˆå‰è°ƒç”¨çš„vma_needs_reservationåŽè¢«è°ƒç”¨ã€‚它将中止/结æŸæ­£åœ¨è¿›è¡Œçš„预留添加æ“作::
+
+ long vma_add_reservation(struct hstate *h,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+
+这是一个特殊的包装函数,有助于在错误路径上清理预留。它只从restore_reserve_on_error()函数
+中调用。该函数与vma_needs_reservation一起使用,试图将一个预留添加到预留映射中。它考虑到
+了ç§æœ‰å’Œå…±äº«æ˜ å°„çš„ä¸åŒé¢„留映射语义。因此,region_add被调用用于共享映射(因为映射中的æ¡ç›®è¡¨
+示预留),而region_del被调用用于ç§æœ‰æ˜ å°„(因为映射中没有æ¡ç›®è¡¨ç¤ºé¢„留)。关于在错误路径上需
+è¦åšä»€ä¹ˆçš„更多信æ¯ï¼Œè¯·å‚è§ â€œé”™è¯¯è·¯å¾„ä¸­çš„é¢„ç•™æ¸…ç†â€ 。
+
+
+错误路径中的预留清ç†
+====================
+
+正如在:ref:`预留映射帮助函数<resv_map_helpers>` 一节中æ到的,预留的修改分两步进行。首
+先,在分é…页é¢ä¹‹å‰è°ƒç”¨vma_needs_reservation。如果分é…æˆåŠŸï¼Œåˆ™è°ƒç”¨vma_commit_reservation。
+如果ä¸æ˜¯ï¼Œåˆ™è°ƒç”¨vma_end_reservation。全局和å­æ± çš„预留计数根æ®æ“作的æˆåŠŸæˆ–失败进行调整,
+一切都很好。
+
+此外,在一个巨页被实例化åŽï¼ŒPagePrivate标志被清空,这样,当页é¢æœ€ç»ˆè¢«é‡Šæ”¾æ—¶ï¼Œè®¡æ•°æ˜¯
+正确的。
+
+然而,有几ç§æƒ…况是,在一个巨页被分é…åŽï¼Œä½†åœ¨å®ƒè¢«å®žä¾‹åŒ–之å‰ï¼Œå°±é‡åˆ°äº†é”™è¯¯ã€‚在这ç§æƒ…况下,
+页é¢åˆ†é…å·²ç»æ¶ˆè€—了预留,并进行了适当的å­æ± ã€é¢„留映射和全局计数调整。如果页é¢åœ¨è¿™ä¸ªæ—¶å€™è¢«é‡Šæ”¾
+(在实例化和清除PagePrivate之å‰ï¼‰ï¼Œé‚£ä¹ˆfree_huge_page将增加全局预留计数。然而,预留映射
+显示预留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高,
+并阻止分é…一个预先分é…的页é¢ã€‚
+
+函数 restore_reserve_on_error() 试图处ç†è¿™ç§æƒ…况。它有相当完善的文档。这个函数的目的
+是将预留映射æ¢å¤åˆ°é¡µé¢åˆ†é…å‰çš„状æ€ã€‚通过这ç§æ–¹å¼ï¼Œé¢„留映射的状æ€å°†ä¸Žé¡µé¢é‡Šæ”¾åŽçš„全局预留计
+数相对应。
+
+函数restore_reserve_on_error本身在试图æ¢å¤é¢„留映射æ¡ç›®æ—¶å¯èƒ½ä¼šé‡åˆ°é”™è¯¯ã€‚在这ç§æƒ…况下,
+它将简å•åœ°æ¸…除该页的PagePrivate标志。这样一æ¥ï¼Œå½“页é¢è¢«é‡Šæ”¾æ—¶ï¼Œå…¨å±€é¢„留计数将ä¸ä¼šè¢«é€’增。
+然而,预留映射将继续看起æ¥åƒé¢„留被消耗了一样。一个页é¢ä»ç„¶å¯ä»¥è¢«åˆ†é…到该地å€ï¼Œä½†å®ƒä¸ä¼šåƒæœ€
+åˆè®¾æƒ³çš„那样使用一个预留页。
+
+有一些代ç ï¼ˆæœ€æ˜Žæ˜¾çš„是userfaultfd)ä¸èƒ½è°ƒç”¨restore_reserve_on_error。在这ç§æƒ…况下,
+它简å•åœ°ä¿®æ”¹äº†PagePrivate,以便在释放巨页时ä¸ä¼šæ³„露预留。
+
+
+预留和内存策略
+==============
+当git第一次被用æ¥ç®¡ç†Linux代ç æ—¶ï¼Œæ¯ä¸ªèŠ‚点的巨页列表就存在于hstate结构中。预留的概念是
+在一段时间åŽåŠ å…¥çš„。当预留被添加时,没有å°è¯•å°†å†…存策略考虑在内。虽然cpusets与内存策略ä¸
+完全相åŒï¼Œä½†hugetlb_acct_memory中的这个注释总结了预留和cpusets/内存策略之间的相互作
+用::
+
+
+ /*
+ * 当cpuset被é…置时,它打破了严格的hugetlb页é¢é¢„留,因为计数是在一个全局å˜é‡ä¸Šå®Œ
+ * æˆçš„。在有cpuset的情况下,这样的预留完全是垃圾,因为预留没有根æ®å½“å‰cpusetçš„
+ * 页é¢å¯ç”¨æ€§æ¥æ£€æŸ¥ã€‚在任务所在的cpuset中缺ä¹ç©ºé—²çš„htlb页é¢æ—¶ï¼Œåº”用程åºä»ç„¶æœ‰å¯èƒ½
+ * 被内核OOM'ed。试图用cpusetæ¥æ‰§è¡Œä¸¥æ ¼çš„计数几乎是ä¸å¯èƒ½çš„(或者说太难看了),因
+ * 为cpuset太ä¸ç¨³å®šäº†ï¼Œä»»åŠ¡æˆ–内存节点å¯ä»¥åœ¨cpuset之间动æ€ç§»åŠ¨ã€‚与cpuset共享
+ * hugetlb映射的语义å˜åŒ–是ä¸å¯å–的。然而,为了预留一些语义,我们退回到检查当å‰ç©ºé—²
+ * 页的å¯ç”¨æ€§ï¼Œä½œä¸ºä¸€ç§æœ€å¥½çš„å°è¯•ï¼Œå¸Œæœ›èƒ½å°†cpuset改å˜è¯­ä¹‰çš„å½±å“é™åˆ°æœ€ä½Žã€‚
+ */
+
+添加巨页预留是为了防止在缺页异常时出现æ„外的页é¢åˆ†é…失败(OOM)。然而,如果一个应用
+程åºä½¿ç”¨cpusets或内存策略,就ä¸èƒ½ä¿è¯åœ¨æ‰€éœ€çš„节点上有巨页å¯ç”¨ã€‚å³ä½¿æœ‰è¶³å¤Ÿæ•°é‡çš„全局
+预留,也是如此。
+
+Hugetlbfs回归测试
+=================
+
+最完整的hugetlb测试集在libhugetlbfs仓库。如果你修改了任何hugetlb相关的代ç ï¼Œè¯·ä½¿ç”¨
+libhugetlbfs测试套件æ¥æ£€æŸ¥å›žå½’情况。此外,如果你添加了任何新的hugetlb功能,请在
+libhugetlbfs中添加适当的测试。
+
+--
+Mike Kravetz,2017年4月7日
diff --git a/Documentation/translations/zh_CN/vm/hwpoison.rst b/Documentation/translations/zh_CN/vm/hwpoison.rst
new file mode 100644
index 000000000000..c6e1e7bdb05b
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/hwpoison.rst
@@ -0,0 +1,166 @@
+
+:Original: Documentation/vm/hwpoison.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+hwpoison
+========
+
+什么是hwpoison?
+================
+
+
+å³å°†æŽ¨å‡ºçš„英特尔CPU支æŒä»Žä¸€äº›å†…存错误中æ¢å¤ï¼ˆ ``MCAæ¢å¤`` )。这需è¦æ“作系统宣布
+一个页é¢"poisoned",æ€æ­»ä¸Žä¹‹ç›¸å…³çš„进程,并é¿å…在未æ¥ä½¿ç”¨å®ƒã€‚
+
+这个补ä¸åŒ…在虚拟机中实现了必è¦çš„(编程)框架。
+
+引用概述中的评论::
+
+ 高级机器的检查与处ç†ã€‚处ç†æ–¹æ³•æ˜¯æŸå的页é¢è¢«ç¡¬ä»¶æŠ¥å‘Šï¼Œé€šå¸¸æ˜¯ç”±äºŽ2ä½ECC内
+ 存或高速缓存故障。
+
+ 这主è¦æ˜¯é’ˆå¯¹åœ¨åŽå°æ£€æµ‹åˆ°çš„æŸå的页é¢ã€‚当当å‰çš„CPU试图访问它时,当å‰è¿è¡Œçš„进程
+ å¯ä»¥ç›´æŽ¥è¢«æ€æ­»ã€‚因为还没有访问æŸå的页é¢, 如果错误由于æŸç§åŽŸå› ä¸èƒ½è¢«å¤„ç†ï¼Œå°±å¯
+ 以安全地忽略它. 而ä¸æ˜¯ç”¨å¦å¤–一个机器检查去处ç†å®ƒã€‚
+
+ 处ç†ä¸åŒçŠ¶æ€çš„页é¢ç¼“存页。这里棘手的部分是,相对于其他虚拟内存用户, 我们å¯ä»¥å¼‚
+ 步访问任何页é¢ã€‚因为内存故障å¯èƒ½éšæ—¶éšåœ°å‘生,å¯èƒ½è¿å了他们的一些å‡è®¾ã€‚这就是
+ 为什么这段代ç å¿…é¡»éžå¸¸å°å¿ƒã€‚一般æ¥è¯´ï¼Œå®ƒè¯•å›¾ä½¿ç”¨æ­£å¸¸çš„é”规则,如获得标准é”,å³ä½¿
+ è¿™æ„味ç€é”™è¯¯å¤„ç†å¯èƒ½éœ€è¦å¾ˆé•¿çš„时间。
+
+ 这里的一些æ“作有点低效,并且具有éžçº¿æ€§çš„算法å¤æ‚性,因为数æ®ç»“构没有针对这ç§æƒ…
+ 况进行优化。特别是从vma到进程的映射就是这ç§æƒ…况。由于这ç§æƒ…况大概率是罕è§çš„,所
+ 以我们希望我们å¯ä»¥æ‘†è„±è¿™ç§æƒ…况。
+
+该代ç ç”±mm/memory-failure.c中的高级处ç†ç¨‹åºã€ä¸€ä¸ªæ–°çš„页é¢poisonä½å’Œè™šæ‹Ÿæœºä¸­çš„
+å„ç§æ£€æŸ¥ç»„æˆï¼Œç”¨æ¥å¤„ç†poison的页é¢ã€‚
+
+现在主è¦ç›®æ ‡æ˜¯KVM客户机,但它适用于所有类型的应用程åºã€‚支æŒKVM需è¦æœ€è¿‘çš„qemu-kvm
+版本。
+
+对于KVM的使用,需è¦ä¸€ä¸ªæ–°çš„ä¿¡å·ç±»åž‹ï¼Œè¿™æ ·KVMå°±å¯ä»¥ç”¨é€‚当的地å€å°†æœºå™¨æ£€æŸ¥æ³¨å…¥åˆ°å®¢æˆ·
+机中。这在ç†è®ºä¸Šä¹Ÿå…许其他应用程åºå¤„ç†å†…存故障。我们的期望是,所有的应用程åºéƒ½ä¸è¦è¿™
+æ ·åšï¼Œä½†ä¸€äº›éžå¸¸ä¸“业的应用程åºå¯èƒ½ä¼šè¿™æ ·åšã€‚
+
+æ•…éšœæ¢å¤æ¨¡å¼
+============
+
+内存故障恢复可以处于两种(实际上是三种)模式:
+
+vm.memory_failure_recovery sysctl 置零:
+ 所有的内存故障都会导致panic。请ä¸è¦å°è¯•æ¢å¤ã€‚
+
+早期处ç†
+ (å¯ä»¥åœ¨å…¨å±€å’Œæ¯ä¸ªè¿›ç¨‹ä¸­æŽ§åˆ¶) 一旦检测到错误,立å³å‘应用程åºå‘é€SIGBUSè¿™å…许
+ 应用程åºä»¥æ¸©å’Œçš„æ–¹å¼å¤„ç†å†…存错误(例如,放弃å—å½±å“的对象) 这是KVM qemu使用的
+ 模å¼ã€‚
+
+推迟处ç†
+ 当应用程åºè¿è¡Œåˆ°æŸå的页é¢æ—¶ï¼Œå‘é€SIGBUS。这对ä¸çŸ¥é“内存错误的应用程åºæ¥è¯´æ˜¯
+ 最好的,默认情况下注æ„一些页é¢æ€»æ˜¯è¢«å½“作late kill处ç†ã€‚
+
+用户控制
+========
+
+vm.memory_failure_recovery
+ å‚阅 sysctl.txt
+
+vm.memory_failure_early_kill
+ 全局å¯ç”¨early kill
+
+PR_MCE_KILL
+ 设置early/late kill mode/revert 到系统默认值。
+
+ arg1: PR_MCE_KILL_CLEAR:
+ æ¢å¤åˆ°ç³»ç»Ÿé»˜è®¤å€¼
+ arg1: PR_MCE_KILL_SET:
+ arg2定义了线程特定模å¼
+
+ PR_MCE_KILL_EARLY:
+ Early kill
+ PR_MCE_KILL_LATE:
+ Late kill
+ PR_MCE_KILL_DEFAULT
+ 使用系统全局默认值
+
+ 注æ„,如果你想有一个专门的线程代表进程处ç†SIGBUS(BUS_MCEERR_AO),你应该在
+ 指定线程上调用prctl(PR_MCE_KILL_EARLY)。å¦åˆ™ï¼ŒSIGBUS将被å‘é€åˆ°ä¸»çº¿ç¨‹ã€‚
+
+PR_MCE_KILL_GET
+ 返回当å‰æ¨¡å¼
+
+测试
+====
+
+* madvise(MADV_HWPOISON, ....) (as root) - 在测试过程中Poison一个页é¢
+
+* 通过debugfs ``/sys/kernel/debug/hwpoison/`` hwpoison-inject模å—
+
+ corrupt-pfn
+ 在PFN处注入hwpoison故障,并echoed到这个文件。这åšäº†ä¸€äº›æ—©æœŸè¿‡æ»¤ï¼Œä»¥é¿
+ å…在测试套件中æŸåéžé¢„期页é¢ã€‚
+ unpoison-pfn
+ 在PFNçš„Software-unpoison页é¢å¯¹åº”到这个文件。这样,一个页é¢å¯ä»¥å†æ¬¡è¢«
+ å¤ç”¨ã€‚è¿™åªå¯¹Linux注入的故障起作用,对真正的内存故障ä¸èµ·ä½œç”¨ã€‚
+
+ 注æ„这些注入接å£å¹¶ä¸ç¨³å®šï¼Œå¯èƒ½ä¼šåœ¨ä¸åŒçš„内核版本中å‘生å˜åŒ–
+
+ corrupt-filter-dev-major, corrupt-filter-dev-minor
+ åªå¤„ç†ä¸Žå—设备major/minor定义的文件系统相关的页é¢çš„内存故障。-1U是通
+ é…符值。这应该åªç”¨äºŽäººå·¥æ³¨å…¥çš„测试。
+
+ corrupt-filter-memcg
+ é™åˆ¶æ³¨å…¥åˆ°memgroup拥有的页é¢ã€‚ç”±memcgçš„inodeå·æŒ‡å®šã€‚
+
+ Example::
+
+ mkdir /sys/fs/cgroup/mem/hwpoison
+
+ usemem -m 100 -s 1000 &
+ echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
+
+ memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
+ echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
+
+ page-types -p `pidof init` --hwpoison # shall do nothing
+ page-types -p `pidof usemem` --hwpoison # poison its pages
+
+ corrupt-filter-flags-mask, corrupt-filter-flags-value
+ 当指定时,åªæœ‰åœ¨((page_flags & mask) == value)的情况下æ‰ä¼špoison页é¢ã€‚
+ è¿™å…许对许多ç§ç±»çš„页é¢è¿›è¡ŒåŽ‹åŠ›æµ‹è¯•ã€‚page_flags与/proc/kpageflags中的相
+ åŒã€‚这些标志ä½åœ¨include/linux/kernel-page-flags.h中定义,并在
+ Documentation/admin-guide/mm/pagemap.rst中记录。
+
+* 架构特定的MCE注入器
+
+ x86 有 mce-inject, mce-test
+
+ 在mce-test中的一些便æºå¼hwpoison测试程åºï¼Œè§ä¸‹æ–‡ã€‚
+
+引用
+====
+
+http://halobates.de/mce-lc09-2.pdf
+ 09年LinuxCon的概述演讲
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-test.git
+ 测试套件(在tsrc中的hwpoison特定å¯ç§»æ¤æµ‹è¯•ï¼‰ã€‚
+
+git://git.kernel.org/pub/scm/utils/cpu/mce/mce-inject.git
+ x86特定的注入器
+
+
+é™åˆ¶
+====
+- ä¸æ˜¯æ‰€æœ‰çš„页é¢ç±»åž‹éƒ½è¢«æ”¯æŒï¼Œè€Œä¸”永远ä¸ä¼šã€‚大多数内核内部对象ä¸èƒ½è¢«æ¢
+ å¤ï¼Œç›®å‰åªæœ‰LRU页。
+
+---
+Andi Kleen, 2009年10月
diff --git a/Documentation/translations/zh_CN/vm/index.rst b/Documentation/translations/zh_CN/vm/index.rst
index a1d2f0356cc1..a1c6d529b6ff 100644
--- a/Documentation/translations/zh_CN/vm/index.rst
+++ b/Documentation/translations/zh_CN/vm/index.rst
@@ -27,27 +27,28 @@ TODO:待引用文档集被翻译完毕åŽè¯·åŠæ—¶ä¿®æ”¹æ­¤å¤„)
free_page_reporting
highmem
ksm
+ frontswap
+ hmm
+ hwpoison
+ hugetlbfs_reserv
+ memory-model
+ mmu_notifier
+ numa
+ overcommit-accounting
+ page_frags
+ page_owner
+ page_table_check
+ remap_file_pages
+ split_page_table_lock
+ z3fold
+ zsmalloc
TODOLIST:
* arch_pgtable_helpers
* free_page_reporting
-* frontswap
-* hmm
-* hwpoison
* hugetlbfs_reserv
-* memory-model
-* mmu_notifier
-* numa
-* overcommit-accounting
* page_migration
-* page_frags
-* page_owner
-* page_table_check
-* remap_file_pages
* slub
-* split_page_table_lock
* transhuge
* unevictable-lru
* vmalloced-kernel-stacks
-* z3fold
-* zsmalloc
diff --git a/Documentation/translations/zh_CN/vm/memory-model.rst b/Documentation/translations/zh_CN/vm/memory-model.rst
new file mode 100644
index 000000000000..013e30c88d72
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/memory-model.rst
@@ -0,0 +1,135 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/vm/memory-model.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+============
+物ç†å†…存模型
+============
+
+系统中的物ç†å†…å­˜å¯ä»¥ç”¨ä¸åŒçš„æ–¹å¼è¿›è¡Œå¯»å€ã€‚最简å•çš„情况是,物ç†å†…存从地å€0å¼€
+始,跨越一个连续的范围,直到最大的地å€ã€‚然而,这个范围å¯èƒ½åŒ…å«CPU无法访问的
+å°å­”隙。那么,在完全ä¸åŒçš„地å€å¯èƒ½æœ‰å‡ ä¸ªè¿žç»­çš„范围。而且,别忘了NUMA,å³ä¸
+åŒçš„内存库连接到ä¸åŒçš„CPU。
+
+Linux使用两种内存模型中的一种对这种多样性进行抽象:FLATMEM和SPARSEMEM。每
+个架构都定义了它所支æŒçš„内存模型,默认的内存模型是什么,以åŠæ˜¯å¦æœ‰å¯èƒ½æ‰‹åŠ¨
+覆盖该默认值。
+
+所有的内存模型都使用排列在一个或多个数组中的 `struct page` æ¥è·Ÿè¸ªç‰©ç†é¡µ
+帧的状æ€ã€‚
+
+无论选择哪ç§å†…存模型,物ç†é¡µæ¡†å·ï¼ˆPFN)和相应的 `struct page` 之间都存
+在一对一的映射关系。
+
+æ¯ä¸ªå†…存模型都定义了 :c:func:`pfn_to_page` å’Œ :c:func:`page_to_pfn`
+帮助函数,å…许从PFN到 `struct page` 的转æ¢ï¼Œå之亦然。
+
+FLATMEM
+=======
+
+最简å•çš„内存模型是FLATMEM。这个模型适用于éžNUMA系统的连续或大部分连续的
+物ç†å†…存。
+
+在FLATMEM内存模型中,有一个全局的 `mem_map` 数组æ¥æ˜ å°„整个物ç†å†…存。对
+于大多数架构,孔隙在 `mem_map` 数组中都有æ¡ç›®ã€‚与孔洞相对应的 `struct page`
+对象从未被完全åˆå§‹åŒ–。
+
+ä¸ºäº†åˆ†é… `mem_map` 数组,架构特定的设置代ç åº”该调用free_area_init()函数。
+然而,在调用memblock_free_all()函数之å‰ï¼Œæ˜ å°„数组是ä¸èƒ½ä½¿ç”¨çš„,该函数
+将所有的内存交给页分é…器。
+
+一个架构å¯èƒ½ä¼šé‡Šæ”¾ `mem_map` 数组中ä¸åŒ…括实际物ç†é¡µçš„部分。在这ç§æƒ…况下,特
+定架构的 :c:func:`pfn_valid` 实现应该考虑到 `mem_map` 中的孔隙。
+
+使用FLATMEM,PFNå’Œ `struct page` 之间的转æ¢æ˜¯ç›´æŽ¥çš„。 `PFN - ARCH_PFN_OFFSET`
+是 `mem_map` 数组的一个索引。
+
+`ARCH_PFN_OFFSET` 定义了物ç†å†…存起始地å€ä¸åŒäºŽ0的系统的第一个页框å·ã€‚
+
+SPARSEMEM
+=========
+
+SPARSEMEM是Linux中最通用的内存模型,它是唯一支æŒè‹¥å¹²é«˜çº§åŠŸèƒ½çš„内存模型,
+如物ç†å†…存的热æ’æ‹”ã€éžæ˜“失性内存设备的替代内存图和较大系统的内存图的延迟
+åˆå§‹åŒ–。
+
+SPARSEMEM模型将物ç†å†…存显示为一个部分的集åˆã€‚一个区段用mem_section结构
+ä½“è¡¨ç¤ºï¼Œå®ƒåŒ…å« `section_mem_map` ï¼Œä»Žé€»è¾‘ä¸Šè®²ï¼Œå®ƒæ˜¯ä¸€ä¸ªæŒ‡å‘ `struct page`
+阵列的指针。然而,它被存储在一些其他的magic中,以帮助分区管ç†ã€‚区段的大å°
+和最大区段数是使用 `SECTION_SIZE_BITS` å’Œ `MAX_PHYSMEM_BITS` 常é‡
+æ¥æŒ‡å®šçš„,这两个常é‡æ˜¯ç”±æ¯ä¸ªæ”¯æŒSPARSEMEM的架构定义的。 `MAX_PHYSMEM_BITS`
+是一个架构所支æŒçš„物ç†åœ°å€çš„实际宽度,而 `SECTION_SIZE_BITS` 是一个任
+æ„的值。
+
+最大的段数表示为 `NR_MEM_SECTIONS` ,定义为
+
+.. math::
+
+ NR\_MEM\_SECTIONS = 2 ^ {(MAX\_PHYSMEM\_BITS - SECTION\_SIZE\_BITS)}
+
+`mem_section` 对象被安排在一个å«åš `mem_sections` 的二维数组中。这个数组的
+大小和位置取决于 `CONFIG_SPARSEMEM_EXTREME` 和可能的最大段数:
+
+* 当 `CONFIG_SPARSEMEM_EXTREME` 被ç¦ç”¨æ—¶ï¼Œ `mem_sections` 数组是é™æ€çš„,有
+ `NR_MEM_SECTIONS` 行。æ¯ä¸€è¡ŒæŒæœ‰ä¸€ä¸ª `mem_section` 对象。
+* 当 `CONFIG_SPARSEMEM_EXTREME` 被å¯ç”¨æ—¶ï¼Œ `mem_sections` 数组被动æ€åˆ†é…。
+ æ¯ä¸€è¡ŒåŒ…å«ä»·å€¼ `PAGE_SIZE` çš„ `mem_section` 对象,行数的计算是为了适应所有的
+ 内存区。
+
+架构设置代ç åº”该调用sparse_init()æ¥åˆå§‹åŒ–内存区和内存映射。
+
+通过SPARSEMEM,有两种可能的方式将PFN转换为相应的 `struct page` --"classic sparse"和
+"sparse vmemmap"。选择是在构建时进行的,它由 `CONFIG_SPARSEMEM_VMEMMAP` 的
+值决定。
+
+Classic sparse在page->flags中编ç äº†ä¸€ä¸ªé¡µé¢çš„段å·ï¼Œå¹¶ä½¿ç”¨PFN的高ä½æ¥è®¿é—®æ˜ å°„该页
+框的段。在一个区段内,PFN是指å‘页数组的索引。
+
+Sparse vmemmap使用虚拟映射的内存映射来优化pfn_to_page和page_to_pfn操
+作。有一个全局的 `struct page *vmemmap` 指针,指å‘一个虚拟连续的 `struct page`
+对象阵列。PFN是该数组的一个索引,`struct page` 从 `vmemmap` çš„å移é‡æ˜¯è¯¥é¡µçš„PFN。
+
+为了使用vmemmap,一个架构必须ä¿ç•™ä¸€ä¸ªè™šæ‹Ÿåœ°å€çš„范围,以映射包å«å†…存映射的物ç†é¡µï¼Œå¹¶
+确保 `vmemmap` 指向该范围。此外,架构应该实现 :c:func:`vmemmap_populate` 方法,
+它将分é…物ç†å†…存并为虚拟内存映射创建页表。如果一个架构对vmemmap映射没有任何特殊è¦æ±‚,
+它å¯ä»¥ä½¿ç”¨é€šç”¨å†…存管ç†æ供的默认 :c:func:`vmemmap_populate_basepages`。
+
+虚拟映射的内存映射å…许将æŒä¹…性内存设备的 `struct page` 对象存储在这些设备上预先分
+é…的存储中。这ç§å­˜å‚¨ç”¨vmem_altmap结构表示,最终通过一长串的函数调用传递给
+vmemmap_populate()。vmemmap_populate()实现å¯ä»¥ä½¿ç”¨ `vmem_altmap` å’Œ
+:c:func:`vmemmap_alloc_block_buf` 助手æ¥åˆ†é…æŒä¹…性内存设备上的内存映射。
+
+ZONE_DEVICE
+===========
+`ZONE_DEVICE` 设施建立在 `SPARSEMEM_VMEMMAP` 之上,为设备驱动识别的物理地址范
+å›´æä¾› `struct page` `mem_map` æœåŠ¡ã€‚ `ZONE_DEVICE` çš„ "设备" æ–¹é¢ä¸Žä»¥ä¸‹
+事实有关:这些地å€èŒƒå›´çš„页é¢å¯¹è±¡ä»Žæœªè¢«åœ¨çº¿æ ‡è®°è¿‡ï¼Œè€Œä¸”必须对设备进行引用,而ä¸ä»…ä»…
+是页é¢ï¼Œä»¥ä¿æŒå†…存被“é”定â€ä»¥ä¾¿ä½¿ç”¨ã€‚ `ZONE_DEVICE` ,通过 :c:func:`devm_memremap_pages` ,
+为给定的pfns范围执行足够的内存热æ’æ‹”æ¥å¼€å¯ :c:func:`pfn_to_page`,
+:c:func:`page_to_pfn` 和 :c:func:`get_user_pages` 服务。由于页面引
+用计数永远ä¸ä¼šä½ŽäºŽ1,所以页é¢æ°¸è¿œä¸ä¼šè¢«è¿½è¸ªä¸ºç©ºé—²å†…存,页é¢çš„ `struct list_head lru`
+空间被é‡æ–°åˆ©ç”¨ï¼Œç”¨äºŽå‘映射该内存的主机设备/驱动程åºè¿›è¡Œåå‘引用。
+
+虽然 `SPARSEMEM` 将内存作为一个区段的集åˆï¼Œå¯ä»¥é€‰æ‹©æ”¶é›†å¹¶åˆæˆå†…å­˜å—,但
+`ZONE_DEVICE` 用户需è¦æ›´å°çš„颗粒度æ¥å¡«å…… `mem_map` 。鉴于 `ZONE_DEVICE`
+内存从未被在线标记,因此它的内存范围从未通过sysfs内存热æ’æ‹”api暴露在内存å—边界
+上。这个实现ä¾èµ–于这ç§ç¼ºä¹ç”¨æˆ·æŽ¥å£çš„约æŸï¼Œå…许å­æ®µå¤§å°çš„内存范围被指定给
+:c:func:`arch_add_memory` ,å³å†…存热æ’拔的上åŠéƒ¨åˆ†ã€‚å­æ®µæ”¯æŒå…许2MB作为
+:c:func:`devm_memremap_pages` 的跨架构通用对é½é¢—粒度。
+
+`ZONE_DEVICE` 的用户是:
+
+* pmem: 通过DAX映射将平å°æŒä¹…性内存作为直接I/O目标使用。
+
+* hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` ,
+ 以å…许设备驱动程åºå调与设备内存相关的内存管ç†äº‹ä»¶ï¼Œé€šå¸¸æ˜¯GPU内存。å‚è§/vm/hmm.rst。
+
+* p2pdma: 创建 `struct page` 对象,å…许PCI/E拓扑结构中的peer设备å调它们之间的
+ 直接DMAæ“作,å³ç»•è¿‡ä¸»æœºå†…存。
diff --git a/Documentation/translations/zh_CN/vm/mmu_notifier.rst b/Documentation/translations/zh_CN/vm/mmu_notifier.rst
new file mode 100644
index 000000000000..b29a37b33628
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/mmu_notifier.rst
@@ -0,0 +1,97 @@
+:Original: Documentation/vm/mmu_notifier.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+
+什么时候需è¦é¡µè¡¨é”内通知?
+==========================
+
+当清除一个pte/pmd时,我们å¯ä»¥é€‰æ‹©é€šè¿‡åœ¨é¡µè¡¨é”下(通知版的\*_clear_flush调用
+mmu_notifier_invalidate_range)通知事件。但这ç§é€šçŸ¥å¹¶ä¸æ˜¯åœ¨æ‰€æœ‰æƒ…况下都需è¦çš„。
+
+对于二级TLB(éžCPU TLB),如IOMMU TLB或设备TLB(当设备使用类似ATS/PASID的东西让
+IOMMUèµ°CPU页表æ¥è®¿é—®è¿›ç¨‹çš„虚拟地å€ç©ºé—´ï¼‰ã€‚åªæœ‰ä¸¤ç§æƒ…况需è¦åœ¨æ¸…除pte/pmd时在æŒæœ‰é¡µ
+表é”çš„åŒæ—¶é€šçŸ¥è¿™äº›äºŒçº§TLB:
+
+ A) 在mmu_notifier_invalidate_range_end()之å‰ï¼Œæ”¯æŒé¡µçš„地å€è¢«é‡Šæ”¾ã€‚
+ B) 一个页表项被更新以指å‘一个新的页é¢ï¼ˆCOW,零页上的写异常,__replace_page(),...)。
+
+情况A很明显,你ä¸æƒ³å†’风险让设备写到一个现在å¯èƒ½è¢«ä¸€äº›å®Œå…¨ä¸åŒçš„任务使用的页é¢ã€‚
+
+情况B更加微妙。为了正确起è§ï¼Œå®ƒéœ€è¦æŒ‰ç…§ä»¥ä¸‹åºåˆ—å‘生:
+
+ - 上页表é”
+ - 清除页表项并通知 ([pmd/pte]p_huge_clear_flush_notify())
+ - 设置页表项以指å‘新页
+
+如果在设置新的pte/pmd值之å‰ï¼Œæ¸…除页表项之åŽæ²¡æœ‰è¿›è¡Œé€šçŸ¥ï¼Œé‚£ä¹ˆä½ å°±ä¼šç ´å设备的C11或
+C++11等内存模型。
+
+考虑以下情况(设备使用类似于ATS/PASID的功能)。
+
+两个地å€addrAå’ŒaddrB,这样|addrA - addrB| >= PAGE_SIZE,我们å‡è®¾å®ƒä»¬æ˜¯COWçš„
+写ä¿æŠ¤ï¼ˆB的其他情况也适用)。
+
+::
+
+ [Time N] --------------------------------------------------------------------
+ CPU-thread-0 {å°è¯•å†™åˆ°addrA}
+ CPU-thread-1 {å°è¯•å†™åˆ°addrB}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {读å–addrA并填充设备TLB}
+ DEV-thread-2 {读å–addrB并填充设备TLB}
+ [Time N+1] ------------------------------------------------------------------
+ CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+ CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+2] ------------------------------------------------------------------
+ CPU-thread-0 {COW_step1: {更新页表以指å‘addrA的新页}}
+ CPU-thread-1 {COW_step1: {更新页表以指å‘addrB的新页}}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {preempted}
+ CPU-thread-2 {写入addrA,这是对新页é¢çš„写入}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+3] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {preempted}
+ CPU-thread-2 {}
+ CPU-thread-3 {写入addrB,这是一个写入新页的过程}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+4] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {}
+ DEV-thread-2 {}
+ [Time N+5] ------------------------------------------------------------------
+ CPU-thread-0 {preempted}
+ CPU-thread-1 {}
+ CPU-thread-2 {}
+ CPU-thread-3 {}
+ DEV-thread-0 {从旧页中读å–addrA}
+ DEV-thread-2 {从新页é¢è¯»å–addrB}
+
+所以在这里,因为在N+2的时候,清空页表项没有和通知一起作废二级TLB,设备在看到addrA的新值之å‰
+就看到了addrB的新值。这就破å了设备的总内存åºã€‚
+
+当改å˜ä¸€ä¸ªpte的写ä¿æŠ¤æˆ–指å‘一个新的具有相åŒå†…容的写ä¿æŠ¤é¡µï¼ˆKSM)时,将mmu_notifier_invalidate_range
+调用延迟到页表é”外的mmu_notifier_invalidate_range_end()是å¯ä»¥çš„。å³ä½¿åšé¡µè¡¨æ›´æ–°çš„线程
+在释放页表é”åŽä½†åœ¨è°ƒç”¨mmu_notifier_invalidate_range_end()å‰è¢«æŠ¢å ï¼Œä¹Ÿæ˜¯å¦‚此。
diff --git a/Documentation/translations/zh_CN/vm/numa.rst b/Documentation/translations/zh_CN/vm/numa.rst
new file mode 100644
index 000000000000..6af412b924ad
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/numa.rst
@@ -0,0 +1,101 @@
+:Original: Documentation/vm/numa.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+始于1999年11月,作者: <kanoj@sgi.com>
+
+============================
+何为非统一内存访问(NUMA)?
+============================
+
+这个问题å¯ä»¥ä»Žå‡ ä¸ªè§†è§’æ¥å›žç­”:硬件观点和Linux软件视角。
+
+从硬件角度看,NUMA系统是一个由多个组件或装é…组æˆçš„计算机平å°ï¼Œæ¯ä¸ªç»„件å¯èƒ½åŒ…å«0个或更多的CPUã€
+本地内存和/或IO总线。为了简æ´èµ·è§ï¼Œå¹¶å°†è¿™äº›ç‰©ç†ç»„件/装é…的硬件视角与软件抽象区分开æ¥ï¼Œæˆ‘们在
+本文中称这些组件/装é…为“å•å…ƒâ€ã€‚
+
+æ¯ä¸ªâ€œå•å…ƒâ€éƒ½å¯ä»¥çœ‹ä½œæ˜¯ç³»ç»Ÿçš„一个SMP[对称多处ç†å™¨]å­é›†â€”—尽管独立的SMP系统所需的一些组件å¯èƒ½
+ä¸ä¼šåœ¨ä»»ä½•ç»™å®šçš„å•å…ƒä¸Šå¡«å……。NUMA系统的å•å…ƒé€šè¿‡æŸç§ç³»ç»Ÿäº’连连接在一起——例如,交å‰å¼€å…³æˆ–点对点
+链接是NUMA系统互连的常è§ç±»åž‹ã€‚这两ç§ç±»åž‹çš„互连都å¯ä»¥èšåˆèµ·æ¥ï¼Œä»¥åˆ›å»ºNUMAå¹³å°ï¼Œå…¶ä¸­çš„å•å…ƒä¸Žå…¶
+ä»–å•å…ƒæœ‰å¤šä¸ªè·ç¦»ã€‚
+
+对于Linux,感兴趣的NUMA平台主要是所谓的缓存相干NUMA--简称ccNUMA系统。在ccNUMA系统中,
+所有的内存都是å¯è§çš„,并且å¯ä»¥ä»Žè¿žæŽ¥åˆ°ä»»ä½•å•å…ƒçš„任何CPU中访问,缓存一致性是由处ç†å™¨ç¼“存和/或
+系统互连在硬件中处ç†ã€‚
+
+内存访问时间和有效的内存带宽å–决于包å«CPUçš„å•å…ƒæˆ–进行内存访问的IO总线è·ç¦»åŒ…å«ç›®æ ‡å†…存的å•å…ƒ
+有多远。例如,连接到åŒä¸€å•å…ƒçš„CPU对内存的访问将比访问其他远程å•å…ƒçš„内存ç»åŽ†æ›´å¿«çš„访问时间和
+更高的带宽。 NUMAå¹³å°å¯ä»¥åœ¨ä»»ä½•ç»™å®šå•å…ƒä¸Šè®¿é—®å¤šç§è¿œç¨‹è·ç¦»çš„(其他)å•å…ƒã€‚
+
+å¹³å°ä¾›åº”商建立NUMA系统并ä¸åªæ˜¯ä¸ºäº†è®©è½¯ä»¶å¼€å‘人员的生活å˜å¾—有趣。相å,这ç§æž¶æž„是æä¾›å¯æ‰©å±•
+内存带宽的一ç§æ‰‹æ®µã€‚然而,为了实现å¯æ‰©å±•çš„内存带宽,系统和应用软件必须安排大部分的内存引用
+[cache misses]到“本地â€å†…存——åŒä¸€å•å…ƒçš„内存,如果有的è¯â€”—或者到最近的有内存的å•å…ƒã€‚
+
+这就自然而然有了Linux软件对NUMA系统的视角:
+
+Linux将系统的硬件资æºåˆ’分为多个软件抽象,称为“节点â€ã€‚Linux将节点映射到硬件平å°çš„物ç†å•å…ƒ
+上,对一些架构的细节进行了抽象。与物ç†å•å…ƒä¸€æ ·ï¼Œè½¯ä»¶èŠ‚点å¯èƒ½åŒ…å«0或更多的CPUã€å†…存和/或IO
+总线。åŒæ ·ï¼Œå¯¹â€œè¾ƒè¿‘â€èŠ‚点的内存访问——映射到较近å•å…ƒçš„节点——通常会比对较远å•å…ƒçš„访问ç»åŽ†æ›´å¿«
+的访问时间和更高的有效带宽。
+
+对于一些架构,如x86,Linux将“éšè—â€ä»»ä½•ä»£è¡¨æ²¡æœ‰å†…存连接的物ç†å•å…ƒçš„节点,并将连接到该å•å…ƒ
+的任何CPUé‡æ–°åˆ†é…到代表有内存的å•å…ƒçš„节点上。因此,在这些架构上,我们ä¸èƒ½å‡è®¾Linux将所有
+çš„CPU与一个给定的节点相关è”,会看到相åŒçš„本地内存访问时间和带宽。
+
+此外,对于æŸäº›æž¶æž„,åŒæ ·ä»¥x86为例,Linux支æŒå¯¹é¢å¤–节点的仿真。对于NUMA仿真,Linux会将现
+有的节点或者éžNUMAå¹³å°çš„系统内存分割æˆå¤šä¸ªèŠ‚点。æ¯ä¸ªæ¨¡æ‹Ÿçš„节点将管ç†åº•å±‚å•å…ƒç‰©ç†å†…存的一部
+分。NUMA仿真对于在éžNUMAå¹³å°ä¸Šæµ‹è¯•NUMA内核和应用功能是éžå¸¸æœ‰ç”¨çš„,当与cpusets一起使用时,
+å¯ä»¥ä½œä¸ºä¸€ç§å†…存资æºç®¡ç†æœºåˆ¶ã€‚[è§ Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+对于æ¯ä¸ªæœ‰å†…存的节点,Linux构建了一个独立的内存管ç†å­ç³»ç»Ÿï¼Œæœ‰è‡ªå·±çš„空闲页列表ã€ä½¿ç”¨ä¸­é¡µåˆ—表ã€
+使用统计和é”æ¥è°ƒè§£è®¿é—®ã€‚此外,Linux为æ¯ä¸ªå†…存区[DMAã€DMA32ã€NORMALã€HIGH_MEMORYã€MOVABLE
+中的一个或多个]构建了一个有åºçš„“区列表â€ã€‚zonelist指定了当一个选定的区/节点ä¸èƒ½æ»¡è¶³åˆ†é…请求
+æ—¶è¦è®¿é—®çš„区/节点。当一个区没有å¯ç”¨çš„内存æ¥æ»¡è¶³è¯·æ±‚时,这ç§æƒ…况被称为“overflow 溢出â€æˆ–
+“fallback 回退â€ã€‚
+
+由于一些节点包å«å¤šä¸ªåŒ…å«ä¸åŒç±»åž‹å†…存的区,Linux必须决定是å¦å¯¹åŒºåˆ—表进行排åºï¼Œä½¿åˆ†é…回退到ä¸åŒ
+节点上的相åŒåŒºç±»åž‹ï¼Œæˆ–åŒä¸€èŠ‚点上的ä¸åŒåŒºç±»åž‹ã€‚这是一个é‡è¦çš„考虑因素,因为有些区,如DMA或DMA32,
+代表了相对稀缺的资æºã€‚Linux选择了一个默认的Node ordered zonelist。这æ„味ç€åœ¨ä½¿ç”¨æŒ‰NUMAè·
+离排åºçš„远程节点之å‰ï¼Œå®ƒä¼šå°è¯•å›žé€€åˆ°åŒä¸€èŠ‚点的其他分区。
+
+默认情况下,Linux会å°è¯•ä»Žæ‰§è¡Œè¯·æ±‚çš„CPU被分é…到的节点中满足内存分é…请求。具体æ¥è¯´ï¼ŒLinux将试
+图从请求æ¥æºçš„节点的适当分区列表中的第一个节点进行分é…。这被称为“本地分é…â€ã€‚如果“本地â€èŠ‚点ä¸èƒ½
+满足请求,内核将检查所选分区列表中其他节点的区域,寻找列表中第一个能满足请求的区域。
+
+本地分é…将倾å‘于ä¿æŒå¯¹åˆ†é…的内存的åŽç»­è®¿é—® “本地â€çš„底层物ç†èµ„æºå’Œç³»ç»Ÿäº’连——åªè¦å†…核代表其分é…
+一些内存的任务åŽæ¥ä¸ä»Žè¯¥å†…å­˜è¿ç§»ã€‚Linux调度器知é“å¹³å°çš„NUMA拓扑结构——体现在“调度域â€æ•°æ®ç»“æž„
+中[è§ Documentation/scheduler/sched-domains.rst]——并且调度器试图尽é‡å‡å°‘任务è¿ç§»åˆ°é¥
+远的调度域中。然而,调度器并没有直接考虑到任务的NUMA足迹。因此,在充分ä¸å¹³è¡¡çš„情况下,任务å¯
+以在节点之间è¿ç§»ï¼Œè¿œç¦»å…¶åˆå§‹èŠ‚点和内核数æ®ç»“构。
+
+系统管ç†å‘˜å’Œåº”用程åºè®¾è®¡è€…å¯ä»¥ä½¿ç”¨å„ç§CPU亲和命令行接å£ï¼Œå¦‚taskset(1)å’Œnumactl(1),以åŠç¨‹
+åºæŽ¥å£ï¼Œå¦‚sched_setaffinity(2),æ¥é™åˆ¶ä»»åŠ¡çš„è¿ç§»ï¼Œä»¥æ”¹å–„NUMA定ä½ã€‚此外,人们å¯ä»¥ä½¿ç”¨
+Linux NUMA内存策略修改内核的默认本地分é…行为。 [è§
+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`].
+
+系统管ç†å‘˜å¯ä»¥ä½¿ç”¨æŽ§åˆ¶ç»„å’ŒCPUsetsé™åˆ¶éžç‰¹æƒç”¨æˆ·åœ¨è°ƒåº¦æˆ–NUMA命令和功能中å¯ä»¥æŒ‡å®šçš„CPU和节点
+的内存。 [è§ Documentation/admin-guide/cgroup-v1/cpusets.rst]
+
+在ä¸éšè—无内存节点的架构上,Linux会在分区列表中åªåŒ…括有内存的区域[节点]。这æ„味ç€å¯¹äºŽä¸€ä¸ªæ— 
+内存的节点,“本地内存节点â€â€”—CPU节点的分区列表中的第一个区域的节点——将ä¸æ˜¯èŠ‚点本身。相å,它
+将是内核在建立分区列表时选择的离它最近的有内存的节点。所以,默认情况下,本地分é…将由内核æä¾›
+最近的å¯ç”¨å†…å­˜æ¥å®Œæˆã€‚这是åŒä¸€æœºåˆ¶çš„结果,该机制å…许这ç§åˆ†é…在一个包å«å†…存的节点溢出时回退到
+其他附近的节点。
+
+一些内核分é…ä¸å¸Œæœ›æˆ–ä¸èƒ½å®¹å¿è¿™ç§åˆ†é…回退行为。相å,他们想确ä¿ä»–们从指定的节点获得内存,或者
+得到通知说该节点没有空闲内存。例如,当一个å­ç³»ç»Ÿåˆ†é…æ¯ä¸ªCPU的内存资æºæ—¶ï¼Œé€šå¸¸æ˜¯è¿™ç§æƒ…况。
+
+一个典型的分配模式是使用内核的numa_node_id()或cpu_to_node()函数获得“当前CPU”所在节点的
+节点ID,然åŽåªä»Žè¿”回的节点ID请求内存。当这样的分é…失败时,请求的å­ç³»ç»Ÿå¯ä»¥æ¢å¤åˆ°å®ƒè‡ªå·±çš„回退
+路径。æ¿å—内核内存分é…器就是这样的一个例å­ã€‚或者,å­ç³»ç»Ÿå¯ä»¥é€‰æ‹©åœ¨åˆ†é…失败时ç¦ç”¨æˆ–ä¸å¯ç”¨è‡ªå·±ã€‚
+内核分æžå­ç³»ç»Ÿå°±æ˜¯è¿™æ ·çš„一个例å­ã€‚
+
+如果架构支æŒâ€”—ä¸éšè—无内存节点,那么连接到无内存节点的CPU将总是产生回退路径的开销,或者一些
+å­ç³»ç»Ÿå¦‚果试图完全从无内存的节点分é…内存,将无法åˆå§‹åŒ–。为了é€æ˜Žåœ°æ”¯æŒè¿™ç§æž¶æž„,内核å­ç³»ç»Ÿå¯
+以使用numa_mem_id()或cpu_to_mem()函数æ¥å®šä½è°ƒç”¨æˆ–指定CPU的“本地内存节点â€ã€‚åŒæ ·ï¼Œè¿™æ˜¯åŒ
+一个节点,默认的本地页分é…将从这个节点开始å°è¯•ã€‚
diff --git a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst b/Documentation/translations/zh_CN/vm/overcommit-accounting.rst
new file mode 100644
index 000000000000..8765cb118f24
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/overcommit-accounting.rst
@@ -0,0 +1,86 @@
+:Original: Documentation/vm/overcommit-accounting.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+
+==============
+超é‡ä½¿ç”¨å®¡è®¡
+==============
+
+Linux内核支æŒä¸‹åˆ—超é‡ä½¿ç”¨å¤„ç†æ¨¡å¼
+
+0
+ å¯å‘å¼è¶…é‡ä½¿ç”¨å¤„ç†ã€‚æ‹’ç»æ˜Žæ˜¾çš„地å€ç©ºé—´è¶…é‡ä½¿ç”¨ã€‚用于一个典型的系统。
+ 它确ä¿ä¸¥é‡çš„疯狂分é…失败,åŒæ—¶å…许超é‡ä½¿ç”¨ä»¥å‡å°‘swap的使用。在这ç§æ¨¡å¼ä¸‹ï¼Œ
+ å…许root分é…ç¨å¤šçš„内存。这是默认的。
+1
+ 总是超é‡ä½¿ç”¨ã€‚适用于一些科学应用。ç»å…¸çš„例å­æ˜¯ä½¿ç”¨ç¨€ç–数组的代ç ï¼Œåªæ˜¯ä¾èµ–
+ 几乎完全由零页组æˆçš„虚拟内存
+
+2
+ ä¸è¶…é‡ä½¿ç”¨ã€‚系统æ交的总地å€ç©ºé—´ä¸å…许超过swap+一个å¯é…置的物ç†RAMçš„æ•°é‡
+ (默认为50%)。根æ®ä½ ä½¿ç”¨çš„æ•°é‡ï¼Œåœ¨å¤§å¤šæ•°æƒ…况下,这æ„味ç€ä¸€ä¸ªè¿›ç¨‹åœ¨è®¿é—®é¡µé¢æ—¶
+ ä¸ä¼šè¢«æ€æ­»ï¼Œä½†ä¼šåœ¨å†…存分é…上收到相应的错误。
+
+ 对于那些想ä¿è¯ä»–们的内存分é…在未æ¥å¯ç”¨è€Œåˆä¸éœ€è¦åˆå§‹åŒ–æ¯ä¸€ä¸ªé¡µé¢çš„应用程åºæ¥è¯´
+ 是很有用的。
+
+超é‡ä½¿ç”¨ç­–略是通过sysctl `vm.overcommit_memory` 设置的。
+
+å¯ä»¥é€šè¿‡ `vm.overcommit_ratio` (百分比)或 `vm.overcommit_kbytes` (ç»å¯¹å€¼ï¼‰
+æ¥è®¾ç½®è¶…é™æ•°é‡ã€‚这些åªæœ‰åœ¨ `vm.overcommit_memory` 被设置为2æ—¶æ‰æœ‰æ•ˆæžœã€‚
+
+在 ``/proc/meminfo`` 中å¯ä»¥åˆ†åˆ«ä»¥CommitLimitå’ŒCommitted_ASçš„å½¢å¼æŸ¥çœ‹å½“å‰
+的超é‡ä½¿ç”¨å’Œæ交é‡ã€‚
+
+陷阱
+====
+
+C语言的堆栈增长是一个éšå«çš„mremap。如果你想得到ç»å¯¹çš„ä¿è¯ï¼Œå¹¶åœ¨æŽ¥è¿‘边缘的地方è¿è¡Œï¼Œ
+ä½  **å¿…é¡»** 为你认为你需è¦çš„最大尺寸的堆栈进行mmap。对于典型的堆栈使用æ¥è¯´ï¼Œè¿™å¹¶
+ä¸é‡è¦ï¼Œä½†å¦‚果你真的éžå¸¸å…³å¿ƒçš„è¯ï¼Œè¿™å°±æ˜¯ä¸€ä¸ªå€¼å¾—关注的案例。
+
+
+在模å¼2中,MAP_NORESERVE标志被忽略。
+
+
+它是如何工作的
+==============
+
+超é‡ä½¿ç”¨æ˜¯åŸºäºŽä»¥ä¸‹è§„则
+
+对于文件映射
+ | SHARED or READ-only - 0 cost (该文件是映射而ä¸æ˜¯äº¤æ¢)
+ | PRIVATE WRITABLE - æ¯ä¸ªå®žä¾‹çš„映射大å°
+
+对于匿å或者 ``/dev/zero`` 映射
+ | SHARED - 映射的大å°
+ | PRIVATE READ-only - 0 cost (但作用ä¸å¤§)
+ | PRIVATE WRITABLE - æ¯ä¸ªå®žä¾‹çš„映射大å°
+
+é¢å¤–的计数
+ | 通过mmap制作å¯å†™å‰¯æœ¬çš„页é¢
+ | 从åŒä¸€æ± ä¸­æå–çš„shmfs内存
+
+状æ€
+====
+
+* 我们核算mmap内存映射
+* 我们核算mprotect在æ交中的å˜åŒ–
+* 我们核算mremap的大å°å˜åŒ–
+* 我们审计 brk
+* 审计munmap
+* 我们在/proc中报告commit 状æ€
+* 核对并检查分å‰çš„情况
+* 审查堆栈处ç†/执行中的构建
+* å™è¿°SHMfs的情况
+* 实现实际é™åˆ¶çš„执行
+
+å¾…ç»­
+====
+* ptrace 页计数(这很难)。
diff --git a/Documentation/translations/zh_CN/vm/page_frags.rst b/Documentation/translations/zh_CN/vm/page_frags.rst
new file mode 100644
index 000000000000..ad27fed33634
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/page_frags.rst
@@ -0,0 +1,38 @@
+:Original: Documentation/vm/page_frags.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+页é¢ç‰‡æ®µ
+========
+
+一个页é¢ç‰‡æ®µæ˜¯ä¸€ä¸ªä»»æ„长度的任æ„å移的内存区域,它ä½äºŽä¸€ä¸ª0或更高阶的å¤åˆé¡µé¢ä¸­ã€‚
+该页中的多个碎片在该页的引用计数器中被å•ç‹¬è®¡ç®—。
+
+page_frag函数,page_frag_allocå’Œpage_frag_free,为页é¢ç‰‡æ®µæ供了一个简å•
+的分é…框架。这被网络堆栈和网络设备驱动使用,以æ供一个内存的支æŒåŒºåŸŸï¼Œä½œä¸º
+sk_buff->head使用,或者用于skb_shared_info的 “frags†部分。
+
+为了使用页é¢ç‰‡æ®µAPI,需è¦ä¸€ä¸ªæ”¯æŒé¡µé¢ç‰‡æ®µçš„缓冲区。这为碎片分é…æ供了一个中心点,
+并å…许多个调用使用一个缓存的页é¢ã€‚这样åšçš„好处是å¯ä»¥é¿å…对get_page的多次调用,
+这在分é…时开销å¯èƒ½ä¼šå¾ˆå¤§ã€‚然而,由于这ç§ç¼“存的性质,è¦æ±‚任何对缓存的调用都è¦å—到æ¯
+个CPUçš„é™åˆ¶ï¼Œæˆ–者æ¯ä¸ªCPUçš„é™åˆ¶ï¼Œå¹¶åœ¨æ‰§è¡Œç¢Žç‰‡åˆ†é…时强制ç¦æ­¢ä¸­æ–­ã€‚
+
+网络堆栈在æ¯ä¸ªCPU使用两个独立的缓存æ¥å¤„ç†ç¢Žç‰‡åˆ†é…。netdev_alloc_cache被使用
+netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache
+被调用__napi_alloc_fragå’Œ__napi_alloc_skb的调用者使用。这两个调用的主è¦åŒºåˆ«æ˜¯
+它们å¯èƒ½è¢«è°ƒç”¨çš„环境。“netdev†å‰ç¼€çš„函数å¯ä»¥åœ¨ä»»ä½•ä¸Šä¸‹æ–‡ä¸­ä½¿ç”¨ï¼Œå› ä¸ºè¿™äº›å‡½æ•°
+将禁用中断,而 “napi” 前缀的函数只可以在softirq上下文中使用。
+
+许多网络设备驱动程åºä½¿ç”¨ç±»ä¼¼çš„方法æ¥åˆ†é…页é¢ç‰‡æ®µï¼Œä½†é¡µé¢ç‰‡æ®µæ˜¯åœ¨çŽ¯æˆ–æ述符级别上
+缓存的。为了实现这些情况,有必è¦æ供一ç§æ‹†è§£é¡µé¢ç¼“存的通用方法。出于这个原因,
+__page_frag_cache_drain被实现了。它å…许通过一次调用从一个页é¢é‡Šæ”¾å¤šä¸ªå¼•ç”¨ã€‚
+这样åšçš„好处是,它å…许清ç†è¢«æ·»åŠ åˆ°ä¸€ä¸ªé¡µé¢çš„多个引用,以é¿å…æ¯æ¬¡åˆ†é…都调用
+get_page。
+
+Alexander Duyck,2016年11月29日。
diff --git a/Documentation/translations/zh_CN/vm/page_owner.rst b/Documentation/translations/zh_CN/vm/page_owner.rst
new file mode 100644
index 000000000000..9e951fabba9d
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/page_owner.rst
@@ -0,0 +1,116 @@
+:Original: Documentation/vm/page_owner.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+================================
+page owner: 跟踪è°åˆ†é…çš„æ¯ä¸ªé¡µé¢
+================================
+
+概述
+====
+
+page owner是用æ¥è¿½è¸ªè°åˆ†é…çš„æ¯ä¸€ä¸ªé¡µé¢ã€‚它å¯ä»¥ç”¨æ¥è°ƒè¯•å†…存泄æ¼æˆ–找到内存å ç”¨è€…。
+当分é…å‘生时,有关分é…çš„ä¿¡æ¯ï¼Œå¦‚调用堆栈和页é¢çš„顺åºè¢«å­˜å‚¨åˆ°æ¯ä¸ªé¡µé¢çš„特定存储中。
+当我们需è¦äº†è§£æ‰€æœ‰é¡µé¢çš„状æ€æ—¶ï¼Œæˆ‘们å¯ä»¥èŽ·å¾—并分æžè¿™äº›ä¿¡æ¯ã€‚
+
+尽管我们已ç»æœ‰äº†è¿½è¸ªé¡µé¢åˆ†é…/释放的tracepoint,但用它æ¥åˆ†æžè°åˆ†é…çš„æ¯ä¸ªé¡µé¢æ˜¯
+相当å¤æ‚的。我们需è¦æ‰©å¤§è·Ÿè¸ªç¼“冲区,以防止在用户空间程åºå¯åŠ¨å‰å‡ºçŽ°é‡å ã€‚而且,å¯
+动的程åºä¼šä¸æ–­åœ°å°†è·Ÿè¸ªç¼“冲区转出,供以åŽåˆ†æžï¼Œè¿™å°†ä¼šæ”¹å˜ç³»ç»Ÿçš„行为,会产生更多的
+å¯èƒ½æ€§ï¼Œè€Œä¸æ˜¯ä»…ä»…ä¿ç•™åœ¨å†…存中,所以ä¸åˆ©äºŽè°ƒè¯•ã€‚
+
+页é¢æ‰€æœ‰è€…也å¯ä»¥ç”¨äºŽå„ç§ç›®çš„。例如,å¯ä»¥é€šè¿‡æ¯ä¸ªé¡µé¢çš„gfp标志信æ¯èŽ·å¾—精确的碎片
+统计。如果å¯ç”¨äº†page owner,它就已ç»å®žçŽ°å¹¶æ¿€æ´»äº†ã€‚我们éžå¸¸æ¬¢è¿Žå…¶ä»–用途。
+
+page owner在默认情况下是ç¦ç”¨çš„。所以,如果你想使用它,你需è¦åœ¨ä½ çš„å¯åŠ¨cmdline
+中加入"page_owner=on"。如果内核是用page owner构建的,并且由于没有å¯ç”¨å¯åŠ¨
+选项而在è¿è¡Œæ—¶ç¦ç”¨page owner,那么è¿è¡Œæ—¶çš„开销是很å°çš„。如果在è¿è¡Œæ—¶ç¦ç”¨ï¼Œå®ƒä¸
+需è¦å†…å­˜æ¥å­˜å‚¨æ‰€æœ‰è€…ä¿¡æ¯ï¼Œæ‰€ä»¥æ²¡æœ‰è¿è¡Œæ—¶å†…存开销。而且,页é¢æ‰€æœ‰è€…在页é¢åˆ†é…器的
+热路径中åªæ’入了两个ä¸å¯èƒ½çš„分支,如果ä¸å¯ç”¨ï¼Œé‚£ä¹ˆåˆ†é…就会åƒæ²¡æœ‰é¡µé¢æ‰€æœ‰è€…的内核
+一样进行。这两个ä¸å¯èƒ½çš„分支应该ä¸ä¼šå½±å“到分é…的性能,特别是在é™æ€é”®è·³è½¬æ ‡ç­¾ä¿®è¡¥
+功能å¯ç”¨çš„情况下。以下是由于这个功能而导致的内核代ç å¤§å°çš„å˜åŒ–。
+
+- 没有page owner::
+
+ text data bss dec hex filename
+ 48392 2333 644 51369 c8a9 mm/page_alloc.o
+
+- 有page owner::
+
+ text data bss dec hex filename
+ 48800 2445 644 51889 cab1 mm/page_alloc.o
+ 6662 108 29 6799 1a8f mm/page_owner.o
+ 1025 8 8 1041 411 mm/page_ext.o
+
+虽然总共增加了8KB的代ç ï¼Œä½†page_alloc.o增加了520字节,其中ä¸åˆ°ä¸€åŠæ˜¯åœ¨hotpath
+中。构建带有page owner的内核,并在需è¦æ—¶æ‰“开它,将是调试内核内存问题的最佳选择。
+
+有一个问题是由实现细节引起的。页所有者将信æ¯å­˜å‚¨åˆ°struct page扩展的内存中。这
+个内存的åˆå§‹åŒ–时间比稀ç–内存系统中的页é¢åˆ†é…器å¯åŠ¨çš„时间è¦æ™šä¸€äº›ï¼Œæ‰€ä»¥ï¼Œåœ¨åˆå§‹åŒ–
+之å‰ï¼Œè®¸å¤šé¡µé¢å¯ä»¥è¢«åˆ†é…,但它们没有所有者信æ¯ã€‚为了解决这个问题,这些早期分é…çš„
+页é¢åœ¨åˆå§‹åŒ–阶段被调查并标记为分é…。虽然这并ä¸æ„味ç€å®ƒä»¬æœ‰æ­£ç¡®çš„所有者信æ¯ï¼Œä½†è‡³
+少,我们å¯ä»¥æ›´å‡†ç¡®åœ°åˆ¤æ–­è¯¥é¡µæ˜¯å¦è¢«åˆ†é…。在2GB内存的x86-64虚拟机上,有13343
+个早期分é…的页é¢è¢«æ•æ‰å’Œæ ‡è®°ï¼Œå°½ç®¡å®ƒä»¬å¤§éƒ¨åˆ†æ˜¯ç”±ç»“构页扩展功能分é…的。总之,在这
+之åŽï¼Œæ²¡æœ‰ä»»ä½•é¡µé¢å¤„于未追踪状æ€ã€‚
+
+使用方法
+========
+
+1) 构建用户空间的帮助::
+
+ cd tools/vm
+ make page_owner_sort
+
+2) å¯ç”¨page owner: 添加 "page_owner=on" 到 boot cmdline.
+
+3) åšä½ æƒ³è°ƒè¯•çš„工作。
+
+4) 分æžæ¥è‡ªé¡µé¢æ‰€æœ‰è€…çš„ä¿¡æ¯::
+
+ cat /sys/kernel/debug/page_owner > page_owner_full.txt
+ ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+
+ ``page_owner_full.txt`` 的一般输出情况如下(输出信æ¯æ— ç¿»è¯‘价值)::
+
+ Page allocated via order XXX, ...
+ PFN XXX ...
+ // Detailed stack
+
+ Page allocated via order XXX, ...
+ PFN XXX ...
+ // Detailed stack
+
+ ``page_owner_sort`` 工具忽略了 ``PFN`` 行,将剩余的行放在buf中,使用regexpæ
+ å–页åºå€¼ï¼Œè®¡ç®—buf的次数和页数,最åŽæ ¹æ®å‚数进行排åºã€‚
+
+ 在 ``sorted_page_owner.txt`` 中å¯ä»¥çœ‹åˆ°å…³äºŽè°åˆ†é…了æ¯ä¸ªé¡µé¢çš„结果。一般输出::
+
+ XXX times, XXX pages:
+ Page allocated via order XXX, ...
+ // Detailed stack
+
+ 默认情况下, ``page_owner_sort`` 是根æ®buf的时间æ¥æŽ’åºçš„。如果你想
+ 按buf的页数排åºï¼Œè¯·ä½¿ç”¨-må‚数。详细的å‚数是:
+
+ 基本函数:
+
+ Sort:
+ -a 按内存分é…时间排åº
+ -m 按总内存排åº
+ -p 按pid排åºã€‚
+ -P 按tgid排åºã€‚
+ -r 按内存释放时间排åºã€‚
+ -s 按堆栈跟踪排åºã€‚
+ -t 按时间排åºï¼ˆé»˜è®¤ï¼‰ã€‚
+
+ 其它函数:
+
+ Cull:
+ -c 通过比较堆栈跟踪而ä¸æ˜¯æ€»å—æ¥è¿›è¡Œå‰”除。
+
+ Filter:
+ -f 过滤掉内存已被释放的å—çš„ä¿¡æ¯ã€‚
diff --git a/Documentation/translations/zh_CN/vm/page_table_check.rst b/Documentation/translations/zh_CN/vm/page_table_check.rst
new file mode 100644
index 000000000000..a29fc1b360e6
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/page_table_check.rst
@@ -0,0 +1,56 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Original: Documentation/vm/page_table_check.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+========
+页表检查
+========
+
+概述
+====
+
+页表检查å…许通过确ä¿é˜²æ­¢æŸäº›ç±»åž‹çš„内存æŸåæ¥å¼ºåŒ–内核。
+
+当新的页é¢å¯ä»¥ä»Žç”¨æˆ·ç©ºé—´è®¿é—®æ—¶ï¼Œé¡µè¡¨æ£€æŸ¥é€šè¿‡å°†å®ƒä»¬çš„页表项(PTEs PMD等)添加到页表中æ¥æ‰§è¡Œé¢å¤–
+的验è¯ã€‚
+
+在检测到æŸå的情况下,内核会被崩溃。页表检查有一个å°çš„性能和内存开销。因此,它在默认情况下是ç¦ç”¨
+的,但是在é¢å¤–的加固超过性能æˆæœ¬çš„系统上,å¯ä»¥é€‰æ‹©å¯ç”¨ã€‚å¦å¤–,由于页表检查是åŒæ­¥çš„,它å¯ä»¥å¸®åŠ©è°ƒ
+试åŒæ˜ å°„内存æŸå问题,在错误的映射å‘生时崩溃内核,而ä¸æ˜¯åœ¨å†…å­˜æŸå错误å‘生åŽå†…核崩溃。
+
+åŒé‡æ˜ å°„检测逻辑
+================
+
++-------------------+-------------------+-------------------+------------------+
+| Current Mapping | New mapping | Permissions | Rule |
++===================+===================+===================+==================+
+| Anonymous | Anonymous | Read | Allow |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous | Anonymous | Read / Write | Prohibit |
++-------------------+-------------------+-------------------+------------------+
+| Anonymous | Named | Any | Prohibit |
++-------------------+-------------------+-------------------+------------------+
+| Named | Anonymous | Any | Prohibit |
++-------------------+-------------------+-------------------+------------------+
+| Named | Named | Any | Allow |
++-------------------+-------------------+-------------------+------------------+
+
+å¯ç”¨é¡µè¡¨æ£€æŸ¥
+============
+
+用以下方法构建内核:
+
+- PAGE_TABLE_CHECK=y
+ 注æ„,它åªèƒ½åœ¨ARCH_SUPPORTS_PAGE_TABLE_CHECKå¯ç”¨çš„å¹³å°ä¸Šå¯ç”¨ã€‚
+
+- 使用 "page_table_check=on" 内核å‚æ•°å¯åŠ¨ã€‚
+
+å¯ä»¥é€‰æ‹©ç”¨PAGE_TABLE_CHECK_ENFORCEDæ¥æž„建内核,以便在没有é¢å¤–的内核å‚数的情况下获得页表
+支æŒã€‚
diff --git a/Documentation/translations/zh_CN/vm/remap_file_pages.rst b/Documentation/translations/zh_CN/vm/remap_file_pages.rst
new file mode 100644
index 000000000000..af6b7e28af23
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/remap_file_pages.rst
@@ -0,0 +1,32 @@
+:Original: Documentation/vm/remap_file_pages.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+==============================
+remap_file_pages()系统调用
+==============================
+
+remap_file_pages()系统调用被用æ¥åˆ›å»ºä¸€ä¸ªéžçº¿æ€§æ˜ å°„,也就是说,在这个映射中,
+文件的页é¢è¢«æ— åºæ˜ å°„到内存中。使用remap_file_pages()比é‡å¤è°ƒç”¨mmap(2)的好
+处是,å‰è€…ä¸éœ€è¦å†…核创建é¢å¤–çš„VMA(虚拟内存区)数æ®ç»“构。
+
+支æŒéžçº¿æ€§æ˜ å°„需è¦åœ¨å†…核虚拟内存å­ç³»ç»Ÿä¸­ç¼–写大é‡çš„non-trivial的代ç ï¼ŒåŒ…括热
+路径。å¦å¤–,为了使éžçº¿æ€§æ˜ å°„工作,内核需è¦ä¸€ç§æ–¹æ³•æ¥åŒºåˆ†æ­£å¸¸çš„页表项和带有文件
+å移的项(pte_file)。内核为达到这个目的在PTE中ä¿ç•™äº†æ ‡å¿—。PTE标志是稀缺资
+æºï¼Œç‰¹åˆ«æ˜¯åœ¨æŸäº›CPU架构上。如果能腾出这个标志用于其他用途就更好了。
+
+幸è¿çš„是,在生活中并没有很多remap_file_pages()的用户。åªçŸ¥é“有一个ä¼ä¸šçš„RDBMS
+实现在32ä½ç³»ç»Ÿä¸Šä½¿ç”¨è¿™ä¸ªç³»ç»Ÿè°ƒç”¨æ¥æ˜ å°„比32ä½è™šæ‹Ÿåœ°å€ç©ºé—´çº¿æ€§å°ºå¯¸æ›´å¤§çš„文件。
+由于64ä½ç³»ç»Ÿçš„广泛使用,这ç§ä½¿ç”¨æƒ…况已ç»ä¸é‡è¦äº†ã€‚
+
+syscall被废弃了,现在用一个模拟æ¥ä»£æ›¿å®ƒã€‚仿真会创建新的VMA,而ä¸æ˜¯éžçº¿æ€§æ˜ å°„。
+对于remap_file_pages()的少数用户æ¥è¯´ï¼Œå®ƒçš„工作速度会å˜æ…¢ï¼Œä½†ABI被ä¿ç•™äº†ã€‚
+
+仿真的一个副作用(除了性能之外)是,由于é¢å¤–çš„VMA,用户å¯ä»¥æ›´å®¹æ˜“达到
+vm.max_map_countçš„é™åˆ¶ã€‚关于é™åˆ¶çš„更多细节,请å‚è§DEFAULT_MAX_MAP_COUNT
+的注释。
diff --git a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst b/Documentation/translations/zh_CN/vm/split_page_table_lock.rst
new file mode 100644
index 000000000000..50694d97c426
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/split_page_table_lock.rst
@@ -0,0 +1,96 @@
+:Original: Documentation/vm/split_page_table_lock.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=================================
+分页表é”(split page table lock)
+=================================
+
+最åˆï¼Œmm->page_table_lock spinlockä¿æŠ¤äº†mm_struct的所有页表。但是这ç§æ–¹
+法导致了多线程应用程åºçš„缺页异常å¯æ‰©å±•æ€§å·®ï¼Œå› ä¸ºå¯¹é”的争夺很激烈。为了æ高å¯æ‰©
+展性,我们引入了分页表é”。
+
+有了分页表é”,我们就有了å•ç‹¬çš„æ¯å¼ è¡¨é”æ¥é¡ºåºåŒ–对表的访问。目å‰ï¼Œæˆ‘们对PTEå’Œ
+PMD表使用分页é”。对高层表的访问由mm->page_table_lockä¿æŠ¤ã€‚
+
+有一些辅助工具æ¥é”定/解é”一个表和其他访问器函数:
+
+ - pte_offset_map_lock()
+ 映射pte并获å–PTE表é”,返回所å–é”的指针;
+ - pte_unmap_unlock()
+ 解é”和解映射PTE表;
+ - pte_alloc_map_lock()
+ 如果需要的话,分配PTE表并获取锁,返回已获取的锁的指针,如果分配失败
+ 则返回NULL;
+ - pte_lockptr()
+ 返回指å‘PTE表é”的指针;
+ - pmd_lock()
+ å–å¾—PMD表é”,返回所å–é”的指针。
+ - pmd_lockptr()
+ 返回指å‘PMD表é”的指针;
+
+如果CONFIG_SPLIT_PTLOCK_CPUS(通常为4)å°äºŽæˆ–等于NR_CPUS,则在编译
+æ—¶å¯ç”¨PTE表的分页表é”。如果分页é”被ç¦ç”¨ï¼Œæ‰€æœ‰çš„表都由mm->page_table_lock
+æ¥ä¿æŠ¤ã€‚
+
+如果PTE表启用了分页锁,并且架构支持它,那么PMD表的分页锁就会被启用(见
+下文)。
+
+Hugetlb 和分页表é”
+==================
+
+Hugetlbå¯ä»¥æ”¯æŒå¤šç§é¡µé¢å¤§å°ã€‚我们åªå¯¹PMD级别使用分页é”,但ä¸å¯¹PUD使用。
+
+Hugetlb特定的辅助函数:
+
+ - huge_pte_lock()
+ 对PMD_SIZE页é¢é‡‡å–pmd分割é”,å¦åˆ™mm->page_table_lockï¼›
+ - huge_pte_lockptr()
+ 返回指å‘表é”的指针。
+
+架构对分页表é”的支æŒ
+====================
+
+没有必è¦ç‰¹åˆ«å¯ç”¨PTE分页表é”:所有需è¦çš„东西都由pgtable_pte_page_ctor()
+å’Œpgtable_pte_page_dtor()完æˆï¼Œå®ƒä»¬å¿…须在PTE表分é…/释放时被调用。
+
+ç¡®ä¿æž¶æž„ä¸ä½¿ç”¨slab分é…器æ¥åˆ†é…页表:slab使用page->slab_cacheæ¥åˆ†é…其页
+é¢ã€‚这个区域与page->ptl共享存储。
+
+PMD分页é”åªæœ‰åœ¨ä½ æœ‰ä¸¤ä¸ªä»¥ä¸Šçš„页表级别时æ‰æœ‰æ„义。
+
+å¯ç”¨PMD分页é”需è¦åœ¨PMD表分é…时调用pgtable_pmd_page_ctor(),在释放时调
+用pgtable_pmd_page_dtor()。
+
+分é…通常å‘生在pmd_alloc_one()中,释放å‘生在pmd_free()å’Œpmd_free_tlb()
+中,但è¦ç¡®ä¿è¦†ç›–所有的PMD表分é…/释放路径:å³X86_PAE在pgd_alloc()中预先
+分é…一些PMD。
+
+一切就绪åŽï¼Œä½ å¯ä»¥è®¾ç½®CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。
+
+注æ„:pgtable_pte_page_ctor()å’Œpgtable_pmd_page_ctor()å¯èƒ½å¤±è´¥--å¿…
+须正确处ç†ã€‚
+
+page->ptl
+=========
+
+page->ptl用于访问分割页表é”,其中'page'是包å«è¯¥è¡¨çš„页é¢struct page。它
+与page->private(以åŠunion中的其他几个字段)共享存储。
+
+为了é¿å…增加struct page的大å°å¹¶èŽ·å¾—最佳性能,我们使用了一个技巧:
+
+ - 如果spinlock_t适合于long,我们使用page->ptl作为spinlock,这样我们
+ å°±å¯ä»¥é¿å…间接访问并节çœä¸€ä¸ªç¼“存行。
+ - 如果spinlock_t的大å°å¤§äºŽlong的大å°ï¼Œæˆ‘们使用page->ptl作为spinlock_t
+ 的指针并动æ€åˆ†é…它。这å…许在å¯ç”¨DEBUG_SPINLOCK或DEBUG_LOCK_ALLOCçš„
+ 情况下使用分页é”,但由于间接访问而多花了一个缓存行。
+
+PTE表的spinlock_t分é…在pgtable_pte_page_ctor()中,PMD表的spinlock_t
+分é…在pgtable_pmd_page_ctor()中。
+
+请不要直接访问page->ptl -- 使用适当的辅助函数。
diff --git a/Documentation/translations/zh_CN/vm/z3fold.rst b/Documentation/translations/zh_CN/vm/z3fold.rst
new file mode 100644
index 000000000000..57204aa08caa
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/z3fold.rst
@@ -0,0 +1,31 @@
+:Original: Documentation/vm/z3fold.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+======
+z3fold
+======
+
+z3fold是一个专门用于存储压缩页的分é…器。它被设计为æ¯ä¸ªç‰©ç†é¡µæœ€å¤šå¯ä»¥å­˜å‚¨ä¸‰ä¸ªåŽ‹ç¼©é¡µã€‚
+它是zbudçš„è¡ç”Ÿç‰©ï¼Œå…许更高的压缩率,ä¿æŒå…¶å‰è¾ˆçš„简å•æ€§å’Œç¡®å®šæ€§ã€‚
+
+z3foldå’Œzbud的主è¦åŒºåˆ«æ˜¯:
+
+* 与zbudä¸åŒçš„是,z3foldå…许最大的PAGE_SIZE分é…。
+* z3fold在其页é¢ä¸­æœ€å¤šå¯ä»¥å®¹çº³3个压缩页é¢
+* z3fold本身没有输出任何API,因此打算通过zpoolçš„APIæ¥ä½¿ç”¨
+
+为了ä¿æŒç¡®å®šæ€§å’Œç®€å•æ€§ï¼Œz3fold,就åƒzbud一样,总是在æ¯é¡µå­˜å‚¨ä¸€ä¸ªæ•´æ•°çš„压缩页,但是
+它最多å¯ä»¥å­˜å‚¨3页,ä¸åƒzbud最多å¯ä»¥å­˜å‚¨2页。因此压缩率达到2.7å€å·¦å³ï¼Œè€Œzbud的压缩
+率是1.7å€å·¦å³ã€‚
+
+ä¸åƒzbud(但也åƒzsmalloc),z3fold_alloc()那样ä¸è¿”回一个å¯é‡å¤å¼•ç”¨çš„指针。相å,它
+返回一个无符å·é•¿å¥æŸ„,它编ç äº†è¢«åˆ†é…对象的实际ä½ç½®ã€‚
+
+ä¿æŒæœ‰æ•ˆçš„压缩率接近于zsmalloc,z3foldä¸ä¾èµ–于MMUçš„å¯ç”¨ï¼Œå¹¶æ供更å¯é¢„测的回收行
+为,这使得它更适åˆäºŽå°åž‹å’Œå应迅速的系统。
diff --git a/Documentation/translations/zh_CN/vm/zsmalloc.rst b/Documentation/translations/zh_CN/vm/zsmalloc.rst
new file mode 100644
index 000000000000..29e9c70a8eb6
--- /dev/null
+++ b/Documentation/translations/zh_CN/vm/zsmalloc.rst
@@ -0,0 +1,78 @@
+:Original: Documentation/vm/zsmalloc.rst
+
+:翻译:
+
+ å¸å»¶è…¾ Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+========
+zsmalloc
+========
+
+这个分é…器是为与zram一起使用而设计的。因此,该分é…器应该在低内存æ¡ä»¶ä¸‹å·¥ä½œè‰¯å¥½ã€‚特别是,
+它从未å°è¯•è¿‡higher order页é¢çš„分é…,这在内存压力下很å¯èƒ½ä¼šå¤±è´¥ã€‚å¦ä¸€æ–¹é¢ï¼Œå¦‚果我们åª
+是使用å•ï¼ˆ0-order)页,它将é­å—éžå¸¸é«˜çš„碎片化 - 任何大å°ä¸ºPAGE_SIZE/2或更大的对象将
+å æ®æ•´ä¸ªé¡µé¢ã€‚这是其å‰èº«ï¼ˆxvmalloc)的主è¦é—®é¢˜ä¹‹ä¸€ã€‚
+
+为了克æœè¿™äº›é—®é¢˜ï¼Œzsmalloc分é…了一堆0-order页é¢ï¼Œå¹¶ä½¿ç”¨å„ç§"struct page"字段将它
+们链接起æ¥ã€‚这些链接的页é¢ä½œä¸ºä¸€ä¸ªå•ä¸€çš„higher order页é¢ï¼Œå³ä¸€ä¸ªå¯¹è±¡å¯ä»¥è·¨è¶Š0-order
+页é¢çš„边界。代ç å°†è¿™äº›é“¾æŽ¥çš„页é¢ä½œä¸ºä¸€ä¸ªå®žä½“,称为zspage。
+
+为了简å•èµ·è§ï¼Œzsmallocåªèƒ½åˆ†é…大å°ä¸è¶…过PAGE_SIZE的对象,因为这满足了所有当å‰ç”¨æˆ·çš„
+è¦æ±‚(在最å的情况下,页é¢æ˜¯ä¸å¯åŽ‹ç¼©çš„,因此以"原样"å³æœªåŽ‹ç¼©çš„å½¢å¼å­˜å‚¨ï¼‰ã€‚对于大于这
+个大å°çš„分é…请求,会返回失败(è§zs_malloc)。
+
+此外,zs_malloc()并ä¸è¿”回一个å¯é‡å¤å¼•ç”¨çš„指针。相å,它返回一个ä¸é€æ˜Žçš„å¥æŸ„(无符å·
+长),它编ç äº†è¢«åˆ†é…对象的实际ä½ç½®ã€‚è¿™ç§é—´æŽ¥æ€§çš„原因是zsmalloc并ä¸ä¿æŒzspages的永久
+映射,因为这在32ä½ç³»ç»Ÿä¸Šä¼šå¯¼è‡´é—®é¢˜ï¼Œå› ä¸ºå†…核空间映射的VA区域éžå¸¸å°ã€‚因此,在使用分é…
+的内存之å‰ï¼Œå¯¹è±¡å¿…须使用zs_map_object()进行映射以获得一个å¯ç”¨çš„指针,éšåŽä½¿ç”¨
+zs_unmap_object()解除映射。
+
+stat
+====
+
+通过CONFIG_ZSMALLOC_STAT,我们å¯ä»¥é€šè¿‡ ``/sys/kernel/debug/zsmalloc/<user name>``
+看到zsmalloc内部信æ¯ã€‚下é¢æ˜¯ä¸€ä¸ªç»Ÿè®¡è¾“出的例å­ã€‚::
+
+ # cat /sys/kernel/debug/zsmalloc/zram0/classes
+
+ class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
+ ...
+ ...
+ 9 176 0 1 186 129 8 4
+ 10 192 1 0 2880 2872 135 3
+ 11 208 0 1 819 795 42 2
+ 12 224 0 1 219 159 12 4
+ ...
+ ...
+
+
+class
+ 索引
+size
+ zspage存储对象大å°
+almost_empty
+ ZS_ALMOST_EMPTY zspageçš„æ•°é‡ï¼ˆè§ä¸‹æ–‡ï¼‰ã€‚
+almost_full
+ ZS_ALMOST_FULL zspage的数量(见下文)
+obj_allocated
+ 已分é…对象的数é‡
+obj_used
+ 分é…给用户的对象的数é‡
+pages_used
+ 为该类分é…的页数
+pages_per_zspage
+ 组æˆä¸€ä¸ªzspageçš„0-order页é¢çš„æ•°é‡
+
+当n <= N / f时,我们将一个zspage分é…ç»™ZS_ALMOST_EMPTYfullness组,其中
+
+* n = 已分é…对象的数é‡
+* N = zspageå¯ä»¥å­˜å‚¨çš„对象总数
+* f = fullness_threshold_frac(å³ï¼Œç›®å‰æ˜¯4个)
+
+åŒæ ·åœ°ï¼Œæˆ‘们将zspage分é…ç»™:
+
+* ZS_ALMOST_FULL when n > N / f
+* ZS_EMPTY when n == 0
+* ZS_FULL when n == N
diff --git a/Documentation/userspace-api/ioctl/cdrom.rst b/Documentation/userspace-api/ioctl/cdrom.rst
index 682948fc88a3..2ad91dbebd7c 100644
--- a/Documentation/userspace-api/ioctl/cdrom.rst
+++ b/Documentation/userspace-api/ioctl/cdrom.rst
@@ -718,6 +718,9 @@ CDROMPLAYBLK
CDROMGETSPINDOWN
+ Obsolete, was ide-cd only
+
+
usage::
char spindown;
@@ -736,6 +739,9 @@ CDROMGETSPINDOWN
CDROMSETSPINDOWN
+ Obsolete, was ide-cd only
+
+
usage::
char spindown
diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
index f35552ff19ba..b8ea59493964 100644
--- a/Documentation/userspace-api/landlock.rst
+++ b/Documentation/userspace-api/landlock.rst
@@ -1,14 +1,14 @@
.. SPDX-License-Identifier: GPL-2.0
.. Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
.. Copyright © 2019-2020 ANSSI
-.. Copyright © 2021 Microsoft Corporation
+.. Copyright © 2021-2022 Microsoft Corporation
=====================================
Landlock: unprivileged access control
=====================================
:Author: Mickaël Salaün
-:Date: March 2021
+:Date: May 2022
The goal of Landlock is to enable to restrict ambient rights (e.g. global
filesystem access) for a set of processes. Because Landlock is a stackable
@@ -18,6 +18,13 @@ is expected to help mitigate the security impact of bugs or
unexpected/malicious behaviors in user space applications. Landlock empowers
any process, including unprivileged ones, to securely restrict themselves.
+We can quickly make sure that Landlock is enabled in the running system by
+looking for "landlock: Up and running" in kernel logs (as root): ``dmesg | grep
+landlock || journalctl -kg landlock`` . Developers can also easily check for
+Landlock support with a :ref:`related system call <landlock_abi_versions>`. If
+Landlock is not currently supported, we need to :ref:`configure the kernel
+appropriately <kernel_support>`.
+
Landlock rules
==============
@@ -29,14 +36,15 @@ the thread enforcing it, and its future children.
Defining and enforcing a security policy
----------------------------------------
-We first need to create the ruleset that will contain our rules. For this
+We first need to define the ruleset that will contain our rules. For this
example, the ruleset will contain rules that only allow read actions, but write
actions will be denied. The ruleset then needs to handle both of these kind of
-actions.
+actions. This is required for backward and forward compatibility (i.e. the
+kernel and user space may not know each other's supported restrictions), hence
+the need to be explicit about the denied-by-default access rights.
.. code-block:: c
- int ruleset_fd;
struct landlock_ruleset_attr ruleset_attr = {
.handled_access_fs =
LANDLOCK_ACCESS_FS_EXECUTE |
@@ -51,9 +59,34 @@ actions.
LANDLOCK_ACCESS_FS_MAKE_SOCK |
LANDLOCK_ACCESS_FS_MAKE_FIFO |
LANDLOCK_ACCESS_FS_MAKE_BLOCK |
- LANDLOCK_ACCESS_FS_MAKE_SYM,
+ LANDLOCK_ACCESS_FS_MAKE_SYM |
+ LANDLOCK_ACCESS_FS_REFER,
};
+Because we may not know on which kernel version an application will be
+executed, it is safer to follow a best-effort security approach. Indeed, we
+should try to protect users as much as possible whatever the kernel they are
+using. To avoid binary enforcement (i.e. either all security features or
+none), we can leverage a dedicated Landlock command to get the current version
+of the Landlock ABI and adapt the handled accesses. Let's check if we should
+remove the `LANDLOCK_ACCESS_FS_REFER` access right which is only supported
+starting with the second version of the ABI.
+
+.. code-block:: c
+
+ int abi;
+
+ abi = landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION);
+ if (abi < 2) {
+ ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_REFER;
+ }
+
+This enables to create an inclusive ruleset that will contain our rules.
+
+.. code-block:: c
+
+ int ruleset_fd;
+
ruleset_fd = landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
if (ruleset_fd < 0) {
perror("Failed to create a ruleset");
@@ -92,6 +125,11 @@ descriptor.
return 1;
}
+It may also be required to create rules following the same logic as explained
+for the ruleset creation, by filtering access rights according to the Landlock
+ABI version. In this example, this is not required because
+`LANDLOCK_ACCESS_FS_REFER` is not allowed by any rule.
+
We now have a ruleset with one rule allowing read access to ``/usr`` while
denying all other handled accesses for the filesystem. The next step is to
restrict the current thread from gaining more privileges (e.g. thanks to a SUID
@@ -125,6 +163,27 @@ ruleset.
Full working code can be found in `samples/landlock/sandboxer.c`_.
+Good practices
+--------------
+
+It is recommended to set access rights to file hierarchy leaves as much as
+possible. For instance, it is better to be able to have ``~/doc/`` as a
+read-only hierarchy and ``~/tmp/`` as a read-write hierarchy, compared to
+``~/`` as a read-only hierarchy and ``~/tmp/`` as a read-write hierarchy.
+Following this good practice leads to self-sufficient hierarchies that don't
+depend on their location (i.e. parent directories). This is particularly
+relevant when we want to allow linking or renaming. Indeed, having consistent
+access rights per directory enables to change the location of such directory
+without relying on the destination directory access rights (except those that
+are required for this operation, see `LANDLOCK_ACCESS_FS_REFER` documentation).
+Having self-sufficient hierarchies also helps to tighten the required access
+rights to the minimal set of data. This also helps avoid sinkhole directories,
+i.e. directories where data can be linked to but not linked from. However,
+this depends on data organization, which might not be controlled by developers.
+In this case, granting read-write access to ``~/tmp/``, instead of write-only
+access, would potentially allow to move ``~/tmp/`` to a non-readable directory
+and still keep the ability to list the content of ``~/tmp/``.
+
Layers of file path access rights
---------------------------------
@@ -192,6 +251,58 @@ To be allowed to use :manpage:`ptrace(2)` and related syscalls on a target
process, a sandboxed process should have a subset of the target process rules,
which means the tracee must be in a sub-domain of the tracer.
+Compatibility
+=============
+
+Backward and forward compatibility
+----------------------------------
+
+Landlock is designed to be compatible with past and future versions of the
+kernel. This is achieved thanks to the system call attributes and the
+associated bitflags, particularly the ruleset's `handled_access_fs`. Making
+handled access right explicit enables the kernel and user space to have a clear
+contract with each other. This is required to make sure sandboxing will not
+get stricter with a system update, which could break applications.
+
+Developers can subscribe to the `Landlock mailing list
+<https://subspace.kernel.org/lists.linux.dev.html>`_ to knowingly update and
+test their applications with the latest available features. In the interest of
+users, and because they may use different kernel versions, it is strongly
+encouraged to follow a best-effort security approach by checking the Landlock
+ABI version at runtime and only enforcing the supported features.
+
+.. _landlock_abi_versions:
+
+Landlock ABI versions
+---------------------
+
+The Landlock ABI version can be read with the sys_landlock_create_ruleset()
+system call:
+
+.. code-block:: c
+
+ int abi;
+
+ abi = landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION);
+ if (abi < 0) {
+ switch (errno) {
+ case ENOSYS:
+ printf("Landlock is not supported by the current kernel.\n");
+ break;
+ case EOPNOTSUPP:
+ printf("Landlock is currently disabled.\n");
+ break;
+ }
+ return 0;
+ }
+ if (abi >= 2) {
+ printf("Landlock supports LANDLOCK_ACCESS_FS_REFER.\n");
+ }
+
+The following kernel interfaces are implicitly supported by the first ABI
+version. Features only supported from a specific version are explicitly marked
+as such.
+
Kernel interface
================
@@ -228,21 +339,6 @@ Enforcing a ruleset
Current limitations
===================
-File renaming and linking
--------------------------
-
-Because Landlock targets unprivileged access controls, it is needed to properly
-handle composition of rules. Such property also implies rules nesting.
-Properly handling multiple layers of ruleset, each one of them able to restrict
-access to files, also implies to inherit the ruleset restrictions from a parent
-to its hierarchy. Because files are identified and restricted by their
-hierarchy, moving or linking a file from one directory to another implies to
-propagate the hierarchy constraints. To protect against privilege escalations
-through renaming or linking, and for the sake of simplicity, Landlock currently
-limits linking and renaming to the same directory. Future Landlock evolutions
-will enable more flexibility for renaming and linking, with dedicated ruleset
-flags.
-
Filesystem topology modification
--------------------------------
@@ -267,8 +363,8 @@ restrict such paths with dedicated ruleset flags.
Ruleset layers
--------------
-There is a limit of 64 layers of stacked rulesets. This can be an issue for a
-task willing to enforce a new ruleset in complement to its 64 inherited
+There is a limit of 16 layers of stacked rulesets. This can be an issue for a
+task willing to enforce a new ruleset in complement to its 16 inherited
rulesets. Once this limit is reached, sys_landlock_restrict_self() returns
E2BIG. It is then strongly suggested to carefully build rulesets once in the
life of a thread, especially for applications able to launch other applications
@@ -281,6 +377,44 @@ Memory usage
Kernel memory allocated to create rulesets is accounted and can be restricted
by the Documentation/admin-guide/cgroup-v1/memory.rst.
+Previous limitations
+====================
+
+File renaming and linking (ABI 1)
+---------------------------------
+
+Because Landlock targets unprivileged access controls, it needs to properly
+handle composition of rules. Such property also implies rules nesting.
+Properly handling multiple layers of rulesets, each one of them able to
+restrict access to files, also implies inheritance of the ruleset restrictions
+from a parent to its hierarchy. Because files are identified and restricted by
+their hierarchy, moving or linking a file from one directory to another implies
+propagation of the hierarchy constraints, or restriction of these actions
+according to the potentially lost constraints. To protect against privilege
+escalations through renaming or linking, and for the sake of simplicity,
+Landlock previously limited linking and renaming to the same directory.
+Starting with the Landlock ABI version 2, it is now possible to securely
+control renaming and linking thanks to the new `LANDLOCK_ACCESS_FS_REFER`
+access right.
+
+.. _kernel_support:
+
+Kernel support
+==============
+
+Landlock was first introduced in Linux 5.13 but it must be configured at build
+time with `CONFIG_SECURITY_LANDLOCK=y`. Landlock must also be enabled at boot
+time as the other security modules. The list of security modules enabled by
+default is set with `CONFIG_LSM`. The kernel configuration should then
+contain `CONFIG_LSM=landlock,[...]` with `[...]` as the list of other
+potentially useful security modules for the running system (see the
+`CONFIG_LSM` help).
+
+If the running kernel doesn't have `landlock` in `CONFIG_LSM`, then we can
+still enable it by adding ``lsm=landlock,[...]`` to
+Documentation/admin-guide/kernel-parameters.rst thanks to the bootloader
+configuration.
+
Questions and answers
=====================
diff --git a/Documentation/userspace-api/media/drivers/uvcvideo.rst b/Documentation/userspace-api/media/drivers/uvcvideo.rst
index e5fd8fad333c..a290f9fadae9 100644
--- a/Documentation/userspace-api/media/drivers/uvcvideo.rst
+++ b/Documentation/userspace-api/media/drivers/uvcvideo.rst
@@ -7,7 +7,7 @@ This file documents some driver-specific aspects of the UVC driver, such as
driver-specific ioctls and implementation notes.
Questions and remarks can be sent to the Linux UVC development mailing list at
-linux-uvc-devel@lists.berlios.de.
+linux-media@vger.kernel.org.
Extension Unit (XU) support
diff --git a/Documentation/userspace-api/media/mediactl/media-controller-model.rst b/Documentation/userspace-api/media/mediactl/media-controller-model.rst
index 222cb99debb5..78bfdfb2a322 100644
--- a/Documentation/userspace-api/media/mediactl/media-controller-model.rst
+++ b/Documentation/userspace-api/media/mediactl/media-controller-model.rst
@@ -33,3 +33,9 @@ are:
- An **interface link** is a point-to-point bidirectional control
connection between a Linux Kernel interface and an entity.
+
+- An **ancillary link** is a point-to-point connection denoting that two
+ entities form a single logical unit. For example this could represent the
+ fact that a particular camera sensor and lens controller form a single
+ physical module, meaning this lens controller drives the lens for this
+ camera sensor. \ No newline at end of file
diff --git a/Documentation/userspace-api/media/mediactl/media-types.rst b/Documentation/userspace-api/media/mediactl/media-types.rst
index 0a26397bd01d..0ffeece1e0c8 100644
--- a/Documentation/userspace-api/media/mediactl/media-types.rst
+++ b/Documentation/userspace-api/media/mediactl/media-types.rst
@@ -412,14 +412,21 @@ must be set for every pad.
is set by drivers and is read-only for applications.
* - ``MEDIA_LNK_FL_LINK_TYPE``
- - This is a bitmask that defines the type of the link. Currently,
- two types of links are supported:
+ - This is a bitmask that defines the type of the link. The following
+ link types are currently supported:
.. _MEDIA-LNK-FL-DATA-LINK:
- ``MEDIA_LNK_FL_DATA_LINK`` if the link is between two pads
+ ``MEDIA_LNK_FL_DATA_LINK`` for links that represent a data connection
+ between two pads.
.. _MEDIA-LNK-FL-INTERFACE-LINK:
- ``MEDIA_LNK_FL_INTERFACE_LINK`` if the link is between an
- interface and an entity
+ ``MEDIA_LNK_FL_INTERFACE_LINK`` for links that associate an entity to its
+ interface.
+
+ .. _MEDIA-LNK-FL-ANCILLARY-LINK:
+
+ ``MEDIA_LNK_FL_ANCILLARY_LINK`` for links that represent a physical
+ relationship between two entities. The link may or may not be
+ immutable, so applications must not assume either case.
diff --git a/Documentation/userspace-api/media/v4l/dev-decoder.rst b/Documentation/userspace-api/media/v4l/dev-decoder.rst
index 3cf2b496f2d0..675bc2c3c6b8 100644
--- a/Documentation/userspace-api/media/v4l/dev-decoder.rst
+++ b/Documentation/userspace-api/media/v4l/dev-decoder.rst
@@ -72,6 +72,12 @@ coded resolution
coded width
width for given coded resolution.
+coding tree unit
+ processing unit of the HEVC codec (corresponds to macroblock units in
+ H.264, VP8, VP9),
+ can use block structures of up to 64×64 pixels.
+ Good at sub-partitioning the picture into variable sized structures.
+
decode order
the order in which frames are decoded; may differ from display order if the
coded format includes a feature of frame reordering; for decoders,
@@ -104,7 +110,8 @@ keyframe
macroblock
a processing unit in image and video compression formats based on linear
block transforms (e.g. H.264, VP8, VP9); codec-specific, but for most of
- popular codecs the size is 16x16 samples (pixels).
+ popular codecs the size is 16x16 samples (pixels). The HEVC codec uses a
+ slightly more flexible processing unit called coding tree unit (CTU).
OUTPUT
the source buffer queue; for decoders, the queue of buffers containing
diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
index 6541e4c32b26..bee73065e993 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst
@@ -649,10 +649,16 @@ Stateless Codec Control ID
:c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64.
* - __u32
- ``pic_num``
- -
+ - For short term references, this must match the derived value PicNum
+ (8-28) and for long term references it must match the derived value
+ LongTermPicNum (8-29). When decoding frames (as opposed to fields)
+ pic_num is the same as FrameNumWrap.
* - __u16
- ``frame_num``
- -
+ - For short term references, this must match the frame_num value from
+ the slice header syntax (the driver will wrap the value if needed). For
+ long term references, this must be set to the value of
+ long_term_frame_idx described in the dec_ref_pic_marking() syntax.
* - __u8
- ``fields``
- Specifies how the DPB entry is referenced. See :ref:`Reference Fields <h264_ref_fields>`
diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
index 4cd7c541fc30..6183f43f4d73 100644
--- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
+++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec.rst
@@ -1180,6 +1180,28 @@ enum v4l2_mpeg_video_h264_entropy_mode -
is set to non zero value.
Applicable to H264, H263 and MPEG4 encoder.
+``V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE (enum)``
+
+enum v4l2_mpeg_video_intra_refresh_period_type -
+ Sets the type of intra refresh. The period to refresh
+ the whole frame is specified by V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD.
+ Note that if this control is not present, then it is undefined what
+ refresh type is used and it is up to the driver to decide.
+ Applicable to H264 and HEVC encoders. Possible values are:
+
+.. tabularcolumns:: |p{9.6cm}|p{7.9cm}|
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+
+ * - ``V4L2_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE_RANDOM``
+ - The whole frame is completely refreshed randomly
+ after the specified period.
+ * - ``V4L2_MPEG_VIDEO_INTRA_REFRESH_PERIOD_TYPE_CYCLIC``
+ - The whole frame MBs are completely refreshed in cyclic order
+ after the specified period.
+
``V4L2_CID_MPEG_VIDEO_INTRA_REFRESH_PERIOD (integer)``
Intra macroblock refresh period. This sets the period to refresh
the whole frame. In other words, this defines the number of frames
diff --git a/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst b/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst
index cabfa34b7db5..0ff68cd8cf62 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-reserved.rst
@@ -239,6 +239,25 @@ please make a proposal on the linux-media mailing list.
It remains an opaque intermediate format and the MDP hardware must be
used to convert ``V4L2_PIX_FMT_MT21C`` to ``V4L2_PIX_FMT_NV12M``,
``V4L2_PIX_FMT_YUV420M`` or ``V4L2_PIX_FMT_YVU420``.
+ * .. _V4L2-PIX-FMT-QC08C:
+
+ - ``V4L2_PIX_FMT_QC08C``
+ - 'QC08C'
+ - Compressed Macro-tile 8-Bit YUV420 format used by Qualcomm platforms.
+ It is an opaque intermediate format. The used compression is lossless
+ and it is used by various multimedia hardware blocks like GPU, display
+ controllers, ISP and video accelerators.
+ It contains four planes for progressive video and eight planes for
+ interlaced video.
+ * .. _V4L2-PIX-FMT-QC10C:
+
+ - ``V4L2_PIX_FMT_QC10C``
+ - 'QC10C'
+ - Compressed Macro-tile 10-Bit YUV420 format used by Qualcomm platforms.
+ It is an opaque intermediate format. The used compression is lossless
+ and it is used by various multimedia hardware blocks like GPU, display
+ controllers, ISP and video accelerators.
+ It contains four planes for progressive video.
.. raw:: latex
\normalsize
diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
index 8ebd58c3588f..6a387f9df3ba 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
@@ -48,6 +48,17 @@ are often referred to as greyscale formats.
- ...
- ...
+ * .. _V4L2-PIX-FMT-IPU3-Y10:
+
+ - ``V4L2_PIX_FMT_IPU3_Y10``
+ - 'ip3y'
+
+ - Y'\ :sub:`0`\ [7:0]
+ - Y'\ :sub:`1`\ [5:0] Y'\ :sub:`0`\ [9:8]
+ - Y'\ :sub:`2`\ [3:0] Y'\ :sub:`1`\ [9:6]
+ - Y'\ :sub:`3`\ [1:0] Y'\ :sub:`2`\ [9:4]
+ - Y'\ :sub:`3`\ [9:2]
+
* .. _V4L2-PIX-FMT-Y10:
- ``V4L2_PIX_FMT_Y10``
@@ -133,4 +144,5 @@ are often referred to as greyscale formats.
For the Y16 and Y16_BE formats, the actual sampling precision may be lower
than 16 bits. For example, 10 bits per pixel uses values in the range 0 to
- 1023.
+ 1023. For the IPU3_Y10 format 25 pixels are packed into 32 bytes, which
+ leaves the 6 most significant bits of the last byte padded with 0.
diff --git a/Documentation/userspace-api/media/v4l/vidioc-streamon.rst b/Documentation/userspace-api/media/v4l/vidioc-streamon.rst
index 0bc86f06947b..1a79313a29fa 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-streamon.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-streamon.rst
@@ -43,8 +43,7 @@ the capture or output process during streaming
Capture hardware is disabled and no input buffers are filled (if there
are any empty buffers in the incoming queue) until ``VIDIOC_STREAMON``
has been called. Output hardware is disabled and no video signal is
-produced until ``VIDIOC_STREAMON`` has been called. The ioctl will
-succeed when at least one output buffer is in the incoming queue.
+produced until ``VIDIOC_STREAMON`` has been called.
Memory-to-memory devices will not start until ``VIDIOC_STREAMON`` has
been called for both the capture and output stream types.
diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index 539e9d4a4860..d1e2b9193f09 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -271,6 +271,16 @@ notifying process it will be replaced. The supervisor can also add an FD, and
respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return
value will be the injected file descriptor number.
+The notifying process can be preempted, resulting in the notification being
+aborted. This can be problematic when trying to take actions on behalf of the
+notifying process that are long-running and typically retryable (mounting a
+filesystem). Alternatively, at filter installation time, the
+``SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV`` flag can be set. This flag makes it
+such that when a user notification is received by the supervisor, the notifying
+process will ignore non-fatal signals until the response is sent. Signals that
+are sent prior to the notification being received by userspace are handled
+normally.
+
It is worth noting that ``struct seccomp_data`` contains the values of register
arguments to the syscall, but does not contain pointers to memory. The task's
memory is accessible to suitably privileged traces via ``ptrace()`` or
diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst
new file mode 100644
index 000000000000..bf593e88cfd9
--- /dev/null
+++ b/Documentation/virt/coco/sev-guest.rst
@@ -0,0 +1,155 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================================================
+The Definitive SEV Guest API Documentation
+===================================================================
+
+1. General description
+======================
+
+The SEV API is a set of ioctls that are used by the guest or hypervisor
+to get or set a certain aspect of the SEV virtual machine. The ioctls belong
+to the following classes:
+
+ - Hypervisor ioctls: These query and set global attributes which affect the
+ whole SEV firmware. These ioctl are used by platform provisioning tools.
+
+ - Guest ioctls: These query and set attributes of the SEV virtual machine.
+
+2. API description
+==================
+
+This section describes ioctls that are used for querying the SEV guest report
+from the SEV firmware. For each ioctl, the following information is provided
+along with a description:
+
+ Technology:
+ which SEV technology provides this ioctl. SEV, SEV-ES, SEV-SNP or all.
+
+ Type:
+ hypervisor or guest. The ioctl can be used inside the guest or the
+ hypervisor.
+
+ Parameters:
+ what parameters are accepted by the ioctl.
+
+ Returns:
+ the return value. General error numbers (-ENOMEM, -EINVAL)
+ are not detailed, but errors with specific meanings are.
+
+The guest ioctl should be issued on a file descriptor of the /dev/sev-guest device.
+The ioctl accepts struct snp_guest_request_ioctl. The input and output structures are
+specified through the req_data and resp_data fields respectively. If the ioctl fails
+to execute due to a firmware error, then the fw_err code will be set; otherwise
+fw_err will be set to 0x00000000000000ff.
+
+The firmware checks that the message sequence counter is one greater than
+the guest's message sequence counter. If the guest driver fails to increment the message
+counter (e.g. counter overflow), then -EIO will be returned.
+
+::
+
+ struct snp_guest_request_ioctl {
+ /* Message version number */
+ __u32 msg_version;
+
+ /* Request and response structure address */
+ __u64 req_data;
+ __u64 resp_data;
+
+ /* firmware error code on failure (see psp-sev.h) */
+ __u64 fw_err;
+ };
+
+2.1 SNP_GET_REPORT
+------------------
+
+:Technology: sev-snp
+:Type: guest ioctl
+:Parameters (in): struct snp_report_req
+:Returns (out): struct snp_report_resp on success, -negative on error
+
+The SNP_GET_REPORT ioctl can be used to query the attestation report from the
+SEV-SNP firmware. The ioctl uses the SNP_GUEST_REQUEST (MSG_REPORT_REQ) command
+provided by the SEV-SNP firmware to query the attestation report.
+
+On success, the snp_report_resp.data will contain the report. The report
+follows the format described in the SEV-SNP specification. See the SEV-SNP
+specification for further details.
+
+2.2 SNP_GET_DERIVED_KEY
+-----------------------
+:Technology: sev-snp
+:Type: guest ioctl
+:Parameters (in): struct snp_derived_key_req
+:Returns (out): struct snp_derived_key_resp on success, -negative on error
+
+The SNP_GET_DERIVED_KEY ioctl can be used to get a key derived from a root key.
+The derived key can be used by the guest for any purpose, such as sealing keys
+or communicating with external entities.
+
+The ioctl uses the SNP_GUEST_REQUEST (MSG_KEY_REQ) command provided by the
+SEV-SNP firmware to derive the key. See SEV-SNP specification for further details
+on the various fields passed in the key derivation request.
+
+On success, the snp_derived_key_resp.data contains the derived key value. See
+the SEV-SNP specification for further details.
+
+
+2.3 SNP_GET_EXT_REPORT
+----------------------
+:Technology: sev-snp
+:Type: guest ioctl
+:Parameters (in/out): struct snp_ext_report_req
+:Returns (out): struct snp_report_resp on success, -negative on error
+
+The SNP_GET_EXT_REPORT ioctl is similar to the SNP_GET_REPORT. The difference is
+related to the additional certificate data that is returned with the report.
+The certificate data returned is being provided by the hypervisor through the
+SNP_SET_EXT_CONFIG.
+
+The ioctl uses the SNP_GUEST_REQUEST (MSG_REPORT_REQ) command provided by the SEV-SNP
+firmware to get the attestation report.
+
+On success, the snp_report_resp.data will contain the attestation report
+and snp_ext_report_req.certs_address will contain the certificate blob. If the
+length of the blob is smaller than expected then snp_ext_report_req.certs_len will
+be updated with the expected value.
+
+See GHCB specification for further detail on how to parse the certificate blob.
+
+3. SEV-SNP CPUID Enforcement
+============================
+
+SEV-SNP guests can access a special page that contains a table of CPUID values
+that have been validated by the PSP as part of the SNP_LAUNCH_UPDATE firmware
+command. It provides the following assurances regarding the validity of CPUID
+values:
+
+ - Its address is obtained via bootloader/firmware (via CC blob), and those
+ binaries will be measured as part of the SEV-SNP attestation report.
+ - Its initial state will be encrypted/pvalidated, so attempts to modify
+ it during run-time will result in garbage being written, or #VC exceptions
+ being generated due to changes in validation state if the hypervisor tries
+ to swap the backing page.
+ - Attempts to bypass PSP checks by the hypervisor by using a normal page, or
+ a non-CPUID encrypted page will change the measurement provided by the
+ SEV-SNP attestation report.
+ - The CPUID page contents are *not* measured, but attempts to modify the
+ expected contents of a CPUID page as part of guest initialization will be
+ gated by the PSP CPUID enforcement policy checks performed on the page
+ during SNP_LAUNCH_UPDATE, and noticeable later if the guest owner
+ implements their own checks of the CPUID values.
+
+It is important to note that this last assurance is only useful if the kernel
+has taken care to make use of the SEV-SNP CPUID throughout all stages of boot.
+Otherwise, guest owner attestation provides no assurance that the kernel wasn't
+fed incorrect values at some point during boot.
+
+
+Reference
+---------
+
+SEV-SNP and GHCB specification: developer.amd.com/sev
+
+The driver is based on SEV-SNP firmware spec 0.9 and GHCB spec version 2.0.
diff --git a/Documentation/virt/index.rst b/Documentation/virt/index.rst
index edea7fea95a8..492f0920b988 100644
--- a/Documentation/virt/index.rst
+++ b/Documentation/virt/index.rst
@@ -13,6 +13,7 @@ Linux Virtualization Support
guest-halt-polling
ne_overview
acrn/index
+ coco/sev-guest
.. only:: html and subproject
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 4a900cdbc62e..c8e2e9cd84dc 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -5713,6 +5713,8 @@ affect the device's behavior. Current defined flags::
#define KVM_RUN_X86_SMM (1 << 0)
/* x86, set if bus lock detected in VM */
#define KVM_RUN_BUS_LOCK (1 << 1)
+ /* arm64, set for KVM_EXIT_DEBUG */
+ #define KVM_DEBUG_ARCH_HSR_HIGH_VALID (1 << 0)
::
diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst
index f8b225fc9190..cbaee9e59241 100644
--- a/Documentation/vm/arch_pgtable_helpers.rst
+++ b/Documentation/vm/arch_pgtable_helpers.rst
@@ -13,7 +13,7 @@ Following tables describe the expected semantics which can also be tested during
boot via CONFIG_DEBUG_VM_PGTABLE option. All future changes in here or the debug
test need to be in sync.
-======================
+
PTE Page Table Helpers
======================
@@ -79,7 +79,7 @@ PTE Page Table Helpers
| ptep_set_access_flags | Converts into a more permissive PTE |
+---------------------------+--------------------------------------------------+
-======================
+
PMD Page Table Helpers
======================
@@ -153,7 +153,7 @@ PMD Page Table Helpers
| pmdp_set_access_flags | Converts into a more permissive PMD |
+---------------------------+--------------------------------------------------+
-======================
+
PUD Page Table Helpers
======================
@@ -209,7 +209,7 @@ PUD Page Table Helpers
| pudp_set_access_flags | Converts into a more permissive PUD |
+---------------------------+--------------------------------------------------+
-==========================
+
HugeTLB Page Table Helpers
==========================
@@ -235,7 +235,7 @@ HugeTLB Page Table Helpers
| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB |
+---------------------------+--------------------------------------------------+
-========================
+
SWAP Page Table Helpers
========================
diff --git a/Documentation/vm/bootmem.rst b/Documentation/vm/bootmem.rst
new file mode 100644
index 000000000000..eb2b31eedfa1
--- /dev/null
+++ b/Documentation/vm/bootmem.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Boot Memory
+===========
diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst
index 44365c4574a3..e72736d53604 100644
--- a/Documentation/vm/index.rst
+++ b/Documentation/vm/index.rst
@@ -2,12 +2,39 @@
Linux Memory Management Documentation
=====================================
-This is a collection of documents about the Linux memory management (mm)
-subsystem internals with different level of details ranging from notes and
-mailing list responses for elaborating descriptions of data structures and
-algorithms. If you are looking for advice on simply allocating memory, see the
-:ref:`memory_allocation`. For controlling and tuning guides, see the
-:doc:`admin guide <../admin-guide/mm/index>`.
+Memory Management Guide
+=======================
+
+This is a guide to understanding the memory management subsystem
+of Linux. If you are looking for advice on simply allocating memory,
+see the :ref:`memory_allocation`. For controlling and tuning guides,
+see the :doc:`admin guide <../admin-guide/mm/index>`.
+
+.. toctree::
+ :maxdepth: 1
+
+ physical_memory
+ page_tables
+ process_addrs
+ bootmem
+ page_allocation
+ vmalloc
+ slab
+ highmem
+ page_reclaim
+ swap
+ page_cache
+ shmfs
+ oom
+
+Legacy Documentation
+====================
+
+This is a collection of older documents about the Linux memory management
+(MM) subsystem internals with different levels of detail ranging from
+notes and mailing list responses to elaborate descriptions of data
+structures and algorithms. It should all be integrated nicely into the
+above structured documentation, or deleted if it has served its purpose.
.. toctree::
:maxdepth: 1
@@ -18,7 +45,6 @@ algorithms. If you are looking for advice on simply allocating memory, see the
damon/index
free_page_reporting
frontswap
- highmem
hmm
hwpoison
hugetlbfs_reserv
diff --git a/Documentation/vm/oom.rst b/Documentation/vm/oom.rst
new file mode 100644
index 000000000000..18e9e40c1ec1
--- /dev/null
+++ b/Documentation/vm/oom.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+Out Of Memory Handling
+======================
diff --git a/Documentation/vm/page_allocation.rst b/Documentation/vm/page_allocation.rst
new file mode 100644
index 000000000000..d9b4495561f1
--- /dev/null
+++ b/Documentation/vm/page_allocation.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Page Allocation
+===============
diff --git a/Documentation/vm/page_cache.rst b/Documentation/vm/page_cache.rst
new file mode 100644
index 000000000000..75eba7c431b2
--- /dev/null
+++ b/Documentation/vm/page_cache.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Page Cache
+==========
diff --git a/Documentation/vm/page_reclaim.rst b/Documentation/vm/page_reclaim.rst
new file mode 100644
index 000000000000..50a30b7f8ac3
--- /dev/null
+++ b/Documentation/vm/page_reclaim.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Page Reclaim
+============
diff --git a/Documentation/vm/page_tables.rst b/Documentation/vm/page_tables.rst
new file mode 100644
index 000000000000..96939571d7bc
--- /dev/null
+++ b/Documentation/vm/page_tables.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Page Tables
+===========
diff --git a/Documentation/vm/physical_memory.rst b/Documentation/vm/physical_memory.rst
new file mode 100644
index 000000000000..2ab7b8c1c863
--- /dev/null
+++ b/Documentation/vm/physical_memory.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Physical Memory
+===============
diff --git a/Documentation/vm/process_addrs.rst b/Documentation/vm/process_addrs.rst
new file mode 100644
index 000000000000..e8618fbc62c9
--- /dev/null
+++ b/Documentation/vm/process_addrs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+Process Addresses
+=================
diff --git a/Documentation/vm/shmfs.rst b/Documentation/vm/shmfs.rst
new file mode 100644
index 000000000000..8b01ebb4c30e
--- /dev/null
+++ b/Documentation/vm/shmfs.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Shared Memory Filesystem
+========================
diff --git a/Documentation/vm/slab.rst b/Documentation/vm/slab.rst
new file mode 100644
index 000000000000..87d5a5bb172f
--- /dev/null
+++ b/Documentation/vm/slab.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Slab Allocation
+===============
diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst
index d3028554b1e9..43063ade737a 100644
--- a/Documentation/vm/slub.rst
+++ b/Documentation/vm/slub.rst
@@ -384,5 +384,69 @@ c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the
40,60`` range will plot only samples collected between 40th and
60th seconds).
+
+DebugFS files for SLUB
+======================
+
+For more information about current state of SLUB caches with the user tracking
+debug option enabled, debugfs files are available, typically under
+/sys/kernel/debug/slab/<cache>/ (created only for caches with enabled user
+tracking). There are 2 types of these files with the following debug
+information:
+
+1. alloc_traces::
+
+ Prints information about unique allocation traces of the currently
+ allocated objects. The output is sorted by frequency of each trace.
+
+ Information in the output:
+ Number of objects, allocating function, minimal/average/maximal jiffies since alloc,
+ pid range of the allocating processes, cpu mask of allocating cpus, and stack trace.
+
+ Example:::
+
+ 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1
+ __slab_alloc+0x6d/0x90
+ kmem_cache_alloc_trace+0x2eb/0x300
+ populate_error_injection_list+0x97/0x110
+ init_error_injection+0x1b/0x71
+ do_one_initcall+0x5f/0x2d0
+ kernel_init_freeable+0x26f/0x2d7
+ kernel_init+0xe/0x118
+ ret_from_fork+0x22/0x30
+
+
+2. free_traces::
+
+ Prints information about unique freeing traces of the currently allocated
+ objects. The freeing traces thus come from the previous life-cycle of the
+ objects and are reported as not available for objects allocated for the first
+ time. The output is sorted by frequency of each trace.
+
+ Information in the output:
+ Number of objects, freeing function, minimal/average/maximal jiffies since free,
+ pid range of the freeing processes, cpu mask of freeing cpus, and stack trace.
+
+ Example:::
+
+ 1980 <not-available> age=4294912290 pid=0 cpus=0
+ 51 acpi_ut_update_ref_count+0x6a6/0x782 age=236886/237027/237772 pid=1 cpus=1
+ kfree+0x2db/0x420
+ acpi_ut_update_ref_count+0x6a6/0x782
+ acpi_ut_update_object_reference+0x1ad/0x234
+ acpi_ut_remove_reference+0x7d/0x84
+ acpi_rs_get_prt_method_data+0x97/0xd6
+ acpi_get_irq_routing_table+0x82/0xc4
+ acpi_pci_irq_find_prt_entry+0x8e/0x2e0
+ acpi_pci_irq_lookup+0x3a/0x1e0
+ acpi_pci_irq_enable+0x77/0x240
+ pcibios_enable_device+0x39/0x40
+ do_pci_enable_device.part.0+0x5d/0xe0
+ pci_enable_device_flags+0xfc/0x120
+ pci_enable_device+0x13/0x20
+ virtio_pci_probe+0x9e/0x170
+ local_pci_probe+0x48/0x80
+ pci_device_probe+0x105/0x1c0
+
Christoph Lameter, May 30, 2007
Sergey Senozhatsky, October 23, 2015
diff --git a/Documentation/vm/swap.rst b/Documentation/vm/swap.rst
new file mode 100644
index 000000000000..78819bd4d745
--- /dev/null
+++ b/Documentation/vm/swap.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+Swap
+====
diff --git a/Documentation/vm/vmalloc.rst b/Documentation/vm/vmalloc.rst
new file mode 100644
index 000000000000..363fe20d6b9f
--- /dev/null
+++ b/Documentation/vm/vmalloc.rst
@@ -0,0 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================
+Virtually Contiguous Memory Allocation
+======================================
diff --git a/Documentation/w1/slaves/w1_therm.rst b/Documentation/w1/slaves/w1_therm.rst
index c3c9ed7a356c..758dadba84f6 100644
--- a/Documentation/w1/slaves/w1_therm.rst
+++ b/Documentation/w1/slaves/w1_therm.rst
@@ -6,7 +6,8 @@ Supported chips:
* Maxim ds18*20 based temperature sensors.
* Maxim ds1825 based temperature sensors.
- * GXCAS GC20MH01 temperature sensor.
+ * GXCAS GX20MH01 temperature sensor.
+ * Maxim MAX31850 thermoelement interface.
Author: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
@@ -15,7 +16,7 @@ Description
-----------
w1_therm provides basic temperature conversion for ds18*20, ds28ea00, GX20MH01
-devices.
+and MAX31850 devices.
Supported family codes:
@@ -137,3 +138,7 @@ bits in Config register; R2 bit in Config register enabling 13 and 14 bit
resolutions. The device is powered up in 14-bit resolution mode. The conversion
times specified in the datasheet are too low and have to be increased. The
device supports driver features ``1`` and ``2``.
+
+MAX31850 device shares family number 0x3B with DS1825. The device is generally
+compatible with DS1825. The higher 4 bits of Config register read all 1,
+indicating 15, but the device is always operating in 14-bit resolution mode.
diff --git a/Documentation/x86/cpuinfo.rst b/Documentation/x86/cpuinfo.rst
index 5d54c39a063f..08246e8ac835 100644
--- a/Documentation/x86/cpuinfo.rst
+++ b/Documentation/x86/cpuinfo.rst
@@ -140,9 +140,8 @@ from #define X86_FEATURE_UMIP (16*32 + 2).
In addition, there exists a variety of custom command-line parameters that
disable specific features. The list of parameters includes, but is not limited
-to, nofsgsbase, nosmap, and nosmep. 5-level paging can also be disabled using
-"no5lvl". SMAP and SMEP are disabled with the aforementioned parameters,
-respectively.
+to, nofsgsbase, nosgx, noxsave, etc. 5-level paging can also be disabled using
+"no5lvl".
e: The feature was known to be non-functional.
----------------------------------------------
diff --git a/Documentation/x86/exception-tables.rst b/Documentation/x86/exception-tables.rst
index de58110c5ffd..efde1fef4fbd 100644
--- a/Documentation/x86/exception-tables.rst
+++ b/Documentation/x86/exception-tables.rst
@@ -32,14 +32,14 @@ Whenever the kernel tries to access an address that is currently not
accessible, the CPU generates a page fault exception and calls the
page fault handler::
- void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+ void exc_page_fault(struct pt_regs *regs, unsigned long error_code)
in arch/x86/mm/fault.c. The parameters on the stack are set up by
the low level assembly glue in arch/x86/entry/entry_32.S. The parameter
regs is a pointer to the saved registers on the stack, error_code
contains a reason code for the exception.
-do_page_fault first obtains the unaccessible address from the CPU
+exc_page_fault() first obtains the inaccessible address from the CPU
control register CR2. If the address is within the virtual address
space of the process, the fault probably occurred, because the page
was not swapped in, write protected or something similar. However,
@@ -57,10 +57,10 @@ Where does fixup point to?
Since we jump to the contents of fixup, fixup obviously points
to executable code. This code is hidden inside the user access macros.
-I have picked the get_user macro defined in arch/x86/include/asm/uaccess.h
+I have picked the get_user() macro defined in arch/x86/include/asm/uaccess.h
as an example. The definition is somewhat hard to follow, so let's peek at
the code generated by the preprocessor and the compiler. I selected
-the get_user call in drivers/char/sysrq.c for a detailed examination.
+the get_user() call in drivers/char/sysrq.c for a detailed examination.
The original code in sysrq.c line 587::
@@ -281,12 +281,15 @@ vma occurs?
> c017e7a5 <do_con_write+e1> movb (%ebx),%dl
#. MMU generates exception
-#. CPU calls do_page_fault
-#. do page fault calls search_exception_table (regs->eip == c017e7a5);
-#. search_exception_table looks up the address c017e7a5 in the
+#. CPU calls exc_page_fault()
+#. exc_page_fault() calls do_user_addr_fault()
+#. do_user_addr_fault() calls kernelmode_fixup_or_oops()
+#. kernelmode_fixup_or_oops() calls fixup_exception() (regs->eip == c017e7a5);
+#. fixup_exception() calls search_exception_tables()
+#. search_exception_tables() looks up the address c017e7a5 in the
exception table (i.e. the contents of the ELF section __ex_table)
and returns the address of the associated fault handle code c0199ff5.
-#. do_page_fault modifies its own return address to point to the fault
+#. fixup_exception() modifies its own return address to point to the fault
handle code and returns.
#. execution continues in the fault handling code.
#. a) EAX becomes -EFAULT (== -14)
@@ -298,9 +301,9 @@ The steps 8a to 8c in a certain way emulate the faulting instruction.
That's it, mostly. If you look at our example, you might ask why
we set EAX to -EFAULT in the exception handler code. Well, the
-get_user macro actually returns a value: 0, if the user access was
+get_user() macro actually returns a value: 0, if the user access was
successful, -EFAULT on failure. Our original code did not test this
-return value, however the inline assembly code in get_user tries to
+return value, however the inline assembly code in get_user() tries to
return -EFAULT. GCC selected EAX to return this value.
NOTE:
diff --git a/Documentation/x86/ifs.rst b/Documentation/x86/ifs.rst
new file mode 100644
index 000000000000..97abb696a680
--- /dev/null
+++ b/Documentation/x86/ifs.rst
@@ -0,0 +1,2 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. kernel-doc:: drivers/platform/x86/intel/ifs/ifs.h
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index 91b2fa456618..c73d133fd37c 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -22,10 +22,11 @@ x86-specific Documentation
mtrr
pat
intel-hfi
- intel-iommu
+ iommu
intel_txt
amd-memory-encryption
amd_hsmp
+ tdx
pti
mds
microcode
@@ -35,6 +36,7 @@ x86-specific Documentation
usb-legacy-support
i386/index
x86_64/index
+ ifs
sva
sgx
features
diff --git a/Documentation/x86/intel-iommu.rst b/Documentation/x86/intel-iommu.rst
deleted file mode 100644
index 099f13d51d5f..000000000000
--- a/Documentation/x86/intel-iommu.rst
+++ /dev/null
@@ -1,115 +0,0 @@
-===================
-Linux IOMMU Support
-===================
-
-The architecture spec can be obtained from the below location.
-
-http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
-
-This guide gives a quick cheat sheet for some basic understanding.
-
-Some Keywords
-
-- DMAR - DMA remapping
-- DRHD - DMA Remapping Hardware Unit Definition
-- RMRR - Reserved memory Region Reporting Structure
-- ZLR - Zero length reads from PCI devices
-- IOVA - IO Virtual address.
-
-Basic stuff
------------
-
-ACPI enumerates and lists the different DMA engines in the platform, and
-device scope relationships between PCI devices and which DMA engine controls
-them.
-
-What is RMRR?
--------------
-
-There are some devices the BIOS controls, for e.g USB devices to perform
-PS2 emulation. The regions of memory used for these devices are marked
-reserved in the e820 map. When we turn on DMA translation, DMA to those
-regions will fail. Hence BIOS uses RMRR to specify these regions along with
-devices that need to access these regions. OS is expected to setup
-unity mappings for these regions for these devices to access these regions.
-
-How is IOVA generated?
-----------------------
-
-Well behaved drivers call pci_map_*() calls before sending command to device
-that needs to perform DMA. Once DMA is completed and mapping is no longer
-required, device performs a pci_unmap_*() calls to unmap the region.
-
-The Intel IOMMU driver allocates a virtual address per domain. Each PCIE
-device has its own domain (hence protection). Devices under p2p bridges
-share the virtual address with all devices under the p2p bridge due to
-transaction id aliasing for p2p bridges.
-
-IOVA generation is pretty generic. We used the same technique as vmalloc()
-but these are not global address spaces, but separate for each domain.
-Different DMA engines may support different number of domains.
-
-We also allocate guard pages with each mapping, so we can attempt to catch
-any overflow that might happen.
-
-
-Graphics Problems?
-------------------
-If you encounter issues with graphics devices, you can try adding
-option intel_iommu=igfx_off to turn off the integrated graphics engine.
-If this fixes anything, please ensure you file a bug reporting the problem.
-
-Some exceptions to IOVA
------------------------
-Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
-The same is true for peer to peer transactions. Hence we reserve the
-address from PCI MMIO ranges so they are not allocated for IOVA addresses.
-
-
-Fault reporting
----------------
-When errors are reported, the DMA engine signals via an interrupt. The fault
-reason and device that caused it with fault reason is printed on console.
-
-See below for sample.
-
-
-Boot Message Sample
--------------------
-
-Something like this gets printed indicating presence of DMAR tables
-in ACPI.
-
-ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
-
-When DMAR is being processed and initialized by ACPI, prints DMAR locations
-and any RMRR's processed::
-
- ACPI DMAR:Host address width 36
- ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
- ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
- ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
- ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
- ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
-
-When DMAR is enabled for use, you will notice..
-
-PCI-DMA: Using DMAR IOMMU
--------------------------
-
-Fault reporting
-^^^^^^^^^^^^^^^
-
-::
-
- DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
- DMAR:[fault reason 05] PTE Write access is not set
- DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
- DMAR:[fault reason 05] PTE Write access is not set
-
-TBD
-----
-
-- For compatibility testing, could use unity map domain for all devices, just
- provide a 1-1 for all useful memory under a single domain for all devices.
-- API for paravirt ops for abstracting functionality for VMM folks.
diff --git a/Documentation/x86/iommu.rst b/Documentation/x86/iommu.rst
new file mode 100644
index 000000000000..42c7a6faa39a
--- /dev/null
+++ b/Documentation/x86/iommu.rst
@@ -0,0 +1,151 @@
+=================
+x86 IOMMU Support
+=================
+
+The architecture specs can be obtained from the below locations.
+
+- Intel: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
+- AMD: https://www.amd.com/system/files/TechDocs/48882_IOMMU.pdf
+
+This guide gives a quick cheat sheet for some basic understanding.
+
+Basic stuff
+-----------
+
+ACPI enumerates and lists the different IOMMUs on the platform, and
+device scope relationships between devices and which IOMMU controls
+them.
+
+Some ACPI Keywords:
+
+- DMAR - Intel DMA Remapping table
+- DRHD - Intel DMA Remapping Hardware Unit Definition
+- RMRR - Intel Reserved Memory Region Reporting Structure
+- IVRS - AMD I/O Virtualization Reporting Structure
+- IVDB - AMD I/O Virtualization Definition Block
+- IVHD - AMD I/O Virtualization Hardware Definition
+
+What is Intel RMRR?
+^^^^^^^^^^^^^^^^^^^
+
+There are some devices the BIOS controls, e.g. USB devices to perform
+PS2 emulation. The regions of memory used for these devices are marked
+reserved in the e820 map. When we turn on DMA translation, DMA to those
+regions will fail. Hence BIOS uses RMRR to specify these regions along with
+devices that need to access these regions. The OS is expected to set up
+unity mappings for these regions for these devices to access these regions.
+
+What is AMD IVRS?
+^^^^^^^^^^^^^^^^^
+
+The architecture defines an ACPI-compatible data structure called an I/O
+Virtualization Reporting Structure (IVRS) that is used to convey information
+related to I/O virtualization to system software. The IVRS describes the
+configuration and capabilities of the IOMMUs contained in the platform as
+well as information about the devices that each IOMMU virtualizes.
+
+The IVRS provides information about the following:
+
+- IOMMUs present in the platform including their capabilities and proper configuration
+- System I/O topology relevant to each IOMMU
+- Peripheral devices that cannot be otherwise enumerated
+- Memory regions used by SMI/SMM, platform firmware, and platform hardware. These are generally exclusion ranges to be configured by system software.
+
+How is an I/O Virtual Address (IOVA) generated?
+-----------------------------------------------
+
+Well-behaved drivers call dma_map_*() before sending a command to a device
+that needs to perform DMA. Once DMA is completed and the mapping is no longer
+required, the driver calls dma_unmap_*() to unmap the region.
+
+Intel Specific Notes
+--------------------
+
+Graphics Problems?
+^^^^^^^^^^^^^^^^^^
+
+If you encounter issues with graphics devices, you can try adding
+option intel_iommu=igfx_off to turn off the integrated graphics engine.
+If this fixes anything, please ensure you file a bug reporting the problem.
+
+Some exceptions to IOVA
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff).
+The same is true for peer to peer transactions. Hence we reserve the
+address from PCI MMIO ranges so they are not allocated for IOVA addresses.
+
+AMD Specific Notes
+------------------
+
+Graphics Problems?
+^^^^^^^^^^^^^^^^^^
+
+If you encounter issues with integrated graphics devices, you can try adding
+option iommu=pt to the kernel command line to use a 1:1 mapping for the IOMMU.
+If this fixes anything, please ensure you file a bug reporting the problem.
+
+Fault reporting
+---------------
+When errors are reported, the IOMMU signals via an interrupt. The fault
+reason and the device that caused it are printed on the console.
+
+
+Kernel Log Samples
+------------------
+
+Intel Boot Messages
+^^^^^^^^^^^^^^^^^^^
+
+Something like this gets printed indicating presence of DMAR tables
+in ACPI:
+
+::
+
+ ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
+
+When DMAR is being processed and initialized by ACPI, the DMAR locations
+and any RMRRs processed are printed:
+
+::
+
+ ACPI DMAR:Host address width 36
+ ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
+ ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
+ ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
+ ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
+ ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
+
+When DMAR is enabled for use, you will notice:
+
+::
+
+ PCI-DMA: Using DMAR IOMMU
+
+Intel Fault reporting
+^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+ DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+ DMAR:[fault reason 05] PTE Write access is not set
+ DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
+ DMAR:[fault reason 05] PTE Write access is not set
+
+AMD Boot Messages
+^^^^^^^^^^^^^^^^^
+
+Something like this gets printed indicating presence of the IOMMU:
+
+::
+
+ iommu: Default domain type: Translated
+ iommu: DMA domain TLB invalidation policy: lazy mode
+
+AMD Fault reporting
+^^^^^^^^^^^^^^^^^^^
+
+::
+
+ AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0007 address=0xffffc02000 flags=0x0000]
+ AMD-Vi: Event logged [IO_PAGE_FAULT device=07:00.0 domain=0x0007 address=0xffffc02000 flags=0x0000]
diff --git a/Documentation/x86/tdx.rst b/Documentation/x86/tdx.rst
new file mode 100644
index 000000000000..b8fa4329e1a5
--- /dev/null
+++ b/Documentation/x86/tdx.rst
@@ -0,0 +1,218 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+Intel Trust Domain Extensions (TDX)
+=====================================
+
+Intel's Trust Domain Extensions (TDX) protect confidential guest VMs from
+the host and physical attacks by isolating the guest register state and by
+encrypting the guest memory. In TDX, a special module running in a special
+mode sits between the host and the guest and manages the guest/host
+separation.
+
+Since the host cannot directly access guest registers or memory, much
+normal functionality of a hypervisor must be moved into the guest. This is
+implemented using a Virtualization Exception (#VE) that is handled by the
+guest kernel. A #VE is always handled inside the guest kernel, but handling
+some of them requires the hypervisor to be consulted.
+
+TDX includes new hypercall-like mechanisms for communicating from the
+guest to the hypervisor or the TDX module.
+
+New TDX Exceptions
+==================
+
+TDX guests behave differently from bare-metal and traditional VMX guests.
+In TDX guests, otherwise normal instructions or memory accesses can cause
+#VE or #GP exceptions.
+
+Instructions marked with an '*' conditionally cause exceptions. The
+details for these instructions are discussed below.
+
+Instruction-based #VE
+---------------------
+
+- Port I/O (INS, OUTS, IN, OUT)
+- HLT
+- MONITOR, MWAIT
+- WBINVD, INVD
+- VMCALL
+- RDMSR*,WRMSR*
+- CPUID*
+
+Instruction-based #GP
+---------------------
+
+- All VMX instructions: INVEPT, INVVPID, VMCLEAR, VMFUNC, VMLAUNCH,
+ VMPTRLD, VMPTRST, VMREAD, VMRESUME, VMWRITE, VMXOFF, VMXON
+- ENCLS, ENCLU
+- GETSEC
+- RSM
+- ENQCMD
+- RDMSR*,WRMSR*
+
+RDMSR/WRMSR Behavior
+--------------------
+
+MSR access behavior falls into three categories:
+
+- #GP generated
+- #VE generated
+- "Just works"
+
+In general, the #GP MSRs should not be used in guests. Their use likely
+indicates a bug in the guest. The guest may try to handle the #GP with a
+hypercall but it is unlikely to succeed.
+
+The #VE MSRs are typically able to be handled by the hypervisor. Guests
+can make a hypercall to the hypervisor to handle the #VE.
+
+The "just works" MSRs do not need any special guest handling. They might
+be implemented by directly passing through the MSR to the hardware or by
+trapping and handling in the TDX module. Other than possibly being slow,
+these MSRs appear to function just as they would on bare metal.
+
+CPUID Behavior
+--------------
+
+For some CPUID leaves and sub-leaves, the virtualized bit fields of CPUID
+return values (in guest EAX/EBX/ECX/EDX) are configurable by the
+hypervisor. For such cases, the Intel TDX module architecture defines two
+virtualization types:
+
+- Bit fields for which the hypervisor controls the value seen by the guest
+ TD.
+
+- Bit fields for which the hypervisor configures the value such that the
+ guest TD either sees their native value or a value of 0. For these bit
+ fields, the hypervisor can mask off the native values, but it can not
+ turn *on* values.
+
+A #VE is generated for CPUID leaves and sub-leaves that the TDX module does
+not know how to handle. The guest kernel may ask the hypervisor for the
+value with a hypercall.
+
+#VE on Memory Accesses
+======================
+
+There are essentially two classes of TDX memory: private and shared.
+Private memory receives full TDX protections. Its content is protected
+against access from the hypervisor. Shared memory is expected to be
+shared between guest and hypervisor and does not receive full TDX
+protections.
+
+A TD guest is in control of whether its memory accesses are treated as
+private or shared. It selects the behavior with a bit in its page table
+entries. This helps ensure that a guest does not place sensitive
+information in shared memory, exposing it to the untrusted hypervisor.
+
+#VE on Shared Memory
+--------------------
+
+Access to shared mappings can cause a #VE. The hypervisor ultimately
+controls whether a shared memory access causes a #VE, so the guest must be
+careful to only reference shared pages for which it can safely handle a #VE. For
+instance, the guest should be careful not to access shared memory in the
+#VE handler before it reads the #VE info structure (TDG.VP.VEINFO.GET).
+
+Shared mapping content is entirely controlled by the hypervisor. The guest
+should only use shared mappings for communicating with the hypervisor.
+Shared mappings must never be used for sensitive memory content like kernel
+stacks. A good rule of thumb is that hypervisor-shared memory should be
+treated the same as memory mapped to userspace. Both the hypervisor and
+userspace are completely untrusted.
+
+MMIO for virtual devices is implemented as shared memory. The guest must
+be careful not to access device MMIO regions unless it is also prepared to
+handle a #VE.
+
+#VE on Private Pages
+--------------------
+
+An access to private mappings can also cause a #VE. Since all kernel
+memory is also private memory, the kernel might theoretically need to
+handle a #VE on arbitrary kernel memory accesses. This is not feasible, so
+TDX guests ensure that all guest memory has been "accepted" before memory
+is used by the kernel.
+
+A modest amount of memory (typically 512M) is pre-accepted by the firmware
+before the kernel runs to ensure that the kernel can start up without
+being subjected to a #VE.
+
+The hypervisor is permitted to unilaterally move accepted pages to a
+"blocked" state. However, if it does this, page access will not generate a
+#VE. It will, instead, cause a "TD Exit" where the hypervisor is required
+to handle the exception.
+
+Linux #VE handler
+=================
+
+Just like page faults or #GP's, #VE exceptions can be either handled or be
+fatal. Typically, an unhandled userspace #VE results in a SIGSEGV.
+An unhandled kernel #VE results in an oops.
+
+Handling nested exceptions on x86 is typically nasty business. A #VE
+could be interrupted by an NMI which triggers another #VE and hilarity
+ensues. The TDX #VE architecture anticipated this scenario and includes a
+feature to make it slightly less nasty.
+
+During #VE handling, the TDX module ensures that all interrupts (including
+NMIs) are blocked. The block remains in place until the guest makes a
+TDG.VP.VEINFO.GET TDCALL. This allows the guest to control when interrupts
+or a new #VE can be delivered.
+
+However, the guest kernel must still be careful to avoid potential
+#VE-triggering actions (discussed above) while this block is in place.
+While the block is in place, any #VE is elevated to a double fault (#DF)
+which is not recoverable.
+
+MMIO handling
+=============
+
+In non-TDX VMs, MMIO is usually implemented by giving a guest access to a
+mapping which will cause a VMEXIT on access, and then the hypervisor
+emulates the access. That is not possible in TDX guests because VMEXIT
+will expose the register state to the host. TDX guests don't trust the host
+and can't have their state exposed to the host.
+
+In TDX, MMIO regions typically trigger a #VE exception in the guest. The
+guest #VE handler then emulates the MMIO instruction inside the guest and
+converts it into a controlled TDCALL to the host, rather than exposing
+guest state to the host.
+
+MMIO addresses on x86 are just special physical addresses. They can
+theoretically be accessed with any instruction that accesses memory.
+However, the kernel instruction decoding method is limited. It is only
+designed to decode instructions like those generated by io.h macros.
+
+MMIO access via other means (like structure overlays) may result in an
+oops.
+
+Shared Memory Conversions
+=========================
+
+All TDX guest memory starts out as private at boot. This memory can not
+be accessed by the hypervisor. However, some kernel users like device
+drivers might have a need to share data with the hypervisor. To do this,
+memory must be converted between shared and private. This can be
+accomplished using some existing memory encryption helpers:
+
+ * set_memory_decrypted() converts a range of pages to shared.
+ * set_memory_encrypted() converts memory back to private.
+
+Device drivers are the primary user of shared memory, but there's no need
+to touch every driver. DMA buffers and ioremap() do the conversions
+automatically.
+
+TDX uses SWIOTLB for most DMA allocations. The SWIOTLB buffer is
+converted to shared on boot.
+
+For coherent DMA allocation, the DMA buffer gets converted on the
+allocation. Check force_dma_unencrypted() for details.
+
+References
+==========
+
+TDX reference material is collected here:
+
+https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html
diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst
index 07aa0007f346..03ec9cf01181 100644
--- a/Documentation/x86/x86_64/boot-options.rst
+++ b/Documentation/x86/x86_64/boot-options.rst
@@ -157,15 +157,6 @@ Rebooting
newer BIOS, or newer board) using this option will ignore the built-in
quirk table, and use the generic default reboot actions.
-Non Executable Mappings
-=======================
-
- noexec=on|off
- on
- Enable(default)
- off
- Disable
-
NUMA
====
@@ -310,3 +301,17 @@ Miscellaneous
Do not use GB pages for kernel direct mappings.
gbpages
Use GB pages for kernel direct mappings.
+
+
+AMD SEV (Secure Encrypted Virtualization)
+=========================================
+Options relating to AMD SEV, specified via the following format:
+
+::
+
+ sev=option1[,option2]
+
+The available options are:
+
+ debug
+ Enable debug messages.
diff --git a/Documentation/x86/zero-page.rst b/Documentation/x86/zero-page.rst
index f088f5881666..45aa9cceb4f1 100644
--- a/Documentation/x86/zero-page.rst
+++ b/Documentation/x86/zero-page.rst
@@ -19,6 +19,7 @@ Offset/Size Proto Name Meaning
058/008 ALL tboot_addr Physical address of tboot shared page
060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information
(struct ist_info)
+070/008 ALL acpi_rsdp_addr Physical address of ACPI RSDP table
080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!!
090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!!
0A0/010 ALL sys_desc_table System description table (struct sys_desc_table),
@@ -27,6 +28,7 @@ Offset/Size Proto Name Meaning
0C0/004 ALL ext_ramdisk_image ramdisk_image high 32bits
0C4/004 ALL ext_ramdisk_size ramdisk_size high 32bits
0C8/004 ALL ext_cmd_line_ptr cmd_line_ptr high 32bits
+13C/004 ALL cc_blob_address Physical address of Confidential Computing blob
140/080 ALL edid_info Video mode setup (struct edid_info)
1C0/020 ALL efi_info EFI 32 information (struct efi_info)
1E0/004 ALL alt_mem_k Alternative mem check, in KB