1079 files changed, 30675 insertions, 13906 deletions
@@ -2049,6 +2049,10 @@ D: pirq addr, CS5535 alsa audio driver
 S: Gurgaon, India
 S: Kuala Lumpur, Malaysia
+N: Mohit Kumar
+D: ST Microelectronics SPEAr13xx PCI host bridge driver
+D: Synopsys Designware PCI host bridge driver
+
 N: Gabor Kuti
 M: seasons@falcon.sch.bme.hu
 M: seasons@makosteszta.sote.hu
diff --git a/Documentation/ABI/testing/sysfs-class-mtd b/Documentation/ABI/testing/sysfs-class-mtd
index 76ee192f80a0..3b5c3bca9186 100644
--- a/Documentation/ABI/testing/sysfs-class-mtd
+++ b/Documentation/ABI/testing/sysfs-class-mtd
@@ -222,3 +222,13 @@ Description:
 		The number of blocks that are marked as reserved, if any, in
 		this partition. These are typically used to store the in-flash
 		bad block table (BBT).
+
+What:		/sys/class/mtd/mtdX/offset
+Date:		March 2015
+KernelVersion:	4.1
+Contact:	linux-mtd@lists.infradead.org
+Description:
+		For a partition, the offset of that partition from the start
+		of the master device in bytes. This attribute is absent on
+		main devices, so it can be used to distinguish between
+		partitions and devices that aren't partitions.
diff --git a/Documentation/ABI/testing/sysfs-driver-toshiba_acpi b/Documentation/ABI/testing/sysfs-driver-toshiba_acpi
index ca9c71a531c5..eed922ef42e5 100644
--- a/Documentation/ABI/testing/sysfs-driver-toshiba_acpi
+++ b/Documentation/ABI/testing/sysfs-driver-toshiba_acpi
@@ -8,9 +8,11 @@ Description:	This file controls the keyboard backlight operation mode, valid
 		* 0x2 -> AUTO (also called TIMER)
 		* 0x8 -> ON
 		* 0x10 -> OFF
-		Note that the kernel 3.16 onwards this file accepts all listed
+		Note that from kernel 3.16 onwards this file accepts all listed
 		parameters, kernel 3.15 only accepts the first two (FN-Z and
 		AUTO).
+		Also note that toggling this value on type 1 devices requires
+		a reboot for changes to take effect.
 Users:		KToshiba
 
 What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_backlight_timeout
@@ -67,15 +69,72 @@ Description:	This file shows the current keyboard backlight type,
 		* 2 -> Type 2, supporting modes TIMER, ON and OFF
 Users:		KToshiba
 
+What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/usb_sleep_charge
+Date:		January 23, 2015
+KernelVersion:	4.0
+Contact:	Azael Avalos <coproscefalo@gmail.com>
+Description:	This file controls the USB Sleep & Charge charging mode, which
+		can be:
+		* 0 -> Disabled  (0x00)
+		* 1 -> Alternate (0x09)
+		* 2 -> Auto      (0x21)
+		* 3 -> Typical   (0x11)
+		Note that from kernel 4.1 onwards this file accepts all listed
+		values, kernel 4.0 only supports the first three.
+		Note that this feature only works when connected to power; if
+		you want to use it under battery, see the entry named
+		"sleep_functions_on_battery".
+Users:		KToshiba
+
+What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/sleep_functions_on_battery
+Date:		January 23, 2015
+KernelVersion:	4.0
+Contact:	Azael Avalos <coproscefalo@gmail.com>
+Description:	This file controls the USB Sleep Functions under battery, and
+		sets the battery level at which they will be disabled; accepted
+		values are:
+		* 0 -> Disabled
+		* 1-100 -> Battery level to disable sleep functions
+		Currently it prints two values: the first indicates whether the
+		feature is enabled or disabled, while the second shows the
+		currently set battery level.
+		Note that when the value is set to disabled, the sleep function
+		will only work when connected to power.
+Users:		KToshiba
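[Editor's note: since sleep_functions_on_battery prints two fields, a
userspace consumer has to parse both. A minimal C sketch, assuming a
hypothetical TOS6200 device node -- match whatever TOS* device your
platform actually exposes:

    #include <stdio.h>

    int main(void)
    {
        int enabled, level;
        FILE *f = fopen("/sys/devices/LNXSYSTM:00/LNXSYBUS:00/"
                        "TOS6200:00/sleep_functions_on_battery", "r");

        if (!f || fscanf(f, "%d %d", &enabled, &level) != 2)
            return 1;   /* attribute missing or unexpected format */
        fclose(f);
        printf("enabled=%d, disable below %d%%\n", enabled, level);
        return 0;
    }
]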
+
+What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/usb_rapid_charge
+Date:		January 23, 2015
+KernelVersion:	4.0
+Contact:	Azael Avalos <coproscefalo@gmail.com>
+Description:	This file controls the USB Rapid Charge state, which can be:
+		* 0 -> Disabled
+		* 1 -> Enabled
+		Note that toggling this value requires a reboot for changes to
+		take effect.
+Users:		KToshiba
+
+What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/usb_sleep_music
+Date:		January 23, 2015
+KernelVersion:	4.0
+Contact:	Azael Avalos <coproscefalo@gmail.com>
+Description:	This file controls the Sleep & Music state, which can be:
+		* 0 -> Disabled
+		* 1 -> Enabled
+		Note that this feature only works when connected to power; if
+		you want to use it under battery, see the entry named
+		"sleep_functions_on_battery".
+Users:		KToshiba
+
 What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/version
-Date:		February, 2015
-KernelVersion:	3.20
+Date:		February 12, 2015
+KernelVersion:	4.0
 Contact:	Azael Avalos <coproscefalo@gmail.com>
 Description:	This file shows the current version of the driver
+Users:		KToshiba
 
 What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/fan
-Date:		February, 2015
-KernelVersion:	3.20
+Date:		February 12, 2015
+KernelVersion:	4.0
 Contact:	Azael Avalos <coproscefalo@gmail.com>
 Description:	This file controls the state of the internal fan, valid
 		values are:
@@ -83,8 +142,8 @@ Description:	This file controls the state of the internal fan, valid
 		* 1 -> ON
 
 What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/kbd_function_keys
-Date:		February, 2015
-KernelVersion:	3.20
+Date:		February 12, 2015
+KernelVersion:	4.0
 Contact:	Azael Avalos <coproscefalo@gmail.com>
 Description:	This file controls the Special Functions (hotkeys) operation
 		mode, valid values are:
@@ -94,21 +153,29 @@ Description:	This file controls the Special Functions (hotkeys) operation
 		and the hotkeys are accessed via FN-F{1-12}.
 		In the "Special Functions" mode, the F{1-12} keys trigger the
 		hotkey and the F{1-12} keys are accessed via FN-F{1-12}.
+		Note that toggling this value requires a reboot for changes to
+		take effect.
+Users:		KToshiba
 
 What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/panel_power_on
-Date:		February, 2015
-KernelVersion:	3.20
+Date:		February 12, 2015
+KernelVersion:	4.0
 Contact:	Azael Avalos <coproscefalo@gmail.com>
 Description:	This file controls whether the laptop should turn ON whenever
 		the LID is opened, valid values are:
 		* 0 -> Disabled
 		* 1 -> Enabled
+		Note that toggling this value requires a reboot for changes to
+		take effect.
+Users:		KToshiba
 
 What:		/sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS{1900,620{0,7,8}}:00/usb_three
-Date:		February, 2015
-KernelVersion:	3.20
+Date:		February 12, 2015
+KernelVersion:	4.0
 Contact:	Azael Avalos <coproscefalo@gmail.com>
-Description:	This file controls whether the USB 3 functionality, valid
-		values are:
+Description:	This file controls the USB 3 functionality, valid values are:
 		* 0 -> Disabled (Acts as a regular USB 2)
 		* 1 -> Enabled (Full USB 3 functionality)
+		Note that toggling this value requires a reboot for changes to
+		take effect.
+Users:		KToshiba
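[Editor's note: several of the attributes above only take effect after a
reboot. A minimal C sketch for toggling one of them from userspace, again
assuming a hypothetical TOS6200 device node:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/sys/devices/LNXSYSTM:00/LNXSYBUS:00/"
                        "TOS6200:00/usb_rapid_charge", "w");

        if (!f)
            return 1;      /* driver not loaded or unsupported device */
        fputc('1', f);     /* 1 -> Enabled; a reboot is still required */
        fclose(f);
        return 0;
    }
]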
diff --git a/Documentation/ABI/testing/sysfs-platform-dell-laptop b/Documentation/ABI/testing/sysfs-platform-dell-laptop
new file mode 100644
index 000000000000..8c6a0b8e1131
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-dell-laptop
@@ -0,0 +1,69 @@
+What:		/sys/class/leds/dell::kbd_backlight/als_enabled
+Date:		December 2014
+KernelVersion:	3.19
+Contact:	Gabriele Mazzotta <gabriele.mzt@gmail.com>,
+		Pali Rohár <pali.rohar@gmail.com>
+Description:
+		This file allows one to control the automatic keyboard
+		illumination mode on some systems that have an ambient
+		light sensor. Write 1 to this file to enable the auto
+		mode, 0 to disable it.
+
+What:		/sys/class/leds/dell::kbd_backlight/als_setting
+Date:		December 2014
+KernelVersion:	3.19
+Contact:	Gabriele Mazzotta <gabriele.mzt@gmail.com>,
+		Pali Rohár <pali.rohar@gmail.com>
+Description:
+		This file allows one to specify the on/off threshold value,
+		as reported by the ambient light sensor.
+
+What:		/sys/class/leds/dell::kbd_backlight/start_triggers
+Date:		December 2014
+KernelVersion:	3.19
+Contact:	Gabriele Mazzotta <gabriele.mzt@gmail.com>,
+		Pali Rohár <pali.rohar@gmail.com>
+Description:
+		This file allows one to control the input triggers that
+		turn the keyboard backlight illumination back on after it
+		has been disabled because of inactivity.
+		Read the file to see the triggers available. The ones
+		enabled are preceded by '+', those disabled by '-'.
+
+		To enable a trigger, write its name preceded by '+' to
+		this file. To disable a trigger, write its name preceded
+		by '-' instead.
+
+		For example, to enable the keyboard as trigger run:
+		echo +keyboard > /sys/class/leds/dell::kbd_backlight/start_triggers
+		To disable it:
+		echo -keyboard > /sys/class/leds/dell::kbd_backlight/start_triggers
+
+		Note that not all the available triggers can be configured.
+
+What:		/sys/class/leds/dell::kbd_backlight/stop_timeout
+Date:		December 2014
+KernelVersion:	3.19
+Contact:	Gabriele Mazzotta <gabriele.mzt@gmail.com>,
+		Pali Rohár <pali.rohar@gmail.com>
+Description:
+		This file allows one to specify the interval after which the
+		keyboard illumination is disabled because of inactivity.
+		The timeouts are expressed in seconds, minutes, hours and
+		days, for which the symbols are 's', 'm', 'h' and 'd'
+		respectively.
+
+		To configure the timeout, write to this file a value along
+		with any of the above units. If no unit is specified, the
+		value is assumed to be expressed in seconds.
+
+		For example, to set the timeout to 10 minutes run:
+		echo 10m > /sys/class/leds/dell::kbd_backlight/stop_timeout
+
+		Note that when this file is read, the returned value might be
+		expressed in a different unit than the one used when the
+		timeout was set.
+
+		Also note that only some timeouts are supported and that
+		some systems might fall back to a specific timeout in case
+		an invalid timeout is written to this file.
diff --git a/Documentation/arm64/acpi_object_usage.txt b/Documentation/arm64/acpi_object_usage.txt
new file mode 100644
index 000000000000..a6e1a1805e51
--- /dev/null
+++ b/Documentation/arm64/acpi_object_usage.txt
@@ -0,0 +1,593 @@
+ACPI Tables
+-----------
+The expectations for individual ACPI tables are discussed in the list that
+follows.
+
+If a section number is used, it refers to a section number in the ACPI
+specification where the object is defined.
If "Signature Reserved" is used, +the table signature (the first four bytes of the table) is the only portion +of the table recognized by the specification, and the actual table is defined +outside of the UEFI Forum (see Section 5.2.6 of the specification). + +For ACPI on arm64, tables also fall into the following categories: + + -- Required: DSDT, FADT, GTDT, MADT, MCFG, RSDP, SPCR, XSDT + + -- Recommended: BERT, EINJ, ERST, HEST, SSDT + + -- Optional: BGRT, CPEP, CSRT, DRTM, ECDT, FACS, FPDT, MCHI, MPST, + MSCT, RASF, SBST, SLIT, SPMI, SRAT, TCPA, TPM2, UEFI + + -- Not supported: BOOT, DBG2, DBGP, DMAR, ETDT, HPET, IBFT, IVRS, + LPIT, MSDM, RSDT, SLIC, WAET, WDAT, WDRT, WPBT + + +Table Usage for ARMv8 Linux +----- ---------------------------------------------------------------- +BERT Section 18.3 (signature == "BERT") + == Boot Error Record Table == + Must be supplied if RAS support is provided by the platform. It + is recommended this table be supplied. + +BOOT Signature Reserved (signature == "BOOT") + == simple BOOT flag table == + Microsoft only table, will not be supported. + +BGRT Section 5.2.22 (signature == "BGRT") + == Boot Graphics Resource Table == + Optional, not currently supported, with no real use-case for an + ARM server. + +CPEP Section 5.2.18 (signature == "CPEP") + == Corrected Platform Error Polling table == + Optional, not currently supported, and not recommended until such + time as ARM-compatible hardware is available, and the specification + suitably modified. + +CSRT Signature Reserved (signature == "CSRT") + == Core System Resources Table == + Optional, not currently supported. + +DBG2 Signature Reserved (signature == "DBG2") + == DeBuG port table 2 == + Microsoft only table, will not be supported. + +DBGP Signature Reserved (signature == "DBGP") + == DeBuG Port table == + Microsoft only table, will not be supported. + +DSDT Section 5.2.11.1 (signature == "DSDT") + == Differentiated System Description Table == + A DSDT is required; see also SSDT. + + ACPI tables contain only one DSDT but can contain one or more SSDTs, + which are optional. Each SSDT can only add to the ACPI namespace, + but cannot modify or replace anything in the DSDT. + +DMAR Signature Reserved (signature == "DMAR") + == DMA Remapping table == + x86 only table, will not be supported. + +DRTM Signature Reserved (signature == "DRTM") + == Dynamic Root of Trust for Measurement table == + Optional, not currently supported. + +ECDT Section 5.2.16 (signature == "ECDT") + == Embedded Controller Description Table == + Optional, not currently supported, but could be used on ARM if and + only if one uses the GPE_BIT field to represent an IRQ number, since + there are no GPE blocks defined in hardware reduced mode. This would + need to be modified in the ACPI specification. + +EINJ Section 18.6 (signature == "EINJ") + == Error Injection table == + This table is very useful for testing platform response to error + conditions; it allows one to inject an error into the system as + if it had actually occurred. However, this table should not be + shipped with a production system; it should be dynamically loaded + and executed with the ACPICA tools only during testing. + +ERST Section 18.5 (signature == "ERST") + == Error Record Serialization Table == + On a platform supports RAS, this table must be supplied if it is not + UEFI-based; if it is UEFI-based, this table may be supplied. 
+       When this table is not present, the UEFI runtime services will be
+       used to save and retrieve hardware error information to and from a
+       persistent store.
+
+ETDT   Signature Reserved (signature == "ETDT")
+       == Event Timer Description Table ==
+       Obsolete table, will not be supported.
+
+FACS   Section 5.2.10 (signature == "FACS")
+       == Firmware ACPI Control Structure ==
+       It is unlikely that this table will be terribly useful. If it is
+       provided, the Global Lock will NOT be used since it is not part of
+       the hardware reduced profile, and only 64-bit address fields will
+       be considered valid.
+
+FADT   Section 5.2.9 (signature == "FACP")
+       == Fixed ACPI Description Table ==
+       Required for arm64.
+
+       The HW_REDUCED_ACPI flag must be set. All of the fields that are
+       to be ignored when HW_REDUCED_ACPI is set are expected to be set to
+       zero.
+
+       If an FACS table is provided, the X_FIRMWARE_CTRL field is to be
+       used, not FIRMWARE_CTRL.
+
+       If PSCI is used (as is recommended), make sure that ARM_BOOT_ARCH is
+       filled in properly -- that the PSCI_COMPLIANT flag is set and that
+       PSCI_USE_HVC is set or unset as needed (see table 5-37).
+
+       For the DSDT that is also required, the X_DSDT field is to be used,
+       not the DSDT field.
+
+FPDT   Section 5.2.23 (signature == "FPDT")
+       == Firmware Performance Data Table ==
+       Optional, not currently supported.
+
+GTDT   Section 5.2.24 (signature == "GTDT")
+       == Generic Timer Description Table ==
+       Required for arm64.
+
+HEST   Section 18.3.2 (signature == "HEST")
+       == Hardware Error Source Table ==
+       Until further error source types are defined, use only types 6 (AER
+       Root Port), 7 (AER Endpoint), 8 (AER Bridge), or 9 (Generic Hardware
+       Error Source). Firmware first error handling is possible if and only
+       if Trusted Firmware is being used on arm64.
+
+       Must be supplied if RAS support is provided by the platform. It
+       is recommended this table be supplied.
+
+HPET   Signature Reserved (signature == "HPET")
+       == High Precision Event timer Table ==
+       x86 only table, will not be supported.
+
+IBFT   Signature Reserved (signature == "IBFT")
+       == iSCSI Boot Firmware Table ==
+       Microsoft defined table, support TBD.
+
+IVRS   Signature Reserved (signature == "IVRS")
+       == I/O Virtualization Reporting Structure ==
+       x86_64 (AMD) only table, will not be supported.
+
+LPIT   Signature Reserved (signature == "LPIT")
+       == Low Power Idle Table ==
+       x86 only table as of ACPI 5.1; future versions have been adapted for
+       use with ARM and will be recommended in order to support ACPI power
+       management.
+
+MADT   Section 5.2.12 (signature == "APIC")
+       == Multiple APIC Description Table ==
+       Required for arm64. Only the GIC interrupt controller structures
+       should be used (types 0xA - 0xE).
+
+MCFG   Signature Reserved (signature == "MCFG")
+       == Memory-mapped ConFiGuration space ==
+       If the platform supports PCI/PCIe, an MCFG table is required.
+
+MCHI   Signature Reserved (signature == "MCHI")
+       == Management Controller Host Interface table ==
+       Optional, not currently supported.
+
+MPST   Section 5.2.21 (signature == "MPST")
+       == Memory Power State Table ==
+       Optional, not currently supported.
+
+MSDM   Signature Reserved (signature == "MSDM")
+       == Microsoft Data Management table ==
+       Microsoft only table, will not be supported.
+
+MSCT   Section 5.2.19 (signature == "MSCT")
+       == Maximum System Characteristic Table ==
+       Optional, not currently supported.
+
+RASF   Section 5.2.20 (signature == "RASF")
+       == RAS Feature table ==
+       Optional, not currently supported.
+
+RSDP   Section 5.2.5 (signature == "RSD PTR")
+       == Root System Description PoinTeR ==
+       Required for arm64.
+
+RSDT   Section 5.2.7 (signature == "RSDT")
+       == Root System Description Table ==
+       Since this table can only provide 32-bit addresses, it is deprecated
+       on arm64, and will not be used.
+
+SBST   Section 5.2.14 (signature == "SBST")
+       == Smart Battery Subsystem Table ==
+       Optional, not currently supported.
+
+SLIC   Signature Reserved (signature == "SLIC")
+       == Software LIcensing table ==
+       Microsoft only table, will not be supported.
+
+SLIT   Section 5.2.17 (signature == "SLIT")
+       == System Locality distance Information Table ==
+       Optional in general, but required for NUMA systems.
+
+SPCR   Signature Reserved (signature == "SPCR")
+       == Serial Port Console Redirection table ==
+       Required for arm64.
+
+SPMI   Signature Reserved (signature == "SPMI")
+       == Server Platform Management Interface table ==
+       Optional, not currently supported.
+
+SRAT   Section 5.2.16 (signature == "SRAT")
+       == System Resource Affinity Table ==
+       Optional, but if used, only the GICC Affinity structures are read.
+       To support NUMA, this table is required.
+
+SSDT   Section 5.2.11.2 (signature == "SSDT")
+       == Secondary System Description Table ==
+       These tables are a continuation of the DSDT; these are recommended
+       for use with devices that can be added to a running system, but can
+       also serve the purpose of dividing up device descriptions into more
+       manageable pieces.
+
+       An SSDT can only ADD to the ACPI namespace. It cannot modify or
+       replace existing device descriptions already in the namespace.
+
+       These tables are optional, however. ACPI tables should contain only
+       one DSDT but can contain many SSDTs.
+
+TCPA   Signature Reserved (signature == "TCPA")
+       == Trusted Computing Platform Alliance table ==
+       Optional, not currently supported, and may need changes to fully
+       interoperate with arm64.
+
+TPM2   Signature Reserved (signature == "TPM2")
+       == Trusted Platform Module 2 table ==
+       Optional, not currently supported, and may need changes to fully
+       interoperate with arm64.
+
+UEFI   Signature Reserved (signature == "UEFI")
+       == UEFI ACPI data table ==
+       Optional, not currently supported. No known use case for arm64,
+       at present.
+
+WAET   Signature Reserved (signature == "WAET")
+       == Windows ACPI Emulated devices Table ==
+       Microsoft only table, will not be supported.
+
+WDAT   Signature Reserved (signature == "WDAT")
+       == Watch Dog Action Table ==
+       Microsoft only table, will not be supported.
+
+WDRT   Signature Reserved (signature == "WDRT")
+       == Watch Dog Resource Table ==
+       Microsoft only table, will not be supported.
+
+WPBT   Signature Reserved (signature == "WPBT")
+       == Windows Platform Binary Table ==
+       Microsoft only table, will not be supported.
+
+XSDT   Section 5.2.8 (signature == "XSDT")
+       == eXtended System Description Table ==
+       Required for arm64.
+
+
+ACPI Objects
+------------
+The expectations on individual ACPI objects are discussed in the list that
+follows:
+
+Name   Section      Usage for ARMv8 Linux
+----   ------------ -------------------------------------------------
+_ADR   6.1.1        Use as needed.
+
+_BBN   6.5.5        Use as needed; PCI-specific.
+
+_BDN   6.5.3        Optional; not likely to be used on arm64.
+
+_CCA   6.2.17       This method should be defined for all bus masters
+                    on arm64. While cache coherency is assumed, making
+                    it explicit ensures the kernel will set up DMA as
+                    it should.
+
+_CDM   6.2.1        Optional, to be used only for processor devices.
+
+_CID   6.1.2        Use as needed.
+
+_CLS   6.1.3        Use as needed.
+
+_CRS   6.2.2        Required on arm64.
+
+_DCK   6.5.2        Optional; not likely to be used on arm64.
+
+_DDN   6.1.4        This field can be used for a device name. However,
+                    it is meant for DOS device names (e.g., COM1), so be
+                    careful of its use across OSes.
+
+_DEP   6.5.8        Use as needed.
+
+_DIS   6.2.3        Optional, for power management use.
+
+_DLM   5.7.5        Optional.
+
+_DMA   6.2.4        Optional.
+
+_DSD   6.2.5        To be used with caution. If this object is used, try
+                    to use it within the constraints already defined by the
+                    Device Properties UUID. Only in rare circumstances
+                    should it be necessary to create a new _DSD UUID.
+
+                    In either case, submit the _DSD definition along with
+                    any driver patches for discussion, especially when
+                    device properties are used. A driver will not be
+                    considered complete without a corresponding _DSD
+                    description. Once approved by kernel maintainers,
+                    the UUID or device properties must then be registered
+                    with the UEFI Forum; this may cause some iteration as
+                    more than one OS will be registering entries.
+
+_DSM                Do not use this method. It is not standardized, the
+                    return values are not well documented, and it is
+                    currently a frequent source of error.
+
+_DSW   7.2.1        Use as needed; power management specific.
+
+_EDL   6.3.1        Optional.
+
+_EJD   6.3.2        Optional.
+
+_EJx   6.3.3        Optional.
+
+_FIX   6.2.7        x86 specific, not used on arm64.
+
+\_GL   5.7.1        This object is not to be used in hardware reduced
+                    mode, and therefore should not be used on arm64.
+
+_GLK   6.5.7        This object requires a global lock be defined; there
+                    is no global lock on arm64 since it runs in hardware
+                    reduced mode. Hence, do not use this object on arm64.
+
+\_GPE  5.3.1        This namespace is for x86 use only. Do not use it
+                    on arm64.
+
+_GSB   6.2.7        Optional.
+
+_HID   6.1.5        Use as needed. This is the primary object to use in
+                    device probing, though _CID and _CLS may also be used.
+
+_HPP   6.2.8        Optional, PCI specific.
+
+_HPX   6.2.9        Optional, PCI specific.
+
+_HRV   6.1.6        Optional, use as needed to clarify device behavior; in
+                    some cases, this may be easier to use than _DSD.
+
+_INI   6.5.1        Not required, but can be useful in setting up devices
+                    when UEFI leaves them in a state that may not be what
+                    the driver expects before it starts probing.
+
+_IRC   7.2.15       Use as needed; power management specific.
+
+_LCK   6.3.4        Optional.
+
+_MAT   6.2.10       Optional; see also the MADT.
+
+_MLS   6.1.7        Optional, but highly recommended for use in
+                    internationalization.
+
+_OFF   7.1.2        It is recommended to define this method for any device
+                    that can be turned on or off.
+
+_ON    7.1.3        It is recommended to define this method for any device
+                    that can be turned on or off.
+
+\_OS   5.7.3        This method will return "Linux" by default (this is
+                    the value of the macro ACPI_OS_NAME on Linux). The
+                    command line parameter acpi_os=<string> can be used
+                    to set it to some other value.
+
+_OSC   6.2.11       This method can be a global method in ACPI (i.e.,
+                    \_SB._OSC), or it may be associated with a specific
+                    device (e.g., \_SB.DEV0._OSC), or both. When used
+                    as a global method, only capabilities published in
+                    the ACPI specification are allowed. When used as
+                    a device-specific method, the process described for
+                    using _DSD MUST be used to create an _OSC definition;
+                    out-of-process use of _OSC is not allowed. That is,
+                    submit the device-specific _OSC usage description as
+                    part of the kernel driver submission, get it approved
+                    by the kernel community, then register it with the
+                    UEFI Forum.
+
+\_OSI  5.7.2        Deprecated on ARM64.
+                    Any invocation of this method will print a warning on
+                    the console and return false. That is, as far as ACPI
+                    firmware is concerned, _OSI cannot be used to determine
+                    what sort of system is being used or what functionality
+                    is provided. The _OSC method is to be used instead.
+
+_OST   6.3.5        Optional.
+
+_PDC   8.4.1        Deprecated, do not use on arm64.
+
+\_PIC  5.8.1        The method should not be used. On arm64, the only
+                    interrupt model available is GIC.
+
+_PLD   6.1.8        Optional.
+
+\_PR   5.3.1        This namespace is for x86 use only on legacy systems.
+                    Do not use it on arm64.
+
+_PRS   6.2.12       Optional.
+
+_PRT   6.2.13       Required as part of the definition of all PCI root
+                    devices.
+
+_PRW   7.2.13       Use as needed; power management specific.
+
+_PRx   7.2.8-11     Use as needed; power management specific. If _PR0 is
+                    defined, _PR3 must also be defined.
+
+_PSC   7.2.6        Use as needed; power management specific.
+
+_PSE   7.2.7        Use as needed; power management specific.
+
+_PSW   7.2.14       Use as needed; power management specific.
+
+_PSx   7.2.2-5      Use as needed; power management specific. If _PS0 is
+                    defined, _PS3 must also be defined. If clocks or
+                    regulators need adjusting to be consistent with power
+                    usage, change them in these methods.
+
+\_PTS  7.3.1        Use as needed; power management specific.
+
+_PXM   6.2.14       Optional.
+
+_REG   6.5.4        Use as needed.
+
+\_REV  5.7.4        Always returns the latest version of ACPI supported.
+
+_RMV   6.3.6        Optional.
+
+\_SB   5.3.1        Required on arm64; all devices must be defined in this
+                    namespace.
+
+_SEG   6.5.6        Use as needed; PCI-specific.
+
+\_SI   5.3.1,       Optional.
+       9.1
+
+_SLI   6.2.15       Optional; recommended when SLIT table is in use.
+
+_STA   6.3.7,       It is recommended to define this method for any device
+       7.1.4        that can be turned on or off.
+
+_SRS   6.2.16       Optional; see also _PRS.
+
+_STR   6.1.10       Recommended for conveying device names to end users;
+                    this is preferred over using _DDN.
+
+_SUB   6.1.9        Use as needed; _HID or _CID are preferred.
+
+_SUN   6.1.11       Optional.
+
+\_Sx   7.3.2        Use as needed; power management specific.
+
+_SxD   7.2.16-19    Use as needed; power management specific.
+
+_SxW   7.2.20-24    Use as needed; power management specific.
+
+_SWS   7.3.3        Use as needed; power management specific; this may
+                    require specification changes for use on arm64.
+
+\_TTS  7.3.4        Use as needed; power management specific.
+
+\_TZ   5.3.1        Optional.
+
+_UID   6.1.12       Recommended for distinguishing devices of the same
+                    class; define it if at all possible.
+
+\_WAK  7.3.5        Use as needed; power management specific.
+
+
+ACPI Event Model
+----------------
+Do not use GPE block devices; these are not supported in the hardware reduced
+profile used by arm64. Since there are no GPE blocks defined for use on ARM
+platforms, GPIO-signaled interrupts should be used for creating system events.
+
+
+ACPI Processor Control
+----------------------
+Section 8 of the ACPI specification is currently undergoing change that
+should be completed in the 6.0 version of the specification. Processor
+performance control will be handled differently for arm64 at that point
+in time. Processor aggregator devices (section 8.5) will not be used,
+for example, but another similar mechanism instead.
+
+While UEFI constrains what we can say until the release of 6.0, it is
+recommended that CPPC (8.4.5) be used as the primary model. This will
+still be useful into the future. C-states and P-states will still be
+provided, but most of the current design work appears to favor CPPC.
+
+Further, it is essential that the ARMv8 SoC provide a fully functional
+implementation of PSCI; this will be the only mechanism supported by ACPI
+to control CPU power state (including secondary CPU booting).
+
+More details will be provided on the release of the ACPI 6.0 specification.
+
+
+ACPI System Address Map Interfaces
+----------------------------------
+In Section 15 of the ACPI specification, several methods are mentioned as
+possible mechanisms for conveying memory resource information to the kernel.
+For arm64, we will only support UEFI for booting with ACPI, hence the UEFI
+GetMemoryMap() boot service is the only mechanism that will be used.
+
+
+ACPI Platform Error Interfaces (APEI)
+-------------------------------------
+The APEI tables supported are described above.
+
+APEI requires the equivalent of an SCI and an NMI on ARMv8. The SCI is used
+to notify the OSPM of errors that have occurred but can be corrected and the
+system can continue correct operation, even if possibly degraded. The NMI is
+used to indicate fatal errors that cannot be corrected, and require immediate
+attention.
+
+Since there is no direct equivalent of the x86 SCI or NMI, arm64 handles
+these slightly differently. The SCI is handled as a normal GPIO-signaled
+interrupt; given that these are corrected (or correctable) errors being
+reported, this is sufficient. The NMI is emulated as the highest priority
+GPIO-signaled interrupt possible. This implies some caution must be used
+since there could be interrupts at higher privilege levels or even interrupts
+at the same priority as the emulated NMI. In Linux, this should not be the
+case but one should be aware it could happen.
+
+
+ACPI Objects Not Supported on ARM64
+-----------------------------------
+While this may change in the future, there are several classes of objects
+that can be defined, but are not currently of general interest to ARM servers.
+
+These are not supported:
+
+       -- Section 9.2: ambient light sensor devices
+
+       -- Section 9.3: battery devices
+
+       -- Section 9.4: lids (e.g., laptop lids)
+
+       -- Section 9.8.2: IDE controllers
+
+       -- Section 9.9: floppy controllers
+
+       -- Section 9.10: GPE block devices
+
+       -- Section 9.15: PC/AT RTC/CMOS devices
+
+       -- Section 9.16: user presence detection devices
+
+       -- Section 9.17: I/O APIC devices; all GICs must be enumerable via MADT
+
+       -- Section 9.18: time and alarm devices (see 9.15)
+
+
+ACPI Objects Not Yet Implemented
+--------------------------------
+While these objects have x86 equivalents, and they do make some sense in ARM
+servers, there is either no hardware available at present, or in some cases
+there may not yet be a non-ARM implementation. Hence, they are currently not
+implemented, though that may change in the future.
+
+Not yet implemented are:
+
+       -- Section 10: power source and power meter devices
+
+       -- Section 11: thermal management
+
+       -- Section 12: embedded controllers interface
+
+       -- Section 13: SMBus interfaces
+
+       -- Section 17: NUMA support (prototypes have been submitted for
+          review)
diff --git a/Documentation/arm64/arm-acpi.txt b/Documentation/arm64/arm-acpi.txt
new file mode 100644
index 000000000000..570a4f8e1a01
--- /dev/null
+++ b/Documentation/arm64/arm-acpi.txt
@@ -0,0 +1,505 @@
+ACPI on ARMv8 Servers
+---------------------
+ACPI can be used for ARMv8 general purpose servers designed to follow
+the ARM SBSA (Server Base System Architecture) [0] and SBBR (Server
+Base Boot Requirements) [1] specifications.
+Please note that the SBBR can be retrieved simply by visiting [1], but the
+SBSA is currently only available to those with an ARM login due to ARM IP
+licensing concerns.
+
+The ARMv8 kernel implements the reduced hardware model of ACPI version
+5.1 or later. Links to the specification and all external documents
+it refers to are managed by the UEFI Forum. The specification is
+available at http://www.uefi.org/specifications and documents referenced
+by the specification can be found via http://www.uefi.org/acpi.
+
+If an ARMv8 system does not meet the requirements of the SBSA and SBBR,
+or cannot be described using the mechanisms defined in the required ACPI
+specifications, then ACPI may not be a good fit for the hardware.
+
+While the documents mentioned above set out the requirements for building
+industry-standard ARMv8 servers, they also apply to more than one operating
+system. The purpose of this document is to describe the interaction between
+ACPI and Linux only, on an ARMv8 system -- that is, what Linux expects of
+ACPI and what ACPI can expect of Linux.
+
+
+Why ACPI on ARM?
+----------------
+Before examining the details of the interface between ACPI and Linux, it is
+useful to understand why ACPI is being used. Several technologies already
+exist in Linux for describing non-enumerable hardware, after all. In this
+section we summarize a blog post [2] from Grant Likely that outlines the
+reasoning behind ACPI on ARMv8 servers. Actually, we snitch a good portion
+of the summary text almost directly, to be honest.
+
+The short form of the rationale for ACPI on ARM is:
+
+-- ACPI’s bytecode (AML) allows the platform to encode hardware behavior,
+   while DT explicitly does not support this. For hardware vendors, being
+   able to encode behavior is a key tool used in supporting operating
+   system releases on new hardware.
+
+-- ACPI’s OSPM defines a power management model that constrains what the
+   platform is allowed to do into a specific model, while still providing
+   flexibility in hardware design.
+
+-- In the enterprise server environment, ACPI has established bindings (such
+   as for RAS) which are currently used in production systems. DT does not.
+   Such bindings could be defined in DT at some point, but doing so means ARM
+   and x86 would end up using completely different code paths in both firmware
+   and the kernel.
+
+-- Choosing a single interface to describe the abstraction between a platform
+   and an OS is important. Hardware vendors would not be required to implement
+   both DT and ACPI if they want to support multiple operating systems. And,
+   agreeing on a single interface instead of being fragmented into per-OS
+   interfaces makes for better interoperability overall.
+
+-- The new ACPI governance process works well and Linux is now at the same
+   table as hardware vendors and other OS vendors. In fact, there is no
+   longer any reason to feel that ACPI belongs only to Windows or that
+   Linux is in any way secondary to Microsoft in this arena. The move of
+   ACPI governance into the UEFI forum has significantly opened up the
+   specification development process, and currently, a large portion of the
+   changes being made to ACPI is being driven by Linux.
+
+Key to the use of ACPI is the support model. For servers in general, the
+responsibility for hardware behaviour cannot solely be the domain of the
+kernel, but rather must be split between the platform and the kernel, in
+order to allow for orderly change over time.
+ACPI frees the OS from needing to understand all the minute details of
+the hardware so that the OS doesn’t need to be ported to each and every
+device individually. It allows the hardware vendors to take responsibility
+for power management behaviour without depending on an OS release cycle
+which is not under their control.
+
+ACPI is also important because hardware and OS vendors have already worked
+out the mechanisms for supporting a general purpose computing ecosystem. The
+infrastructure is in place, the bindings are in place, and the processes are
+in place. DT does exactly what Linux needs it to when working with vertically
+integrated devices, but there are no good processes for supporting what the
+server vendors need. Linux could potentially get there with DT, but doing so
+really just duplicates something that already works. ACPI already does what
+the hardware vendors need, Microsoft won’t collaborate on DT, and hardware
+vendors would still end up providing two completely separate firmware
+interfaces -- one for Linux and one for Windows.
+
+
+Kernel Compatibility
+--------------------
+One of the primary motivations for ACPI is standardization, and using that
+to provide backward compatibility for Linux kernels. In the server market,
+software and hardware are often used for long periods. ACPI allows the
+kernel and firmware to agree on a consistent abstraction that can be
+maintained over time, even as hardware or software change. As long as the
+abstraction is supported, systems can be updated without necessarily having
+to replace the kernel.
+
+When a Linux driver or subsystem is first implemented using ACPI, it by
+definition ends up requiring a specific version of the ACPI specification
+-- its baseline. ACPI firmware must continue to work, even if not optimally,
+with the earliest kernel version that first provides support for that
+baseline version of ACPI. There may be a need for additional drivers, but
+adding new functionality (e.g., CPU power management) should not break
+older kernel versions. Further, ACPI firmware must also work with the most
+recent version of the kernel.
+
+
+Relationship with Device Tree
+-----------------------------
+ACPI support in drivers and subsystems for ARMv8 should never be mutually
+exclusive with DT support at compile time.
+
+At boot time the kernel will only use one description method depending on
+parameters passed from the bootloader (including kernel bootargs).
+
+Regardless of whether DT or ACPI is used, the kernel must always be capable
+of booting with either scheme (in kernels with both schemes enabled at compile
+time).
+
+
+Booting using ACPI tables
+-------------------------
+The only defined method for passing ACPI tables to the kernel on ARMv8
+is via the UEFI system configuration table. Just so it is explicit, this
+means that ACPI is only supported on platforms that boot via UEFI.
+
+When an ARMv8 system boots, it can either have DT information, ACPI tables,
+or in some very unusual cases, both. If no command line parameters are used,
+the kernel will try to use DT for device enumeration; if there is no DT
+present, the kernel will try to use ACPI tables, but only if they are present.
+If neither is available, the kernel will not boot. If acpi=force is used
+on the command line, the kernel will attempt to use ACPI tables first, but
+fall back to DT if there are no ACPI tables present. The basic idea is that
+the kernel will not fail to boot unless it absolutely has no other choice.
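[Editor's note: the table-discovery sequence described in the next few
paragraphs starts from the RSDP entry in the UEFI configuration table. A
minimal sketch of that lookup, assuming the kernel's standard efi helpers
(the real logic lives in acpi_os_get_root_pointer()):

    #include <linux/efi.h>
    #include <linux/acpi.h>

    static acpi_physical_address probe_rsdp(void)
    {
            /* efi.acpi20 holds the ACPI_20_TABLE_GUID entry, if any */
            if (efi_enabled(EFI_CONFIG_TABLES) &&
                efi.acpi20 != EFI_INVALID_TABLE_ADDR)
                    return (acpi_physical_address)efi.acpi20;

            return 0;   /* no RSDP: disable ACPI, fall back to DT */
    }
]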
+
+Processing of ACPI tables may be disabled by passing acpi=off on the kernel
+command line; this is the default behavior.
+
+In order for the kernel to load and use ACPI tables, the UEFI implementation
+MUST set the ACPI_20_TABLE_GUID to point to the RSDP table (the table with
+the ACPI signature "RSD PTR "). If this pointer is incorrect and acpi=force
+is used, the kernel will disable ACPI and try to use DT to boot instead; the
+kernel has, in effect, determined that ACPI tables are not present at that
+point.
+
+If the pointer to the RSDP table is correct, the table will be mapped into
+the kernel by the ACPI core, using the address provided by UEFI.
+
+The ACPI core will then locate and map in all other ACPI tables provided by
+using the addresses in the RSDP table to find the XSDT (eXtended System
+Description Table). The XSDT in turn provides the addresses to all other
+ACPI tables provided by the system firmware; the ACPI core will then traverse
+this table and map in the tables listed.
+
+The ACPI core will ignore any provided RSDT (Root System Description Table).
+RSDTs have been deprecated and are ignored on arm64 since they only allow
+for 32-bit addresses.
+
+Further, the ACPI core will only use the 64-bit address fields in the FADT
+(Fixed ACPI Description Table). Any 32-bit address fields in the FADT will
+be ignored on arm64.
+
+Hardware reduced mode (see Section 4.1 of the ACPI 5.1 specification) will
+be enforced by the ACPI core on arm64. Doing so allows the ACPI core to
+run less complex code since it no longer has to provide support for legacy
+hardware from other architectures. Any fields that are not to be used for
+hardware reduced mode must be set to zero.
+
+For the ACPI core to operate properly, and in turn provide the information
+the kernel needs to configure devices, it expects to find the following
+tables (all section numbers refer to the ACPI 5.1 specification):
+
+       -- RSDP (Root System Description Pointer), section 5.2.5
+
+       -- XSDT (eXtended System Description Table), section 5.2.8
+
+       -- FADT (Fixed ACPI Description Table), section 5.2.9
+
+       -- DSDT (Differentiated System Description Table), section
+          5.2.11.1
+
+       -- MADT (Multiple APIC Description Table), section 5.2.12
+
+       -- GTDT (Generic Timer Description Table), section 5.2.24
+
+       -- If PCI is supported, the MCFG (Memory mapped ConFiGuration
+          Table), section 5.2.6, specifically Table 5-31.
+
+If the above tables are not all present, the kernel may or may not be
+able to boot properly since it may not be able to configure all of the
+devices available.
+
+
+ACPI Detection
+--------------
+Drivers should determine their probe() type by checking for a null
+value for ACPI_HANDLE, or checking .of_node, or other information in
+the device structure. This is detailed further in the "Driver
+Recommendations" section.
+
+In non-driver code, if the presence of ACPI needs to be detected at
+runtime, then check the value of acpi_disabled. If CONFIG_ACPI is not
+set, acpi_disabled will always be 1.
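[Editor's note: a minimal sketch of such a runtime check in non-driver
code, using nothing beyond the acpi_disabled flag described above:

    #include <linux/acpi.h>

    static bool booted_with_acpi(void)
    {
            /* acpi_disabled is 0 only when ACPI tables were found,
             * mapped, and enabled during early boot */
            return !acpi_disabled;
    }
]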
+
+
+Device Enumeration
+------------------
+Device descriptions in ACPI should use standard recognized ACPI interfaces.
+These may contain less information than is typically provided via a Device
+Tree description for the same device. This is also one of the reasons that
+ACPI can be useful -- the driver takes into account that it may have less
+detailed information about the device and uses sensible defaults instead.
+If done properly in the driver, the hardware can change and improve over
+time without the driver having to change at all.
+
+Clocks provide an excellent example. In DT, clocks need to be specified
+and the drivers need to take them into account. In ACPI, the assumption
+is that UEFI will leave the device in a reasonable default state, including
+any clock settings. If for some reason the driver needs to change a clock
+value, this can be done in an ACPI method; all the driver needs to do is
+invoke the method and not concern itself with what the method needs to do
+to change the clock. Changing the hardware can then take place over time
+by changing what the ACPI method does, and not the driver.
+
+In DT, the parameters needed by the driver to set up clocks as in the example
+above are known as "bindings"; in ACPI, these are known as "Device Properties"
+and provided to a driver via the _DSD object.
+
+ACPI tables are described with a formal language called ASL, the ACPI
+Source Language (section 19 of the specification). This means that there
+are always multiple ways to describe the same thing -- including device
+properties. For example, device properties could use an ASL construct
+that looks like this: Name(KEY0, "value0"). An ACPI device driver would
+then retrieve the value of the property by evaluating the KEY0 object.
+However, using Name() this way has multiple problems: (1) ACPI limits
+names ("KEY0") to four characters unlike DT; (2) there is no industry-wide
+registry that maintains a list of names, minimizing re-use; (3) there is
+also no registry for the definition of property values ("value0"), again
+making re-use difficult; and (4) how does one maintain backward
+compatibility as new hardware comes out? The _DSD method was created
+to solve precisely these sorts of problems; Linux drivers should ALWAYS
+use the _DSD method for device properties and nothing else.
+
+The _DSM object (ACPI Section 9.14.1) could also be used for conveying
+device properties to a driver. Linux drivers should only expect it to
+be used if _DSD cannot represent the data required, and there is no way
+to create a new UUID for the _DSD object. Note that there is even less
+regulation of the use of _DSM than there is of _DSD. Drivers that depend
+on the contents of _DSM objects will be more difficult to maintain over
+time because of this; as of this writing, the use of _DSM is the cause
+of quite a few firmware problems and is not recommended.
+
+Drivers should look for device properties in the _DSD object ONLY; the _DSD
+object is described in the ACPI specification section 6.2.5, but this only
+describes how to define the structure of an object returned via _DSD, and
+how specific data structures are defined by specific UUIDs. Linux should
+only use the _DSD Device Properties UUID [5]:
+
+       -- UUID: daffd814-6eba-4d8c-8a91-bc9bbf4aa301
+
+       -- http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf
+
+The UEFI Forum provides a mechanism for registering device properties [4]
+so that they may be used across all operating systems supporting ACPI.
+Device properties that have not been registered with the UEFI Forum should
+not be used.
+
+Before creating new device properties, check to be sure that they have not
+been defined before and either registered in the Linux kernel documentation
+as DT bindings, or the UEFI Forum as device properties.
+While we do not want to simply move all DT bindings into ACPI device
+properties, we can learn from what has been previously defined.
+
+If it is necessary to define a new device property, or if it makes sense to
+synthesize the definition of a binding so it can be used in any firmware,
+both DT bindings and ACPI device properties for device drivers have review
+processes. Use them both. When the driver itself is submitted for review
+to the Linux mailing lists, the device property definitions needed must be
+submitted at the same time. A driver that supports ACPI and uses device
+properties will not be considered complete without their definitions. Once
+the device property has been accepted by the Linux community, it must be
+registered with the UEFI Forum [4], which will review it again for consistency
+within the registry. This may require iteration. The UEFI Forum, though,
+will always be the canonical site for device property definitions.
+
+It may make sense to provide notice to the UEFI Forum that there is the
+intent to register a previously unused device property name as a means of
+reserving the name for later use. Other operating system vendors will
+also be submitting registration requests and this may help smooth the
+process.
+
+Once registration and review have been completed, the kernel provides an
+interface for looking up device properties in a manner independent of
+whether DT or ACPI is being used. This API should be used [6]; it can
+eliminate some duplication of code paths in driver probing functions and
+discourage divergence between DT bindings and ACPI device properties.
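[Editor's note: a minimal sketch of that unified lookup, using the
interface from include/linux/property.h; the property name
"vendor,fifo-depth" is purely illustrative:

    #include <linux/device.h>
    #include <linux/property.h>

    static int example_get_fifo_depth(struct device *dev, u32 *depth)
    {
            /* Resolves from _DSD on ACPI systems, and from the DT node
             * otherwise; "vendor,fifo-depth" is a made-up property name. */
            return device_property_read_u32(dev, "vendor,fifo-depth", depth);
    }
]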
+
+
+Programmable Power Control Resources
+------------------------------------
+Programmable power control resources include such resources as voltage/current
+providers (regulators) and clock sources.
+
+With ACPI, the kernel clock and regulator framework is not expected to be used
+at all.
+
+The kernel assumes that power control of these resources is represented with
+Power Resource Objects (ACPI section 7.1). The ACPI core will then handle
+correctly enabling and disabling resources as they are needed. In order to
+get that to work, ACPI assumes each device has defined D-states and that these
+can be controlled through the optional ACPI methods _PS0, _PS1, _PS2, and _PS3;
+in ACPI, _PS0 is the method to invoke to turn a device full on, and _PS3 is for
+turning a device full off.
+
+There are two options for using those Power Resources. They can:
+
+       -- be managed in a _PSx method which gets called on entry to power
+          state Dx.
+
+       -- be declared separately as power resources with their own _ON and
+          _OFF methods. They are then tied back to D-states for a particular
+          device via _PRx which specifies which power resources a device
+          needs to be on while in Dx. The kernel then tracks the number of
+          devices using a power resource and calls _ON/_OFF as needed.
+
+The kernel ACPI code will also assume that the _PSx methods follow the normal
+ACPI rules for such methods:
+
+       -- If either _PS0 or _PS3 is implemented, then the other method must
+          also be implemented.
+
+       -- If a device requires usage or setup of a power resource when on,
+          the ASL should arrange for it to be allocated/enabled by the _PS0
+          method.
+
+       -- Resources allocated or enabled in the _PS0 method should be
+          disabled or de-allocated in the _PS3 method.
+
+       -- Firmware will leave the resources in a reasonable state before
+          handing over control to the kernel.
+
+Such code in _PSx methods will of course be very platform specific. But,
+this allows the driver to abstract out the interface for operating the device
+and avoid having to read special non-standard values from ACPI tables. Further,
+abstracting the use of these resources allows the hardware to change over time
+without requiring updates to the driver.
+
+
+Clocks
+------
+ACPI makes the assumption that clocks are initialized by the firmware --
+UEFI, in this case -- to some working value before control is handed over
+to the kernel. This has implications for devices such as UARTs, or SoC-driven
+LCD displays, for example.
+
+When the kernel boots, the clocks are assumed to be set to reasonable
+working values. If for some reason the frequency needs to change -- e.g.,
+throttling for power management -- the device driver should expect that
+process to be abstracted out into some ACPI method that can be invoked
+(please see the ACPI specification for further recommendations on standard
+methods to be expected). The only exceptions to this are CPU clocks where
+CPPC provides a much richer interface than ACPI methods. If the clocks
+are not set, there is no direct way for Linux to control them.
+
+If an SoC vendor wants to provide fine-grained control of the system clocks,
+they could do so by providing ACPI methods that could be invoked by Linux
+drivers. However, this is NOT recommended and Linux drivers should NOT use
+such methods, even if they are provided. Such methods are not currently
+standardized in the ACPI specification, and using them could tie a kernel
+to a very specific SoC, or tie an SoC to a very specific version of the
+kernel, both of which we are trying to avoid.
+
+
+Driver Recommendations
+----------------------
+DO NOT remove any DT handling when adding ACPI support for a driver. The
+same device may be used on many different systems.
+
+DO try to structure the driver so that it is data-driven. That is, set up
+a struct containing internal per-device state based on defaults and whatever
+else must be discovered by the driver probe function. Then, have the rest
+of the driver operate off of the contents of that struct. Doing so should
+allow most divergence between ACPI and DT functionality to be kept local to
+the probe function instead of being scattered throughout the driver. For
+example:
+
+static int device_probe_dt(struct platform_device *pdev)
+{
+       /* DT specific functionality */
+       ...
+}
+
+static int device_probe_acpi(struct platform_device *pdev)
+{
+       /* ACPI specific functionality */
+       ...
+}
+
+static int device_probe(struct platform_device *pdev)
+{
+       ...
+       struct device_node *node = pdev->dev.of_node;
+       ...
+
+       if (node)
+               ret = device_probe_dt(pdev);
+       else if (ACPI_HANDLE(&pdev->dev))
+               ret = device_probe_acpi(pdev);
+       else
+               /* other initialization */
+               ...
+       /* Continue with any generic probe operations */
+       ...
+}
+
+DO keep the MODULE_DEVICE_TABLE entries together in the driver to make it
+clear the different names the driver is probed for, both from DT and from
+ACPI:
+
+static struct of_device_id virtio_mmio_match[] = {
+       { .compatible = "virtio,mmio", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, virtio_mmio_match);
+
+static const struct acpi_device_id virtio_mmio_acpi_match[] = {
+       { "LNRO0005", },
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match);
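[Editor's note: a brief sketch of how the two match tables above are
typically wired into a single platform driver, so that one module binds
under either firmware interface; field names are those of struct
device_driver in the mainline kernel:

    static struct platform_driver virtio_mmio_driver = {
            .probe  = device_probe,
            .driver = {
                    .name             = "virtio-mmio",
                    .of_match_table   = virtio_mmio_match,
                    .acpi_match_table = ACPI_PTR(virtio_mmio_acpi_match),
            },
    };
    module_platform_driver(virtio_mmio_driver);
]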
+
+
+ASWG
+----
+The ACPI specification changes regularly. During the year 2014, for instance,
+version 5.1 was released and version 6.0 substantially completed, with most of
+the changes being driven by ARM-specific requirements. Proposed changes are
+presented and discussed in the ASWG (ACPI Specification Working Group) which
+is a part of the UEFI Forum.
+
+Participation in this group is open to all UEFI members. Please see
+http://www.uefi.org/workinggroup for details on group membership.
+
+It is the intent of the ARMv8 ACPI kernel code to follow the ACPI specification
+as closely as possible, and to only implement functionality that complies with
+the released standards from UEFI ASWG. As a practical matter, there will be
+vendors that provide bad ACPI tables or violate the standards in some way.
+If this is because of errors, quirks and fixups may be necessary, but will
+be avoided if possible. If there are features missing from ACPI that preclude
+it from being used on a platform, ECRs (Engineering Change Requests) should be
+submitted to ASWG and go through the normal approval process; for those that
+are not UEFI members, many other members of the Linux community are and would
+likely be willing to assist in submitting ECRs.
+
+
+Linux Code
+----------
+Individual items specific to Linux on ARM, contained in the Linux
+source code, are in the list that follows:
+
+ACPI_OS_NAME           This macro defines the string to be returned when
+                       an ACPI method invokes the _OS method. On ARM64
+                       systems, this macro will be "Linux" by default.
+                       The command line parameter acpi_os=<string>
+                       can be used to set it to some other value. The
+                       default value for other architectures is "Microsoft
+                       Windows NT", for example.
+
+ACPI Objects
+------------
+Detailed expectations for ACPI tables and objects are listed in the file
+Documentation/arm64/acpi_object_usage.txt.
+
+
+References
+----------
+[0] http://silver.arm.com -- document ARM-DEN-0029, or newer
+    "Server Base System Architecture", version 2.3, dated 27 Mar 2014
+
+[1] http://infocenter.arm.com/help/topic/com.arm.doc.den0044a/Server_Base_Boot_Requirements.pdf
+    Document ARM-DEN-0044A, or newer: "Server Base Boot Requirements, System
+    Software on ARM Platforms", dated 16 Aug 2014
+
+[2] http://www.secretlab.ca/archives/151, 10 Jan 2015, Copyright (c) 2015,
+    Linaro Ltd., written by Grant Likely. A copy of the verbatim text (apart
+    from formatting) is also in Documentation/arm64/why_use_acpi.txt.
+
+[3] AMD ACPI for Seattle platform documentation:
+    http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/Seattle_ACPI_Guide.pdf
+
+[4] http://www.uefi.org/acpi -- please see the link for the "ACPI _DSD Device
+    Property Registry Instructions"
+
+[5] http://www.uefi.org/acpi -- please see the link for the "_DSD (Device
+    Specific Data) Implementation Guide"
+
+[6] Kernel code for the unified device property interface can be found in
+    include/linux/property.h and drivers/base/property.c.
+
+
+Authors
+-------
+Al Stone <al.stone@linaro.org>
+Graeme Gregory <graeme.gregory@linaro.org>
+Hanjun Guo <hanjun.guo@linaro.org>
+
+Grant Likely <grant.likely@linaro.org>, for the "Why ACPI on ARM?" section
diff --git a/Documentation/devicetree/bindings/arc/pct.txt b/Documentation/devicetree/bindings/arc/pct.txt
new file mode 100644
index 000000000000..7b9588444f20
--- /dev/null
+++ b/Documentation/devicetree/bindings/arc/pct.txt
@@ -0,0 +1,20 @@
+* ARC Performance Counters
+
+The ARC700 can be configured with a pipeline performance monitor for counting
+CPU and cache events like cache misses and hits.
+Like a conventional PCT, there are 100+ hardware conditions dynamically
+mapped onto up to 32 counters.
+
+Note that:
+ * The ARC 700 PCT does not support interrupts; although HW events may be
+   counted, the HW events themselves cannot serve as a trigger for a sample.
+
+Required properties:
+
+- compatible : should contain
+       "snps,arc700-pct"
+
+Example:
+
+pmu {
+        compatible = "snps,arc700-pct";
+};
diff --git a/Documentation/devicetree/bindings/arc/pmu.txt b/Documentation/devicetree/bindings/arc/pmu.txt
deleted file mode 100644
index 49d517340de3..000000000000
--- a/Documentation/devicetree/bindings/arc/pmu.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-* ARC Performance Monitor Unit
-
-The ARC 700 can be configured with a pipeline performance monitor for counting
-CPU and cache events like cache misses and hits.
-
-Note that:
- * ARC 700 refers to a family of ARC processor cores;
-   - There is only one type of PMU available for the whole family;
-   - The PMU may support different sets of events; supported events are probed
-     at boot time, as required by the reference manual.
-
- * The ARC 700 PMU does not support interrupts; although HW events may be
-   counted, the HW events themselves cannot serve as a trigger for a sample.
-
-Required properties:
-
-- compatible : should contain
-       "snps,arc700-pmu"
-
-Example:
-
-pmu {
-        compatible = "snps,arc700-pmu";
-};
diff --git a/Documentation/devicetree/bindings/arm/altera.txt b/Documentation/devicetree/bindings/arm/altera.txt
new file mode 100644
index 000000000000..558735aacca8
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/altera.txt
@@ -0,0 +1,14 @@
+Altera's SoCFPGA platform device tree bindings
+---------------------------------------------
+
+Boards with Cyclone 5 SoC:
+Required root node properties:
+compatible = "altr,socfpga-cyclone5", "altr,socfpga";
+
+Boards with Arria 5 SoC:
+Required root node properties:
+compatible = "altr,socfpga-arria5", "altr,socfpga";
+
+Boards with Arria 10 SoC:
+Required root node properties:
+compatible = "altr,socfpga-arria10", "altr,socfpga";
diff --git a/Documentation/devicetree/bindings/arm/arch_timer.txt b/Documentation/devicetree/bindings/arm/arch_timer.txt
index 256b4d8bab7b..e774128935d5 100644
--- a/Documentation/devicetree/bindings/arm/arch_timer.txt
+++ b/Documentation/devicetree/bindings/arm/arch_timer.txt
@@ -17,7 +17,10 @@ to deliver its interrupts via SPIs.
 - interrupts : Interrupt list for secure, non-secure, virtual and
   hypervisor timers, in that order.
 
-- clock-frequency : The frequency of the main counter, in Hz. Optional.
+- clock-frequency : The frequency of the main counter, in Hz. Should be present
+  only where necessary to work around broken firmware which does not configure
+  CNTFRQ on all CPUs to a uniform correct value. Use of this property is
+  strongly discouraged; fix your firmware unless absolutely impossible.
 
 - always-on : a boolean property. If present, the timer is powered through an
   always-on power domain, therefore it never loses context.
@@ -46,7 +49,8 @@ Example:
 
 - compatible : Should at least contain "arm,armv7-timer-mem".
 
-- clock-frequency : The frequency of the main counter, in Hz. Optional.
+- clock-frequency : The frequency of the main counter, in Hz. Should be present
+  only when firmware has not configured the MMIO CNTFRQ registers.
 
 - reg : The control frame base address.
diff --git a/Documentation/devicetree/bindings/arm/msm/timer.txt b/Documentation/devicetree/bindings/arm/msm/timer.txt index 74607b6c1117..5e10c345548f 100644 --- a/Documentation/devicetree/bindings/arm/msm/timer.txt +++ b/Documentation/devicetree/bindings/arm/msm/timer.txt @@ -9,11 +9,17 @@ Properties: "qcom,scss-timer" - scorpion subsystem - interrupts : Interrupts for the debug timer, the first general purpose - timer, and optionally a second general purpose timer in that - order. + timer, and optionally a second general purpose timer, and + optionally as well, 2 watchdog interrupts, in that order. - reg : Specifies the base address of the timer registers. +- clocks: Reference to the parent clocks, one per output clock. The parents + must appear in the same order as the clock names. + +- clock-names: The name of the clocks as free-form strings. They should be in + the same order as the clocks. + - clock-frequency : The frequency of the debug timer and the general purpose timer(s) in Hz in that order. @@ -29,9 +35,13 @@ Example: compatible = "qcom,scss-timer", "qcom,msm-timer"; interrupts = <1 1 0x301>, <1 2 0x301>, - <1 3 0x301>; + <1 3 0x301>, + <1 4 0x301>, + <1 5 0x301>; reg = <0x0200a000 0x100>; clock-frequency = <19200000>, <32768>; + clocks = <&sleep_clk>; + clock-names = "sleep"; cpu-offset = <0x40000>; }; diff --git a/Documentation/devicetree/bindings/common-properties.txt b/Documentation/devicetree/bindings/common-properties.txt new file mode 100644 index 000000000000..3193979b1d05 --- /dev/null +++ b/Documentation/devicetree/bindings/common-properties.txt @@ -0,0 +1,60 @@ +Common properties + +The ePAPR specification does not define any properties related to hardware +byteswapping, but endianness issues show up frequently in porting Linux to +different machine types. This document attempts to provide a consistent +way of handling byteswapping across drivers. + +Optional properties: + - big-endian: Boolean; force big endian register accesses + unconditionally (e.g. ioread32be/iowrite32be). Use this if you + know the peripheral always needs to be accessed in BE mode. + - little-endian: Boolean; force little endian register accesses + unconditionally (e.g. readl/writel). Use this if you know the + peripheral always needs to be accessed in LE mode. + - native-endian: Boolean; always use register accesses matched to the + endianness of the kernel binary (e.g. LE vmlinux -> readl/writel, + BE vmlinux -> ioread32be/iowrite32be). In this case no byteswaps + will ever be performed. Use this if the hardware "self-adjusts" + register endianness based on the CPU's configured endianness. + +If a binding supports these properties, then the binding should also +specify the default behavior if none of these properties are present. +In such cases, little-endian is the preferred default, but it is not +a requirement. The of_device_is_big_endian() and of_fdt_is_big_endian() +helper functions do assume that little-endian is the default, because +most existing (PCI-based) drivers implicitly default to LE by using +readl/writel for MMIO accesses. + +Examples: +Scenario 1 : CPU in LE mode & device in LE mode. +dev: dev@40031000 { + compatible = "name"; + reg = <0x40031000 0x1000>; + ... + native-endian; +}; + +Scenario 2 : CPU in LE mode & device in BE mode. +dev: dev@40031000 { + compatible = "name"; + reg = <0x40031000 0x1000>; + ... + big-endian; +}; + +Scenario 3 : CPU in BE mode & device in BE mode. +dev: dev@40031000 { + compatible = "name"; + reg = <0x40031000 0x1000>; + ... 
+	native-endian;
+};
+
+Scenario 4 : CPU in BE mode & device in LE mode.
+dev: dev@40031000 {
+	compatible = "name";
+	reg = <0x40031000 0x1000>;
+	...
+	little-endian;
+};
diff --git a/Documentation/devicetree/bindings/cris/axis.txt b/Documentation/devicetree/bindings/cris/axis.txt
new file mode 100644
index 000000000000..d209ca2a47c0
--- /dev/null
+++ b/Documentation/devicetree/bindings/cris/axis.txt
@@ -0,0 +1,9 @@
+Axis Communications AB
+ARTPEC series SoC Device Tree Bindings
+
+
+CRISv32 based SoCs are ETRAX FS and ARTPEC-3:
+
+ - compatible = "axis,crisv32";
+
+
diff --git a/Documentation/devicetree/bindings/cris/boards.txt b/Documentation/devicetree/bindings/cris/boards.txt
new file mode 100644
index 000000000000..533dd273ccf7
--- /dev/null
+++ b/Documentation/devicetree/bindings/cris/boards.txt
@@ -0,0 +1,8 @@
+Boards based on the CRIS SoCs:
+
+Required root node properties:
+ - compatible = should be one or more of the following:
+	- "axis,dev88" - for Axis devboard 88 with ETRAX FS
+
+Optional:
+
diff --git a/Documentation/devicetree/bindings/cris/interrupts.txt b/Documentation/devicetree/bindings/cris/interrupts.txt
new file mode 100644
index 000000000000..e8b123b0a5e6
--- /dev/null
+++ b/Documentation/devicetree/bindings/cris/interrupts.txt
@@ -0,0 +1,23 @@
+* CRISv32 Interrupt Controller
+
+Interrupt controller for the CRISv32 SoCs.
+
+Main node required properties:
+
+- compatible : should be:
+	"axis,crisv32-intc"
+- interrupt-controller : Identifies the node as an interrupt controller
+- #interrupt-cells : Specifies the number of cells needed to encode an
+  interrupt source. The type shall be a <u32> and the value shall be 1.
+- reg: physical base address and size of the intc registers map.
+
+Example:
+
+	intc: interrupt-controller {
+		compatible = "axis,crisv32-intc";
+		reg = <0xb001c000 0x1000>;
+		interrupt-controller;
+		#interrupt-cells = <1>;
+	};
+
+
diff --git a/Documentation/devicetree/bindings/dma/apm-xgene-dma.txt b/Documentation/devicetree/bindings/dma/apm-xgene-dma.txt
new file mode 100644
index 000000000000..d3058768b23d
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/apm-xgene-dma.txt
@@ -0,0 +1,47 @@
+Applied Micro X-Gene SoC DMA nodes
+
+DMA nodes are defined to describe on-chip DMA interfaces in
+APM X-Gene SoC.
+
+Required properties for DMA interfaces:
+- compatible: Should be "apm,xgene-dma".
+- device_type: set to "dma".
+- reg: Address and length of the register set for the device.
+  It contains the information of registers in the following order:
+  1st - DMA control and status register address space.
+  2nd - Descriptor ring control and status register address space.
+  3rd - Descriptor ring command register address space.
+  4th - SoC efuse register address space.
+- interrupts: DMA has 5 interrupt sources. The 1st interrupt is the
+  DMA error reporting interrupt. The 2nd, 3rd, 4th and 5th interrupts
+  are completion interrupts, one for each DMA channel.
+- clocks: Reference to the clock entry.
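As an aside, the reg/interrupts ordering just listed maps naturally onto a platform driver probe. The following minimal sketch uses hypothetical names (xgene_dma_probe(), the index order mirrors the binding) and is not the actual drivers/dma probe code.

#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

/* Hypothetical probe: indices follow the reg/interrupts order above. */
static int xgene_dma_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *csr;
	int i, irq[5];

	/* 1st reg entry: DMA control and status registers */
	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	csr = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(csr))
		return PTR_ERR(csr);
	/* entries 1..3 (ring csr, ring cmd, SoC efuse) follow the same pattern */

	/* interrupt 0 is the error interrupt, 1..4 are channel completions */
	for (i = 0; i < 5; i++) {
		irq[i] = platform_get_irq(pdev, i);
		if (irq[i] < 0)
			return irq[i];
	}

	return 0;
}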
+
+Optional properties:
+- dma-coherent : Present if dma operations are coherent
+
+Example:
+	dmaclk: dmaclk@1f27c000 {
+		compatible = "apm,xgene-device-clock";
+		#clock-cells = <1>;
+		clocks = <&socplldiv2 0>;
+		reg = <0x0 0x1f27c000 0x0 0x1000>;
+		reg-names = "csr-reg";
+		clock-output-names = "dmaclk";
+	};
+
+	dma: dma@1f270000 {
+		compatible = "apm,xgene-storm-dma";
+		device_type = "dma";
+		reg = <0x0 0x1f270000 0x0 0x10000>,
+		      <0x0 0x1f200000 0x0 0x10000>,
+		      <0x0 0x1b008000 0x0 0x2000>,
+		      <0x0 0x1054a000 0x0 0x100>;
+		interrupts = <0x0 0x82 0x4>,
+			     <0x0 0xb8 0x4>,
+			     <0x0 0xb9 0x4>,
+			     <0x0 0xba 0x4>,
+			     <0x0 0xbb 0x4>;
+		dma-coherent;
+		clocks = <&dmaclk 0>;
+	};
diff --git a/Documentation/devicetree/bindings/dma/jz4780-dma.txt b/Documentation/devicetree/bindings/dma/jz4780-dma.txt
new file mode 100644
index 000000000000..f25feee62b15
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/jz4780-dma.txt
@@ -0,0 +1,56 @@
+* Ingenic JZ4780 DMA Controller
+
+Required properties:
+
+- compatible: Should be "ingenic,jz4780-dma"
+- reg: Should contain the DMA controller registers location and length.
+- interrupts: Should contain the interrupt specifier of the DMA controller.
+- interrupt-parent: Should be the phandle of the interrupt controller that services the DMA controller's interrupt.
+- clocks: Should contain a clock specifier for the JZ4780 PDMA clock.
+- #dma-cells: Must be <2>. Number of integer cells in the dmas property of
+  DMA clients (see below).
+
+Optional properties:
+
+- ingenic,reserved-channels: Bitmask of channels to reserve for devices that
+  need a specific channel. These channels will only be assigned when explicitly
+  requested by a client. The primary use for this is channels 0 and 1, which
+  can be configured to have special behaviour for NAND/BCH when using
+  programmable firmware.
+
+Example:
+
+dma: dma@13420000 {
+	compatible = "ingenic,jz4780-dma";
+	reg = <0x13420000 0x10000>;
+
+	interrupt-parent = <&intc>;
+	interrupts = <10>;
+
+	clocks = <&cgu JZ4780_CLK_PDMA>;
+
+	#dma-cells = <2>;
+
+	ingenic,reserved-channels = <0x3>;
+};
+
+DMA clients must use the format described in dma.txt, giving a phandle to the
+DMA controller plus the following 2 integer cells:
+
+1. Request type: The DMA request type for transfers to/from the device on
+   the allocated channel, as defined in the SoC documentation.
+
+2. Channel: If set to 0xffffffff, any available channel will be allocated for
+   the client. Otherwise, the exact channel specified will be used. The channel
+   should be reserved on the DMA controller using the ingenic,reserved-channels
+   property.
+
+Example:
+
+uart0: serial@10030000 {
+	...
+	dmas = <&dma 0x14 0xffffffff
+		&dma 0x15 0xffffffff>;
+	dma-names = "tx", "rx";
+	...
+}; diff --git a/Documentation/devicetree/bindings/dma/qcom_bam_dma.txt b/Documentation/devicetree/bindings/dma/qcom_bam_dma.txt index f8c3311b7153..1c9d48ea4914 100644 --- a/Documentation/devicetree/bindings/dma/qcom_bam_dma.txt +++ b/Documentation/devicetree/bindings/dma/qcom_bam_dma.txt @@ -4,6 +4,7 @@ Required properties: - compatible: must be one of the following: * "qcom,bam-v1.4.0" for MSM8974, APQ8074 and APQ8084 * "qcom,bam-v1.3.0" for APQ8064, IPQ8064 and MSM8960 + * "qcom,bam-v1.7.0" for MSM8916 - reg: Address range for DMA registers - interrupts: Should contain the one interrupt shared by all channels - #dma-cells: must be <1>, the cell in the dmas property of the client device diff --git a/Documentation/devicetree/bindings/dma/rcar-audmapp.txt b/Documentation/devicetree/bindings/dma/rcar-audmapp.txt deleted file mode 100644 index 61bca509d7b9..000000000000 --- a/Documentation/devicetree/bindings/dma/rcar-audmapp.txt +++ /dev/null @@ -1,29 +0,0 @@ -* R-Car Audio DMAC peri peri Device Tree bindings - -Required properties: -- compatible: should be "renesas,rcar-audmapp" -- #dma-cells: should be <1>, see "dmas" property below - -Example: - audmapp: audio-dma-pp@0xec740000 { - compatible = "renesas,rcar-audmapp"; - #dma-cells = <1>; - - reg = <0 0xec740000 0 0x200>; - }; - - -* DMA client - -Required properties: -- dmas: a list of <[DMA multiplexer phandle] [SRS << 8 | DRS]> pairs. - where SRS/DRS are specified in the SoC manual. - It will be written into PDMACHCR as high 16-bit parts. -- dma-names: a list of DMA channel names, one per "dmas" entry - -Example: - - dmas = <&audmapp 0x2d00 - &audmapp 0x3700>; - dma-names = "src0_ssiu0", - "dvc0_ssiu0"; diff --git a/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt new file mode 100644 index 000000000000..040f365954cc --- /dev/null +++ b/Documentation/devicetree/bindings/dma/renesas,usb-dmac.txt @@ -0,0 +1,37 @@ +* Renesas USB DMA Controller Device Tree bindings + +Required Properties: +- compatible: must contain "renesas,usb-dmac" +- reg: base address and length of the registers block for the DMAC +- interrupts: interrupt specifiers for the DMAC, one for each entry in + interrupt-names. +- interrupt-names: one entry per channel, named "ch%u", where %u is the + channel number ranging from zero to the number of channels minus one. +- clocks: a list of phandle + clock-specifier pairs. +- #dma-cells: must be <1>, the cell specifies the channel number of the DMAC + port connected to the DMA client. 
+- dma-channels: number of DMA channels + +Example: R8A7790 (R-Car H2) USB-DMACs + + usb_dmac0: dma-controller@e65a0000 { + compatible = "renesas,usb-dmac"; + reg = <0 0xe65a0000 0 0x100>; + interrupts = <0 109 IRQ_TYPE_LEVEL_HIGH + 0 109 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "ch0", "ch1"; + clocks = <&mstp3_clks R8A7790_CLK_USBDMAC0>; + #dma-cells = <1>; + dma-channels = <2>; + }; + + usb_dmac1: dma-controller@e65b0000 { + compatible = "renesas,usb-dmac"; + reg = <0 0xe65b0000 0 0x100>; + interrupts = <0 110 IRQ_TYPE_LEVEL_HIGH + 0 110 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "ch0", "ch1"; + clocks = <&mstp3_clks R8A7790_CLK_USBDMAC1>; + #dma-cells = <1>; + dma-channels = <2>; + }; diff --git a/Documentation/devicetree/bindings/mtd/m25p80.txt b/Documentation/devicetree/bindings/mtd/m25p80.txt index 4611aa83531b..f20b111b502a 100644 --- a/Documentation/devicetree/bindings/mtd/m25p80.txt +++ b/Documentation/devicetree/bindings/mtd/m25p80.txt @@ -3,10 +3,13 @@ Required properties: - #address-cells, #size-cells : Must be present if the device has sub-nodes representing partitions. -- compatible : Should be the manufacturer and the name of the chip. Bear in mind - the DT binding is not Linux-only, but in case of Linux, see the - "spi_nor_ids" table in drivers/mtd/spi-nor/spi-nor.c for the list - of supported chips. +- compatible : May include a device-specific string consisting of the + manufacturer and name of the chip. Bear in mind the DT binding + is not Linux-only, but in case of Linux, see the "m25p_ids" + table in drivers/mtd/devices/m25p80.c for the list of supported + chips. + Must also include "nor-jedec" for any SPI NOR flash that can be + identified by the JEDEC READ ID opcode (0x9F). - reg : Chip-Select number - spi-max-frequency : Maximum frequency of the SPI bus the chip can operate at @@ -22,7 +25,7 @@ Example: flash: m25p80@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "spansion,m25p80"; + compatible = "spansion,m25p80", "nor-jedec"; reg = <0>; spi-max-frequency = <40000000>; m25p,fast-read; diff --git a/Documentation/devicetree/bindings/mtd/pxa3xx-nand.txt b/Documentation/devicetree/bindings/mtd/pxa3xx-nand.txt index de8b517a5521..4f833e3c4f51 100644 --- a/Documentation/devicetree/bindings/mtd/pxa3xx-nand.txt +++ b/Documentation/devicetree/bindings/mtd/pxa3xx-nand.txt @@ -14,7 +14,7 @@ Optional properties: - marvell,nand-enable-arbiter: Set to enable the bus arbiter - marvell,nand-keep-config: Set to keep the NAND controller config as set by the bootloader - - num-cs: Number of chipselect lines to usw + - num-cs: Number of chipselect lines to use - nand-on-flash-bbt: boolean to enable on flash bbt option if not present false - nand-ecc-strength: number of bits to correct per ECC step diff --git a/Documentation/devicetree/bindings/mtd/sunxi-nand.txt b/Documentation/devicetree/bindings/mtd/sunxi-nand.txt index 0273adb8638c..086d6f44c4b9 100644 --- a/Documentation/devicetree/bindings/mtd/sunxi-nand.txt +++ b/Documentation/devicetree/bindings/mtd/sunxi-nand.txt @@ -21,7 +21,7 @@ Optional properties: - nand-ecc-mode : one of the supported ECC modes ("hw", "hw_syndrome", "soft", "soft_bch" or "none") -see Documentation/devicetree/mtd/nand.txt for generic bindings. +see Documentation/devicetree/bindings/mtd/nand.txt for generic bindings. 
Examples:
diff --git a/Documentation/devicetree/bindings/pwm/imx-pwm.txt b/Documentation/devicetree/bindings/pwm/imx-pwm.txt
index b50d7a6d9d7f..e00c2e9f484d 100644
--- a/Documentation/devicetree/bindings/pwm/imx-pwm.txt
+++ b/Documentation/devicetree/bindings/pwm/imx-pwm.txt
@@ -1,10 +1,17 @@
 Freescale i.MX PWM controller
 
 Required properties:
-- compatible: should be "fsl,<soc>-pwm"
+- compatible : should be "fsl,<soc>-pwm" and one of the following
+  compatible strings:
+  - "fsl,imx1-pwm" for PWM compatible with the one integrated on i.MX1
+  - "fsl,imx27-pwm" for PWM compatible with the one integrated on i.MX27
 - reg: physical base address and length of the controller's registers
 - #pwm-cells: should be 2. See pwm.txt in this directory for a description of
   the cells format.
+- clocks : Clock specifiers for both ipg and per clocks.
+- clock-names : Clock names should include both "ipg" and "per".
+  See the clock consumer binding,
+  Documentation/devicetree/bindings/clock/clock-bindings.txt
 - interrupts: The interrupt for the pwm controller
 
 Example:
@@ -13,5 +20,8 @@ pwm1: pwm@53fb4000 {
 	#pwm-cells = <2>;
 	compatible = "fsl,imx53-pwm", "fsl,imx27-pwm";
 	reg = <0x53fb4000 0x4000>;
+	clocks = <&clks IMX5_CLK_PWM1_IPG_GATE>,
+		 <&clks IMX5_CLK_PWM1_HF_GATE>;
+	clock-names = "ipg", "per";
 	interrupts = <61>;
 };
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index 83737a3403d7..80339192c93e 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -26,6 +26,7 @@ aptina	Aptina Imaging
 arasan	Arasan Chip Systems
 arm	ARM Ltd.
 armadeus	ARMadeus Systems SARL
+artesyn	Artesyn Embedded Technologies Inc.
 asahi-kasei	Asahi Kasei Corp.
 atmel	Atmel Corporation
 auo	AU Optronics Corporation
diff --git a/Documentation/dma-buf-sharing.txt b/Documentation/dma-buf-sharing.txt
index bb9753b635a3..480c8de3c2c4 100644
--- a/Documentation/dma-buf-sharing.txt
+++ b/Documentation/dma-buf-sharing.txt
@@ -49,25 +49,26 @@ The dma_buf buffer sharing API usage contains the following steps:
    The buffer exporter announces its wish to export a buffer. In this, it
    connects its own private buffer data, provides implementation for operations
    that can be performed on the exported dma_buf, and flags for the file
-   associated with this buffer.
+   associated with this buffer. All these fields are filled in struct
+   dma_buf_export_info, defined via the DEFINE_DMA_BUF_EXPORT_INFO macro.
 
    Interface:
-      struct dma_buf *dma_buf_export_named(void *priv, struct dma_buf_ops *ops,
-				     size_t size, int flags,
-				     const char *exp_name)
+      DEFINE_DMA_BUF_EXPORT_INFO(exp_info)
+      struct dma_buf *dma_buf_export(struct dma_buf_export_info *exp_info)
 
-   If this succeeds, dma_buf_export_named allocates a dma_buf structure, and
+   If this succeeds, dma_buf_export allocates a dma_buf structure, and
    returns a pointer to the same. It also associates an anonymous file with this
    buffer, so it can be exported. On failure to allocate the dma_buf object,
    it returns NULL.
 
-   'exp_name' is the name of exporter - to facilitate information while
-   debugging.
+   'exp_name' in struct dma_buf_export_info is the name of the exporter - it
+   aids debugging. It is set to KBUILD_MODNAME by default, so exporters don't
+   have to provide a specific name, if they don't wish to.
+
+   The DEFINE_DMA_BUF_EXPORT_INFO macro defines the struct dma_buf_export_info,
+   zeroes it out and pre-populates exp_name in it.
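Put together, the export interface described above reduces to a few lines in an exporter. A minimal sketch, assuming a hypothetical exporter (my_dmabuf_ops, my_export(), my_buffer); error handling is elided:

#include <linux/dma-buf.h>
#include <linux/fcntl.h>

/* Hypothetical exporter ops; the export call itself is the point here. */
static const struct dma_buf_ops my_dmabuf_ops;

static struct dma_buf *my_export(void *my_buffer, size_t size)
{
	/* Zeroed, with exp_name pre-set to KBUILD_MODNAME */
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);

	exp_info.ops   = &my_dmabuf_ops;
	exp_info.size  = size;
	exp_info.flags = O_CLOEXEC;
	exp_info.priv  = my_buffer;

	return dma_buf_export(&exp_info);
}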
- Exporting modules which do not wish to provide any specific name may use the - helper define 'dma_buf_export()', with the same arguments as above, but - without the last argument; a KBUILD_MODNAME pre-processor directive will be - inserted in place of 'exp_name' instead. 2. Userspace gets a handle to pass around to potential buffer-users diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt index 724043858b08..95c13aa575ff 100644 --- a/Documentation/filesystems/nfs/nfs-rdma.txt +++ b/Documentation/filesystems/nfs/nfs-rdma.txt @@ -187,8 +187,10 @@ Check RDMA and NFS Setup To further test the InfiniBand software stack, use IPoIB (this assumes you have two IB hosts named host1 and host2): - host1$ ifconfig ib0 a.b.c.x - host2$ ifconfig ib0 a.b.c.y + host1$ ip link set dev ib0 up + host1$ ip address add dev ib0 a.b.c.x + host2$ ip link set dev ib0 up + host2$ ip address add dev ib0 a.b.c.y host1$ ping a.b.c.y host2$ ping a.b.c.x @@ -229,7 +231,8 @@ NFS/RDMA Setup $ modprobe ib_mthca $ modprobe ib_ipoib - $ ifconfig ib0 a.b.c.d + $ ip li set dev ib0 up + $ ip addr add dev ib0 a.b.c.d NOTE: use unique addresses for the client and server diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 0bfafe108357..5a5a05582b58 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -228,30 +228,19 @@ default behaviour. Deprecated Mount Options ======================== - delaylog/nodelaylog - Delayed logging is the only logging method that XFS supports - now, so these mount options are now ignored. - - Due for removal in 3.12. - - ihashsize=value - In memory inode hashes have been removed, so this option has - no function as of August 2007. Option is deprecated. - - Due for removal in 3.12. +None at present. - irixsgid - This behaviour is now controlled by a sysctl, so the mount - option is ignored. - Due for removal in 3.12. +Removed Mount Options +===================== - osyncisdsync - osyncisosync - O_SYNC and O_DSYNC are fully supported, so there is no need - for these options any more. + Name Removed + ---- ------- + delaylog/nodelaylog v3.20 + ihashsize v3.20 + irixsgid v3.20 + osyncisdsync/osyncisosync v3.20 - Due for removal in 3.12. sysctls ======= diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 8136e1fd30fd..51f4221657bf 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -321,6 +321,7 @@ Code Seq#(hex) Include File Comments 0xDB 00-0F drivers/char/mwave/mwavepub.h 0xDD 00-3F ZFCP device driver see drivers/s390/scsi/ <mailto:aherrman@de.ibm.com> +0xEC 00-01 drivers/platform/chrome/cros_ec_dev.h ChromeOS EC driver 0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development) <mailto:thomas@winischhofer.net> 0xF4 00-1F video/mbxfb.h mbxfb diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 84960c66c77b..f6befa9855c1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -165,7 +165,7 @@ multipliers 'Kilo', 'Mega', and 'Giga', equalling 2^10, 2^20, and 2^30 bytes respectively. Such letter suffixes can also be entirely omitted. - acpi= [HW,ACPI,X86] + acpi= [HW,ACPI,X86,ARM64] Advanced Configuration and Power Interface Format: { force | off | strict | noirq | rsdt } force -- enable ACPI if default was off @@ -175,6 +175,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 
			strictly ACPI specification compliant.
 			rsdt -- prefer RSDT over (default) XSDT
 			copy_dsdt -- copy DSDT to memory
+			For ARM64, ONLY "acpi=off" or "acpi=force" are available
 
 			See also Documentation/power/runtime_pm.txt, pci=noacpi
 
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index fc04c14de4bb..72a150d8f3df 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -1355,6 +1355,24 @@ Sysfs notes:
 	rfkill controller switch "tpacpi_uwb_sw": refer to
 	Documentation/rfkill.txt for details.
 
+Adaptive keyboard
+-----------------
+
+sysfs device attribute: adaptive_kbd_mode
+
+This sysfs attribute controls the keyboard "face" that will be shown on the
+Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read
+and set.
+
+1 = Home mode
+2 = Web-browser mode
+3 = Web-conference mode
+4 = Function mode
+5 = Layflat mode
+
+For more details about which buttons will appear depending on the mode, please
+review the laptop's user guide:
+http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf
 
 Multiple Commands, Module Parameters
 ------------------------------------
diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt
new file mode 100644
index 000000000000..de1af7db3355
--- /dev/null
+++ b/Documentation/md-cluster.txt
@@ -0,0 +1,176 @@
+The cluster MD is a shared-device RAID for a cluster.
+
+
+1. On-disk format
+
+A separate write-intent bitmap is used for each cluster node.
+The bitmaps record all writes that may have been started on that node,
+and may not yet have finished. The on-disk layout is:
+
+0                    4k                     8k                     12k
+-------------------------------------------------------------------
+| idle                | md super            | bm super [0] + bits  |
+| bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]    |
+| bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits   |
+| bm bits [3, contd]  |                     |                      |
+
+During "normal" functioning we assume the filesystem ensures that only one
+node writes to any given block at a time, so a write
+request will
+ - set the appropriate bit (if not already set)
+ - commit the write to all mirrors
+ - schedule the bit to be cleared after a timeout.
+
+Reads are just handled normally. It is up to the filesystem to
+ensure one node doesn't read from a location where another node (or the same
+node) is writing.
+
+
+2. DLM Locks for management
+
+There are two locks for managing the device:
+
+2.1 Bitmap lock resource (bm_lockres)
+
+ The bm_lockres protects individual node bitmaps. They are named in the
+ form bitmap001 for node 1, bitmap002 for node 2, and so on. When a node
+ joins the cluster, it acquires the lock in PW mode and it stays so
+ during the lifetime the node is part of the cluster. The lock resource
+ number is based on the slot number returned by the DLM subsystem. Since
+ DLM starts node count from one and bitmap slots start from zero, one is
+ subtracted from the DLM slot number to arrive at the bitmap slot number.
+
+3. Communication
+
+Each node has to communicate with other nodes when starting or ending
+resync, and for metadata superblock updates.
+
+3.1 Message Types
+
+ There are three types of messages which are passed:
+
+ 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has been
+   updated, and the node must re-read the md superblock. This is performed
+   synchronously.
+
+ 3.1.2 RESYNC: informs other nodes that a resync is initiated or ended
+   so that each node may suspend or resume the region.
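The message types above suggest an obvious wire format. The following is a purely hypothetical C sketch of such a message; the struct name, field layout, and the additional types (RESYNC_START/RESYNC_FINISHED and NEWDISK, see sections 4 and 5) are illustrative assumptions, not necessarily what drivers/md/md-cluster.c implements.

#include <linux/types.h>

/* Hypothetical sketch only -- not the actual md-cluster wire format. */
enum cluster_msg_type {
	METADATA_UPDATED,	/* re-read the md superblock */
	RESYNC_START,		/* suspend the (low, high) range */
	RESYNC_FINISHED,	/* resume the (low, high) range */
	NEWDISK,		/* a new device is being added (section 5) */
};

struct cluster_msg {
	__le32 type;		/* enum cluster_msg_type */
	__le32 slot;		/* sender's DLM slot number */
	__le64 low;		/* resync range, RESYNC_* only */
	__le64 high;
	char uuid[16];		/* device uuid, NEWDISK only */
};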
+
+3.2 Communication mechanism
+
+ The DLM LVB is used to communicate between nodes of the cluster. There
+ are three resources used for the purpose:
+
+ 3.2.1 Token: The resource which protects the entire communication
+   system. The node having the token resource is allowed to
+   communicate.
+
+ 3.2.2 Message: The lock resource which carries the data to
+   communicate.
+
+ 3.2.3 Ack: The resource which, when acquired, means the message has been
+   acknowledged by all nodes in the cluster. The BAST of the resource
+   is used to inform the receiving node that a node wants to communicate.
+
+The algorithm is:
+
+ 1. receive status
+
+   sender        receiver        receiver
+   ACK:CR        ACK:CR          ACK:CR
+
+ 2. sender get EX of TOKEN
+    sender get EX of MESSAGE
+   sender        receiver        receiver
+   TOKEN:EX      ACK:CR          ACK:CR
+   MESSAGE:EX
+   ACK:CR
+
+    Sender checks that it still needs to send a message. Messages received
+    or other events that happened while waiting for the TOKEN may have made
+    this message inappropriate or redundant.
+
+ 3. sender write LVB.
+    sender down-convert MESSAGE from EX to CR
+    sender try to get EX of ACK
+    [ wait until all receivers have *processed* the MESSAGE ]
+
+    [ triggered by bast of ACK ]
+    receiver get CR of MESSAGE
+    receiver read LVB
+    receiver processes the message
+    [ wait finish ]
+    receiver release ACK
+
+   sender        receiver        receiver
+   TOKEN:EX      MESSAGE:CR      MESSAGE:CR
+   MESSAGE:CR
+   ACK:EX
+
+ 4. triggered by grant of EX on ACK (indicating all receivers have processed
+    message)
+    sender down-convert ACK from EX to CR
+    sender release MESSAGE
+    sender release TOKEN
+    receiver upconvert to EX of MESSAGE
+    receiver get CR of ACK
+    receiver release MESSAGE
+
+   sender        receiver        receiver
+   ACK:CR        ACK:CR          ACK:CR
+
+
+4. Handling Failures
+
+4.1 Node Failure
+ When a node fails, the DLM informs the cluster with the slot number of the
+ failed node. The node starts a cluster recovery thread. The cluster
+ recovery thread:
+	- acquires the bitmap<number> lock of the failed node
+	- opens the bitmap
+	- reads the bitmap of the failed node
+	- copies the set bitmap to the local node
+	- cleans the bitmap of the failed node
+	- releases bitmap<number> lock of the failed node
+	- initiates resync of the bitmap on the current node
+
+ The resync process is the regular md resync. However, in a clustered
+ environment, when a resync is performed, it needs to tell other nodes
+ of the areas which are suspended. Before a resync starts, the node
+ sends out RESYNC_START with the (lo,hi) range of the area which needs
+ to be suspended. Each node maintains a suspend_list, which contains
+ the list of ranges which are currently suspended. On receiving
+ RESYNC_START, the node adds the range to the suspend_list. Similarly,
+ when the node performing resync finishes, it sends RESYNC_FINISHED
+ to other nodes and other nodes remove the corresponding entry from
+ the suspend_list.
+
+ A helper function, should_suspend(), can be used to check if a particular
+ I/O range should be suspended or not.
+
+4.2 Device Failure
+ Device failures are handled and communicated with the metadata update
+ routine.
+
+5. Adding a new Device
+For adding a new device, it is necessary that all nodes "see" the new device
+to be added. For this, the following algorithm is used:
+
+ 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
+    ioctl(ADD_NEW_DISC with disc.state set to MD_DISK_CLUSTER_ADD)
+ 2. Node 1 sends NEWDISK with uuid and slot number
+ 3. Other nodes issue kobject_uevent_env with uuid and slot number
+    (Steps 4,5 could be a udev rule)
+ 4. In userspace, the node searches for the disk, perhaps
+    using blkid -t SUB_UUID=""
+ 5. Other nodes issue either of the following depending on whether the disk
+    was found:
+    ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
+          disc.number set to slot number)
+    ioctl(CLUSTERED_DISK_NACK)
+ 6. Other nodes drop lock on no-new-devs (CR) if device is found
+ 7. Node 1 attempts EX lock on no-new-devs
+ 8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk
+    as SpareLocal
+ 9. If node 1 does not get the lock, it fails the operation and sends
+    METADATA_UPDATED
+ 10. Other nodes learn whether a disk was added or not from the following
+     METADATA_UPDATED message.
diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt
index cbfac0949635..59f4db2a0c85 100644
--- a/Documentation/networking/scaling.txt
+++ b/Documentation/networking/scaling.txt
@@ -282,7 +282,7 @@ following is true:
 
 - The current CPU's queue head counter >= the recorded tail counter
   value in rps_dev_flow[i]
-- The current CPU is unset (equal to RPS_NO_CPU)
+- The current CPU is unset (>= nr_cpu_ids)
 - The current CPU is offline
 
 After this check, the packet is sent to the (possibly updated) current
diff --git a/Documentation/target/tcm_mod_builder.py b/Documentation/target/tcm_mod_builder.py
index 2b47704f75cb..2ba71cea0172 100755
--- a/Documentation/target/tcm_mod_builder.py
+++ b/Documentation/target/tcm_mod_builder.py
@@ -237,8 +237,7 @@ def tcm_mod_build_configfs(proto_ident, fabric_mod_dir_var, fabric_mod_name):
 	buf += "#include \"" + fabric_mod_name + "_base.h\"\n"
 	buf += "#include \"" + fabric_mod_name + "_fabric.h\"\n\n"
 
-	buf += "/* Local pointer to allocated TCM configfs fabric module */\n"
-	buf += "struct target_fabric_configfs *" + fabric_mod_name + "_fabric_configfs;\n\n"
+	buf += "static const struct target_core_fabric_ops " + fabric_mod_name + "_ops;\n\n"
 
 	buf += "static struct se_node_acl *" + fabric_mod_name + "_make_nodeacl(\n"
 	buf += "	struct se_portal_group *se_tpg,\n"
@@ -309,8 +308,8 @@ def tcm_mod_build_configfs(proto_ident, fabric_mod_dir_var, fabric_mod_name):
 	buf += "	}\n"
 	buf += "	tpg->" + fabric_mod_port + " = " + fabric_mod_port + ";\n"
 	buf += "	tpg->" + fabric_mod_port + "_tpgt = tpgt;\n\n"
-	buf += "	ret = core_tpg_register(&" + fabric_mod_name + "_fabric_configfs->tf_ops, wwn,\n"
-	buf += "				&tpg->se_tpg, (void *)tpg,\n"
+	buf += "	ret = core_tpg_register(&" + fabric_mod_name + "_ops, wwn,\n"
+	buf += "				&tpg->se_tpg, tpg,\n"
 	buf += "				TRANSPORT_TPG_TYPE_NORMAL);\n"
 	buf += "	if (ret < 0) {\n"
 	buf += "		kfree(tpg);\n"
@@ -370,7 +369,9 @@ def tcm_mod_build_configfs(proto_ident, fabric_mod_dir_var, fabric_mod_name):
 	buf += "	NULL,\n"
 	buf += "};\n\n"
 
-	buf += "static struct target_core_fabric_ops " + fabric_mod_name + "_ops = {\n"
+	buf += "static const struct target_core_fabric_ops " + fabric_mod_name + "_ops = {\n"
+	buf += "	.module				= THIS_MODULE,\n"
+	buf += "	.name				= \"" + fabric_mod_name + "\",\n"
 	buf += "	.get_fabric_name		= " + fabric_mod_name + "_get_fabric_name,\n"
 	buf += "	.get_fabric_proto_ident		= " + fabric_mod_name + "_get_fabric_proto_ident,\n"
 	buf += "	.tpg_get_wwn			= " + fabric_mod_name + "_get_fabric_wwn,\n"
@@ -413,75 +415,18 @@ def tcm_mod_build_configfs(proto_ident, fabric_mod_dir_var, fabric_mod_name):
 	buf += "	.fabric_drop_np			= NULL,\n"
 	buf += "	.fabric_make_nodeacl		= " + fabric_mod_name + "_make_nodeacl,\n"
 	buf += "	.fabric_drop_nodeacl		= 
" + fabric_mod_name + "_drop_nodeacl,\n" - buf += "};\n\n" - - buf += "static int " + fabric_mod_name + "_register_configfs(void)\n" - buf += "{\n" - buf += " struct target_fabric_configfs *fabric;\n" - buf += " int ret;\n\n" - buf += " printk(KERN_INFO \"" + fabric_mod_name.upper() + " fabric module %s on %s/%s\"\n" - buf += " \" on \"UTS_RELEASE\"\\n\"," + fabric_mod_name.upper() + "_VERSION, utsname()->sysname,\n" - buf += " utsname()->machine);\n" - buf += " /*\n" - buf += " * Register the top level struct config_item_type with TCM core\n" - buf += " */\n" - buf += " fabric = target_fabric_configfs_init(THIS_MODULE, \"" + fabric_mod_name + "\");\n" - buf += " if (IS_ERR(fabric)) {\n" - buf += " printk(KERN_ERR \"target_fabric_configfs_init() failed\\n\");\n" - buf += " return PTR_ERR(fabric);\n" - buf += " }\n" - buf += " /*\n" - buf += " * Setup fabric->tf_ops from our local " + fabric_mod_name + "_ops\n" - buf += " */\n" - buf += " fabric->tf_ops = " + fabric_mod_name + "_ops;\n" - buf += " /*\n" - buf += " * Setup default attribute lists for various fabric->tf_cit_tmpl\n" - buf += " */\n" - buf += " fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = " + fabric_mod_name + "_wwn_attrs;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = NULL;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL;\n" - buf += " fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL;\n" - buf += " /*\n" - buf += " * Register the fabric for use within TCM\n" - buf += " */\n" - buf += " ret = target_fabric_configfs_register(fabric);\n" - buf += " if (ret < 0) {\n" - buf += " printk(KERN_ERR \"target_fabric_configfs_register() failed\"\n" - buf += " \" for " + fabric_mod_name.upper() + "\\n\");\n" - buf += " return ret;\n" - buf += " }\n" - buf += " /*\n" - buf += " * Setup our local pointer to *fabric\n" - buf += " */\n" - buf += " " + fabric_mod_name + "_fabric_configfs = fabric;\n" - buf += " printk(KERN_INFO \"" + fabric_mod_name.upper() + "[0] - Set fabric -> " + fabric_mod_name + "_fabric_configfs\\n\");\n" - buf += " return 0;\n" - buf += "};\n\n" - buf += "static void __exit " + fabric_mod_name + "_deregister_configfs(void)\n" - buf += "{\n" - buf += " if (!" 
 + fabric_mod_name + "_fabric_configfs)\n"
-	buf += "		return;\n\n"
-	buf += "	target_fabric_configfs_deregister(" + fabric_mod_name + "_fabric_configfs);\n"
-	buf += "	" + fabric_mod_name + "_fabric_configfs = NULL;\n"
-	buf += "	printk(KERN_INFO \"" + fabric_mod_name.upper() + "[0] - Cleared " + fabric_mod_name + "_fabric_configfs\\n\");\n"
+	buf += "\n"
+	buf += "	.tfc_wwn_attrs			= " + fabric_mod_name + "_wwn_attrs,\n"
 	buf += "};\n\n"
 
 	buf += "static int __init " + fabric_mod_name + "_init(void)\n"
 	buf += "{\n"
-	buf += "	int ret;\n\n"
-	buf += "	ret = " + fabric_mod_name + "_register_configfs();\n"
-	buf += "	if (ret < 0)\n"
-	buf += "		return ret;\n\n"
-	buf += "	return 0;\n"
+	buf += "	return target_register_template(&" + fabric_mod_name + "_ops);\n"
 	buf += "};\n\n"
+
 	buf += "static void __exit " + fabric_mod_name + "_exit(void)\n"
 	buf += "{\n"
-	buf += "	" + fabric_mod_name + "_deregister_configfs();\n"
+	buf += "	target_unregister_template(&" + fabric_mod_name + "_ops);\n"
 	buf += "};\n\n"
 
 	buf += "MODULE_DESCRIPTION(\"" + fabric_mod_name.upper() + " series fabric driver\");\n"
diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.txt
index 5518465290bf..43e94ea6d2ca 100644
--- a/Documentation/target/tcmu-design.txt
+++ b/Documentation/target/tcmu-design.txt
@@ -138,27 +138,40 @@ signals the kernel via a 4-byte write(). When cmd_head equals cmd_tail,
 the ring is empty -- no commands are currently waiting to be processed by
 userspace.
 
-TCMU commands start with a common header containing "len_op", a 32-bit
-value that stores the length, as well as the opcode in the lowest
-unused bits. Currently only two opcodes are defined, TCMU_OP_PAD and
-TCMU_OP_CMD. When userspace encounters a command with PAD opcode, it
-should skip ahead by the bytes in "length". (The kernel inserts PAD
-entries to ensure each CMD entry fits contigously into the circular
-buffer.)
-
-When userspace handles a CMD, it finds the SCSI CDB (Command Data
-Block) via tcmu_cmd_entry.req.cdb_off. This is an offset from the
-start of the overall shared memory region, not the entry. The data
-in/out buffers are accessible via tht req.iov[] array. Note that
-each iov.iov_base is also an offset from the start of the region.
-
-TCMU currently does not support BIDI operations.
+TCMU commands are 8-byte aligned. They start with a common header
+containing "len_op", a 32-bit value that stores the length, as well as
+the opcode in the lowest unused bits. It also contains cmd_id and
+flags fields for setting by the kernel (kflags) and userspace
+(uflags).
+
+Currently only two opcodes are defined, TCMU_OP_CMD and TCMU_OP_PAD.
+
+When the opcode is CMD, the entry in the command ring is a struct
+tcmu_cmd_entry. Userspace finds the SCSI CDB (Command Data Block) via
+tcmu_cmd_entry.req.cdb_off. This is an offset from the start of the
+overall shared memory region, not the entry. The data in/out buffers
+are accessible via the req.iov[] array. iov_cnt contains the number of
+entries in iov[] needed to describe either the Data-In or Data-Out
+buffers. For bidirectional commands, iov_cnt specifies how many iovec
+entries cover the Data-Out area, and iov_bidi_count specifies how many
+iovec entries immediately after that in iov[] cover the Data-In
+area. Just like other fields, iov.iov_base is an offset from the start
+of the region.
 
 When completing a command, userspace sets rsp.scsi_status, and
 rsp.sense_buffer if necessary. Userspace then increments
 mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the
 kernel via the UIO method, a 4-byte write to the file descriptor.
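The command-ring rules above amount to a short loop on the userspace side. A minimal sketch, assuming mb points at the mapped mailbox, map at the start of the shared region, and a hypothetical process_cdb() handler; the struct layouts and the tcmu_hdr_get_op()/tcmu_hdr_get_len() helpers are taken to come from the target_core_user.h UAPI header:

#include <stdint.h>
#include <unistd.h>
#include <linux/target_core_user.h>

uint8_t process_cdb(void *cdb, struct tcmu_cmd_entry *ent); /* hypothetical */

static void handle_ring(struct tcmu_mailbox *mb, char *map, int uio_fd)
{
	uint32_t buf = 0;

	while (mb->cmd_tail != mb->cmd_head) {
		struct tcmu_cmd_entry *ent =
			(void *)(map + mb->cmdr_off + mb->cmd_tail);

		if (tcmu_hdr_get_op(ent->hdr.len_op) == TCMU_OP_CMD) {
			/* cdb_off is relative to the whole region, not the entry */
			ent->rsp.scsi_status =
				process_cdb(map + ent->req.cdb_off, ent);
		} else if (tcmu_hdr_get_op(ent->hdr.len_op) != TCMU_OP_PAD) {
			/* unknown opcode: flag it and keep going */
			ent->hdr.uflags |= TCMU_UFLAG_UNKNOWN_OP;
		}

		mb->cmd_tail = (mb->cmd_tail +
				tcmu_hdr_get_len(ent->hdr.len_op)) % mb->cmdr_size;
	}

	/* signal the kernel via the UIO method: a 4-byte write */
	write(uio_fd, &buf, 4);
}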
 
+When the opcode is PAD, userspace only updates cmd_tail as above --
+it's a no-op. (The kernel inserts PAD entries to ensure each CMD entry
+is contiguous within the command ring.)
+
+More opcodes may be added in the future. If userspace encounters an
+opcode it does not handle, it must set the UNKNOWN_OP bit (bit 0) in
+hdr.uflags, update cmd_tail, and proceed with processing additional
+commands, if any.
+
 The Data Area:
 
 This is shared-memory space after the command ring. The organization
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index bc9f6fe44e27..9fa2bf8c3f6f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3573,3 +3573,20 @@ struct {
 @ar   - access register number
 
 KVM handlers should exit to userspace with rc = -EREMOTE.
+
+
+8. Other capabilities.
+----------------------
+
+This section lists capabilities that give information about other
+features of the KVM implementation.
+
+8.1 KVM_CAP_PPC_HWRNG
+
+Architectures: ppc
+
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel has an implementation of the
+H_RANDOM hypercall backed by a hardware random-number generator.
+If present, the kernel H_RANDOM handler can be enabled for guest use
+with the KVM_CAP_PPC_ENABLE_HCALL capability.
diff --git a/MAINTAINERS b/MAINTAINERS
index f6f595021d6b..2e5bbc0d68b2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3066,10 +3066,16 @@ F:	drivers/net/fddi/defxx.*
 
 DELL LAPTOP DRIVER
 M:	Matthew Garrett <mjg59@srcf.ucam.org>
+M:	Pali Rohár <pali.rohar@gmail.com>
 L:	platform-driver-x86@vger.kernel.org
 S:	Maintained
 F:	drivers/platform/x86/dell-laptop.c
 
+DELL LAPTOP FREEFALL DRIVER
+M:	Pali Rohár <pali.rohar@gmail.com>
+S:	Maintained
+F:	drivers/platform/x86/dell-smo8800.c
+
 DELL LAPTOP SMM DRIVER
 M:	Guenter Roeck <linux@roeck-us.net>
 S:	Maintained
@@ -3084,6 +3090,7 @@ F:	drivers/firmware/dcdbas.*
 
 DELL WMI EXTRAS DRIVER
 M:	Matthew Garrett <mjg59@srcf.ucam.org>
+M:	Pali Rohár <pali.rohar@gmail.com>
 S:	Maintained
 F:	drivers/platform/x86/dell-wmi.c
 
@@ -3271,12 +3278,6 @@ F:	drivers/firmware/dmi-id.c
 F:	drivers/firmware/dmi_scan.c
 F:	include/linux/dmi.h
 
-DOCKING STATION DRIVER
-M:	Shaohua Li <shaohua.li@intel.com>
-L:	linux-acpi@vger.kernel.org
-S:	Supported
-F:	drivers/acpi/dock.c
-
 DOCUMENTATION
 M:	Jonathan Corbet <corbet@lwn.net>
 L:	linux-doc@vger.kernel.org
@@ -5009,6 +5010,11 @@ W:	http://industrypack.sourceforge.net
 S:	Maintained
 F:	drivers/ipack/
 
+INGENIC JZ4780 DMA Driver
+M:	Zubair Lutfullah Kakakhel <Zubair.Kakakhel@imgtec.com>
+S:	Maintained
+F:	drivers/dma/dma-jz4780.c
+
 INTEGRITY MEASUREMENT ARCHITECTURE (IMA)
 M:	Mimi Zohar <zohar@linux.vnet.ibm.com>
 M:	Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
@@ -7533,7 +7539,6 @@ S:	Maintained
 F:	drivers/pci/host/pci-exynos.c
 
 PCI DRIVER FOR SYNOPSIS DESIGNWARE
-M:	Mohit Kumar <mohit.kumar@st.com>
 M:	Jingoo Han <jg1.han@samsung.com>
 L:	linux-pci@vger.kernel.org
 S:	Maintained
@@ -7548,9 +7553,8 @@ F:	Documentation/devicetree/bindings/pci/host-generic-pci.txt
 F:	drivers/pci/host/pci-host-generic.c
 
 PCIE DRIVER FOR ST SPEAR13XX
-M:	Mohit Kumar <mohit.kumar@st.com>
 L:	linux-pci@vger.kernel.org
-S:	Maintained
+S:	Orphan
 F:	drivers/pci/host/*spear*
 
 PCMCIA SUBSYSTEM
@@ -8805,6 +8809,15 @@ W:	http://www.emulex.com
 S:	Supported
 F:	drivers/net/ethernet/emulex/benet/
 
+EMULEX ONECONNECT
ROCE DRIVER +M: Selvin Xavier <selvin.xavier@emulex.com> +M: Devesh Sharma <devesh.sharma@emulex.com> +M: Mitesh Ahuja <mitesh.ahuja@emulex.com> +L: linux-rdma@vger.kernel.org +W: http://www.emulex.com +S: Supported +F: drivers/infiniband/hw/ocrdma/ + SFC NETWORK DRIVER M: Solarflare linux maintainers <linux-net-drivers@solarflare.com> M: Shradha Shah <sshah@solarflare.com> @@ -9937,10 +9950,23 @@ S: Maintained F: drivers/platform/x86/topstar-laptop.c TOSHIBA ACPI EXTRAS DRIVER +M: Azael Avalos <coproscefalo@gmail.com> L: platform-driver-x86@vger.kernel.org -S: Orphan +S: Maintained F: drivers/platform/x86/toshiba_acpi.c +TOSHIBA BLUETOOTH DRIVER +M: Azael Avalos <coproscefalo@gmail.com> +L: platform-driver-x86@vger.kernel.org +S: Maintained +F: drivers/platform/x86/toshiba_bluetooth.c + +TOSHIBA HDD ACTIVE PROTECTION SENSOR DRIVER +M: Azael Avalos <coproscefalo@gmail.com> +L: platform-driver-x86@vger.kernel.org +S: Maintained +F: drivers/platform/x86/toshiba_haps.c + TOSHIBA SMM DRIVER M: Jonathan Buzzard <jonathan@buzzard.org.uk> L: tlinux-users@tce.toshiba-dme.co.jp @@ -10517,6 +10543,12 @@ S: Maintained F: drivers/vhost/ F: include/uapi/linux/vhost.h +VIRTIO INPUT DRIVER +M: Gerd Hoffmann <kraxel@redhat.com> +S: Maintained +F: drivers/virtio/virtio_input.c +F: include/uapi/linux/virtio_input.h + VIA RHINE NETWORK DRIVER M: Roger Luethi <rl@hellgate.ch> S: Maintained @@ -1,7 +1,7 @@ VERSION = 4 -PATCHLEVEL = 0 +PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Hurr durr I'ma sheep # *DOCUMENTATION* diff --git a/arch/arc/boot/dts/angel4.dts b/arch/arc/boot/dts/angel4.dts index 757e0c62c4f9..3b076fbd8366 100644 --- a/arch/arc/boot/dts/angel4.dts +++ b/arch/arc/boot/dts/angel4.dts @@ -64,7 +64,7 @@ }; arcpmu0: pmu { - compatible = "snps,arc700-pmu"; + compatible = "snps,arc700-pct"; }; }; }; diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index 278dacf2a3f9..d2ac4e56ba1d 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -2,6 +2,9 @@ CONFIG_CROSS_COMPILE="arc-linux-uclibc-" # CONFIG_LOCALVERSION_AUTO is not set CONFIG_DEFAULT_HOSTNAME="ARCLinux" # CONFIG_SWAP is not set +CONFIG_SYSVIPC=y +# CONFIG_CROSS_MEMORY_ATTACH is not set +CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y @@ -9,7 +12,7 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="../arc_initramfs" +CONFIG_INITRAMFS_SOURCE="../arc_initramfs/" CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y # CONFIG_SLUB_DEBUG is not set @@ -21,12 +24,9 @@ CONFIG_MODULES=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARC_PLAT_FPGA_LEGACY=y -# CONFIG_ARC_IDE is not set -# CONFIG_ARCTANGENT_EMAC is not set # CONFIG_ARC_HAS_RTSC is not set CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci" # CONFIG_COMPACTION is not set -# CONFIG_CROSS_MEMORY_ATTACH is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -39,23 +39,23 @@ CONFIG_INET=y # CONFIG_FIRMWARE_IN_KERNEL is not set # CONFIG_BLK_DEV is not set CONFIG_NETDEVICES=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set +# CONFIG_INPUT_MOUSEDEV is not set +CONFIG_INPUT_EVDEV=y # CONFIG_MOUSE_PS2_ALPS is not set # CONFIG_MOUSE_PS2_LOGIPS2PP is not set # CONFIG_MOUSE_PS2_SYNAPTICS is not set +# CONFIG_MOUSE_PS2_CYPRESS is not set # CONFIG_MOUSE_PS2_TRACKPOINT is not set CONFIG_MOUSE_PS2_TOUCHKIT=y -# CONFIG_SERIO_I8042 is not set # CONFIG_SERIO_SERPORT is not set CONFIG_SERIO_ARC_PS2=y # 
CONFIG_LEGACY_PTYS is not set # CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DW=y +CONFIG_SERIAL_8250_NR_UARTS=1 +CONFIG_SERIAL_8250_RUNTIME_UARTS=1 CONFIG_SERIAL_OF_PLATFORM=y -CONFIG_SERIAL_ARC=y -CONFIG_SERIAL_ARC_CONSOLE=y # CONFIG_HW_RANDOM is not set # CONFIG_HWMON is not set CONFIG_FB=y @@ -72,4 +72,3 @@ CONFIG_TMPFS=y CONFIG_NFS_FS=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_XZ_DEC=y diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index be33db8a2ee3..e2b1b1211b0d 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -30,6 +30,7 @@ #define ARC_REG_D_UNCACH_BCR 0x6A #define ARC_REG_BPU_BCR 0xc0 #define ARC_REG_ISA_CFG_BCR 0xc1 +#define ARC_REG_RTT_BCR 0xF2 #define ARC_REG_SMART_BCR 0xFF /* status32 Bits Positions */ @@ -50,11 +51,7 @@ * [15: 8] = Exception Cause Code * [ 7: 0] = Exception Parameters (for certain types only) */ -#define ECR_VEC_MASK 0xff0000 -#define ECR_CODE_MASK 0x00ff00 -#define ECR_PARAM_MASK 0x0000ff - -/* Exception Cause Vector Values */ +#define ECR_V_MEM_ERR 0x01 #define ECR_V_INSN_ERR 0x02 #define ECR_V_MACH_CHK 0x20 #define ECR_V_ITLB_MISS 0x21 @@ -62,7 +59,8 @@ #define ECR_V_PROTV 0x23 #define ECR_V_TRAP 0x25 -/* Protection Violation Exception Cause Code Values */ +/* DTLB Miss and Protection Violation Cause Codes */ + #define ECR_C_PROTV_INST_FETCH 0x00 #define ECR_C_PROTV_LOAD 0x01 #define ECR_C_PROTV_STORE 0x02 @@ -173,11 +171,11 @@ } \ } -#define WRITE_BCR(reg, into) \ +#define WRITE_AUX(reg, into) \ { \ unsigned int tmp; \ if (sizeof(tmp) == sizeof(into)) { \ - tmp = (*(unsigned int *)(into)); \ + tmp = (*(unsigned int *)&(into)); \ write_aux_reg(reg, tmp); \ } else { \ extern void bogus_undefined(void); \ diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index 1a5bf07eefe2..4051e9525939 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -32,6 +32,20 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *m) m += nr >> 5; + /* + * ARC ISA micro-optimization: + * + * Instructions dealing with bitpos only consider lower 5 bits (0-31) + * e.g (x << 33) is handled like (x << 1) by ASL instruction + * (mem pointer still needs adjustment to point to next word) + * + * Hence the masking to clamp @nr arg can be elided in general. + * + * However if @nr is a constant (above assumed it in a register), + * and greater than 31, gcc can optimize away (x << 33) to 0, + * as overflow, given the 32-bit ISA. Thus masking needs to be done + * for constant @nr, but no code is generated due to const prop. + */ if (__builtin_constant_p(nr)) nr &= 0x1f; @@ -374,29 +388,20 @@ __test_and_change_bit(unsigned long nr, volatile unsigned long *m) * This routine doesn't need to be atomic. */ static inline int -__constant_test_bit(unsigned int nr, const volatile unsigned long *addr) -{ - return ((1UL << (nr & 31)) & - (((const volatile unsigned int *)addr)[nr >> 5])) != 0; -} - -static inline int -__test_bit(unsigned int nr, const volatile unsigned long *addr) +test_bit(unsigned int nr, const volatile unsigned long *addr) { unsigned long mask; addr += nr >> 5; - /* ARC700 only considers 5 bits in bit-fiddling insn */ + if (__builtin_constant_p(nr)) + nr &= 0x1f; + mask = 1 << nr; return ((mask & *addr) != 0); } -#define test_bit(nr, addr) (__builtin_constant_p(nr) ? 
\ - __constant_test_bit((nr), (addr)) : \ - __test_bit((nr), (addr))) - /* * Count the number of zeros, starting from MSB * Helper for fls( ) friends diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index cbf755e32a03..2b8880e953a2 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -54,29 +54,13 @@ struct arc_reg_cc_build { #define PERF_COUNT_ARC_BPOK (PERF_COUNT_HW_MAX + 3) #define PERF_COUNT_ARC_EDTLB (PERF_COUNT_HW_MAX + 4) #define PERF_COUNT_ARC_EITLB (PERF_COUNT_HW_MAX + 5) -#define PERF_COUNT_ARC_HW_MAX (PERF_COUNT_HW_MAX + 6) +#define PERF_COUNT_ARC_LDC (PERF_COUNT_HW_MAX + 6) +#define PERF_COUNT_ARC_STC (PERF_COUNT_HW_MAX + 7) + +#define PERF_COUNT_ARC_HW_MAX (PERF_COUNT_HW_MAX + 8) /* - * The "generalized" performance events seem to really be a copy - * of the available events on x86 processors; the mapping to ARC - * events is not always possible 1-to-1. Fortunately, there doesn't - * seem to be an exact definition for these events, so we can cheat - * a bit where necessary. - * - * In particular, the following PERF events may behave a bit differently - * compared to other architectures: - * - * PERF_COUNT_HW_CPU_CYCLES - * Cycles not in halted state - * - * PERF_COUNT_HW_REF_CPU_CYCLES - * Reference cycles not in halted state, same as PERF_COUNT_HW_CPU_CYCLES - * for now as we don't do Dynamic Voltage/Frequency Scaling (yet) - * - * PERF_COUNT_HW_BUS_CYCLES - * Unclear what this means, Intel uses 0x013c, which according to - * their datasheet means "unhalted reference cycles". It sounds similar - * to PERF_COUNT_HW_REF_CPU_CYCLES, and we use the same counter for it. + * Some ARC pct quirks: * * PERF_COUNT_HW_STALLED_CYCLES_BACKEND * PERF_COUNT_HW_STALLED_CYCLES_FRONTEND @@ -91,21 +75,38 @@ struct arc_reg_cc_build { * Note that I$ cache misses aren't counted by either of the two! */ +/* + * ARC PCT has hardware conditions with fixed "names" but variable "indexes" + * (based on a specific RTL build) + * Below is the static map between perf generic/arc specific event_id and + * h/w condition names. 
+ * At the time of probe, we loop through each index and find its name to
+ * complete the mapping of perf event_id to h/w index, as the latter is
+ * needed to actually program the counter.
+ */
 static const char * const arc_pmu_ev_hw_map[] = {
+	/* count cycles */
 	[PERF_COUNT_HW_CPU_CYCLES] = "crun",
 	[PERF_COUNT_HW_REF_CPU_CYCLES] = "crun",
 	[PERF_COUNT_HW_BUS_CYCLES] = "crun",
-	[PERF_COUNT_HW_INSTRUCTIONS] = "iall",
-	[PERF_COUNT_HW_BRANCH_MISSES] = "bpfail",
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp",
+
 	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = "bflush",
 	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = "bstall",
-	[PERF_COUNT_ARC_DCLM] = "dclm",
-	[PERF_COUNT_ARC_DCSM] = "dcsm",
-	[PERF_COUNT_ARC_ICM] = "icm",
-	[PERF_COUNT_ARC_BPOK] = "bpok",
-	[PERF_COUNT_ARC_EDTLB] = "edtlb",
-	[PERF_COUNT_ARC_EITLB] = "eitlb",
+
+	/* counts condition */
+	[PERF_COUNT_HW_INSTRUCTIONS] = "iall",
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp",
+	[PERF_COUNT_ARC_BPOK] = "bpok",		/* NP-NT, PT-T, PNT-NT */
+	[PERF_COUNT_HW_BRANCH_MISSES] = "bpfail",	/* NP-T, PT-NT, PNT-T */
+
+	[PERF_COUNT_ARC_LDC] = "imemrdc",	/* Instr: mem read cached */
+	[PERF_COUNT_ARC_STC] = "imemwrc",	/* Instr: mem write cached */
+
+	[PERF_COUNT_ARC_DCLM] = "dclm",		/* D-cache Load Miss */
+	[PERF_COUNT_ARC_DCSM] = "dcsm",		/* D-cache Store Miss */
+	[PERF_COUNT_ARC_ICM] = "icm",		/* I-cache Miss */
+	[PERF_COUNT_ARC_EDTLB] = "edtlb",	/* D-TLB Miss */
+	[PERF_COUNT_ARC_EITLB] = "eitlb",	/* I-TLB Miss */
 };
 
 #define C(_x)			PERF_COUNT_HW_CACHE_##_x
 
@@ -114,11 +115,11 @@ static const char * const arc_pmu_ev_hw_map[] = {
 static const unsigned arc_pmu_cache_map[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	[C(L1D)] = {
 		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_ACCESS)]	= PERF_COUNT_ARC_LDC,
 			[C(RESULT_MISS)]	= PERF_COUNT_ARC_DCLM,
 		},
 		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_ACCESS)]	= PERF_COUNT_ARC_STC,
 			[C(RESULT_MISS)]	= PERF_COUNT_ARC_DCSM,
 		},
 		[C(OP_PREFETCH)] = {
@@ -128,7 +129,7 @@ static const unsigned arc_pmu_cache_map[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 	[C(L1I)] = {
 		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_ACCESS)]	= PERF_COUNT_HW_INSTRUCTIONS,
 			[C(RESULT_MISS)]	= PERF_COUNT_ARC_ICM,
 		},
 		[C(OP_WRITE)] = {
@@ -156,9 +157,10 @@ static const unsigned arc_pmu_cache_map[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 	},
 	[C(DTLB)] = {
 		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
+			[C(RESULT_ACCESS)]	= PERF_COUNT_ARC_LDC,
 			[C(RESULT_MISS)]	= PERF_COUNT_ARC_EDTLB,
 		},
+		/* DTLB LD/ST Miss not segregated by h/w */
 		[C(OP_WRITE)] = {
 			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
 			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c
index ae1c485cbc68..fd2ec50102f2 100644
--- a/arch/arc/kernel/perf_event.c
+++ b/arch/arc/kernel/perf_event.c
@@ -16,6 +16,7 @@
 #include <linux/perf_event.h>
 #include <linux/platform_device.h>
 #include <asm/arcregs.h>
+#include <asm/stacktrace.h>
 
 struct arc_pmu {
 	struct pmu	pmu;
@@ -25,6 +26,46 @@ struct arc_pmu {
 	int		ev_hw_idx[PERF_COUNT_ARC_HW_MAX];
 };
 
+struct arc_callchain_trace {
+	int depth;
+	void *perf_stuff;
+};
+
+static int callchain_trace(unsigned int addr, void *data)
+{
+	struct arc_callchain_trace *ctrl = data;
+	struct perf_callchain_entry *entry = ctrl->perf_stuff;
+	perf_callchain_store(entry, addr);
+
+	if (ctrl->depth++ < 3)
+		return 0;
+
+	return -1;
+}
+
+void
+perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+{
+	struct 
arc_callchain_trace ctrl = { + .depth = 0, + .perf_stuff = entry, + }; + + arc_unwind_core(NULL, regs, callchain_trace, &ctrl); +} + +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +{ + /* + * User stack can't be unwound trivially with kernel dwarf unwinder + * So for now just record the user PC + */ + perf_callchain_store(entry, instruction_pointer(regs)); +} + +static struct arc_pmu *arc_pmu; + /* read counter #idx; note that counter# != event# on ARC! */ static uint64_t arc_pmu_read_counter(int idx) { @@ -47,7 +88,6 @@ static uint64_t arc_pmu_read_counter(int idx) static void arc_perf_event_update(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - struct arc_pmu *arc_pmu = container_of(event->pmu, struct arc_pmu, pmu); uint64_t prev_raw_count, new_raw_count; int64_t delta; @@ -89,13 +129,16 @@ static int arc_pmu_cache_event(u64 config) if (ret == CACHE_OP_UNSUPPORTED) return -ENOENT; + pr_debug("init cache event: type/op/result %d/%d/%d with h/w %d \'%s\'\n", + cache_type, cache_op, cache_result, ret, + arc_pmu_ev_hw_map[ret]); + return ret; } /* initializes hw_perf_event structure if event is supported */ static int arc_pmu_event_init(struct perf_event *event) { - struct arc_pmu *arc_pmu = container_of(event->pmu, struct arc_pmu, pmu); struct hw_perf_event *hwc = &event->hw; int ret; @@ -106,8 +149,9 @@ static int arc_pmu_event_init(struct perf_event *event) if (arc_pmu->ev_hw_idx[event->attr.config] < 0) return -ENOENT; hwc->config = arc_pmu->ev_hw_idx[event->attr.config]; - pr_debug("initializing event %d with cfg %d\n", - (int) event->attr.config, (int) hwc->config); + pr_debug("init event %d with h/w %d \'%s\'\n", + (int) event->attr.config, (int) hwc->config, + arc_pmu_ev_hw_map[event->attr.config]); return 0; case PERF_TYPE_HW_CACHE: ret = arc_pmu_cache_event(event->attr.config); @@ -183,8 +227,6 @@ static void arc_pmu_stop(struct perf_event *event, int flags) static void arc_pmu_del(struct perf_event *event, int flags) { - struct arc_pmu *arc_pmu = container_of(event->pmu, struct arc_pmu, pmu); - arc_pmu_stop(event, PERF_EF_UPDATE); __clear_bit(event->hw.idx, arc_pmu->used_mask); @@ -194,7 +236,6 @@ static void arc_pmu_del(struct perf_event *event, int flags) /* allocate hardware counter and optionally start counting */ static int arc_pmu_add(struct perf_event *event, int flags) { - struct arc_pmu *arc_pmu = container_of(event->pmu, struct arc_pmu, pmu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -247,10 +288,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) BUG_ON(pct_bcr.c > ARC_PMU_MAX_HWEVENTS); READ_BCR(ARC_REG_CC_BUILD, cc_bcr); - if (!cc_bcr.v) { - pr_err("Performance counters exist, but no countable conditions?\n"); - return -ENODEV; - } + BUG_ON(!cc_bcr.v); /* Counters exist but No countable conditions ? 
*/ arc_pmu = devm_kzalloc(&pdev->dev, sizeof(struct arc_pmu), GFP_KERNEL); if (!arc_pmu) @@ -263,19 +301,22 @@ static int arc_pmu_device_probe(struct platform_device *pdev) arc_pmu->n_counters, arc_pmu->counter_size, cc_bcr.c); cc_name.str[8] = 0; - for (i = 0; i < PERF_COUNT_HW_MAX; i++) + for (i = 0; i < PERF_COUNT_ARC_HW_MAX; i++) arc_pmu->ev_hw_idx[i] = -1; + /* loop thru all available h/w condition indexes */ for (j = 0; j < cc_bcr.c; j++) { write_aux_reg(ARC_REG_CC_INDEX, j); cc_name.indiv.word0 = read_aux_reg(ARC_REG_CC_NAME0); cc_name.indiv.word1 = read_aux_reg(ARC_REG_CC_NAME1); + + /* See if it has been mapped to a perf event_id */ for (i = 0; i < ARRAY_SIZE(arc_pmu_ev_hw_map); i++) { if (arc_pmu_ev_hw_map[i] && !strcmp(arc_pmu_ev_hw_map[i], cc_name.str) && strlen(arc_pmu_ev_hw_map[i])) { - pr_debug("mapping %d to idx %d with name %s\n", - i, j, cc_name.str); + pr_debug("mapping perf event %2d to h/w event \'%8s\' (idx %d)\n", + i, cc_name.str, j); arc_pmu->ev_hw_idx[i] = j; } } @@ -302,7 +343,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arc_pmu_match[] = { - { .compatible = "snps,arc700-pmu" }, + { .compatible = "snps,arc700-pct" }, {}, }; MODULE_DEVICE_TABLE(of, arc_pmu_match); @@ -310,7 +351,7 @@ MODULE_DEVICE_TABLE(of, arc_pmu_match); static struct platform_driver arc_pmu_driver = { .driver = { - .name = "arc700-pmu", + .name = "arc700-pct", .of_match_table = of_match_ptr(arc_pmu_match), }, .probe = arc_pmu_device_probe, diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index f46efd14059d..e095c557afdd 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -49,7 +49,10 @@ void arch_cpu_idle(void) asmlinkage void ret_from_fork(void); -/* Layout of Child kernel mode stack as setup at the end of this function is +/* + * Copy architecture-specific thread state + * + * Layout of Child kernel mode stack as setup at the end of this function is * * | ... | * | ... | @@ -81,7 +84,7 @@ asmlinkage void ret_from_fork(void); * ------------------ <===== END of PAGE */ int copy_thread(unsigned long clone_flags, - unsigned long usp, unsigned long arg, + unsigned long usp, unsigned long kthread_arg, struct task_struct *p) { struct pt_regs *c_regs; /* child's pt_regs */ @@ -112,7 +115,7 @@ int copy_thread(unsigned long clone_flags, if (unlikely(p->flags & PF_KTHREAD)) { memset(c_regs, 0, sizeof(struct pt_regs)); - c_callee->r13 = arg; /* argument to kernel thread */ + c_callee->r13 = kthread_arg; c_callee->r14 = usp; /* function */ return 0; diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 900f68a70088..1d167c6df8ca 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -120,7 +120,10 @@ static void read_arc_build_cfg_regs(void) READ_BCR(ARC_REG_SMART_BCR, bcr); cpu->extn.smart = bcr.ver ? 1 : 0; - cpu->extn.debug = cpu->extn.ap | cpu->extn.smart; + READ_BCR(ARC_REG_RTT_BCR, bcr); + cpu->extn.rtt = bcr.ver ? 
1 : 0; + + cpu->extn.debug = cpu->extn.ap | cpu->extn.smart | cpu->extn.rtt; } static const struct cpuinfo_data arc_cpu_tbl[] = { diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index 3eadfdabc322..c927aa84e652 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -42,7 +42,7 @@ void die(const char *str, struct pt_regs *regs, unsigned long address) * -for kernel, chk if due to copy_(to|from)_user, otherwise die() */ static noinline int -handle_exception(const char *str, struct pt_regs *regs, siginfo_t *info) +unhandled_exception(const char *str, struct pt_regs *regs, siginfo_t *info) { if (user_mode(regs)) { struct task_struct *tsk = current; @@ -71,7 +71,7 @@ int name(unsigned long address, struct pt_regs *regs) \ .si_code = sicode, \ .si_addr = (void __user *)address, \ }; \ - return handle_exception(str, regs, &info);\ + return unhandled_exception(str, regs, &info);\ } /* diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 523412369f70..d44eedd8c322 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -71,7 +71,7 @@ early_param("initrd", early_initrd); */ void __init setup_arch_memory(void) { - unsigned long zones_size[MAX_NR_ZONES] = { 0, 0 }; + unsigned long zones_size[MAX_NR_ZONES]; unsigned long end_mem = CONFIG_LINUX_LINK_BASE + arc_mem_sz; init_mm.start_code = (unsigned long)_text; @@ -90,7 +90,7 @@ void __init setup_arch_memory(void) /*------------- externs in mm need setting up ---------------*/ /* first page of system - kernel .vector starts here */ - min_low_pfn = PFN_DOWN(CONFIG_LINUX_LINK_BASE); + min_low_pfn = ARCH_PFN_OFFSET; /* Last usable page of low mem (no HIGHMEM yet for ARC port) */ max_low_pfn = max_pfn = PFN_DOWN(end_mem); @@ -111,7 +111,7 @@ void __init setup_arch_memory(void) /*-------------- node setup --------------------------------*/ memset(zones_size, 0, sizeof(zones_size)); - zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn; + zones_size[ZONE_NORMAL] = max_mapnr; /* * We can't use the helper free_area_init(zones[]) because it uses @@ -123,6 +123,8 @@ void __init setup_arch_memory(void) zones_size, /* num pages per zone */ min_low_pfn, /* first pfn of node */ NULL); /* NO holes */ + + high_memory = (void *)end_mem; } /* @@ -133,7 +135,6 @@ void __init setup_arch_memory(void) */ void __init mem_init(void) { - high_memory = (void *)(CONFIG_LINUX_LINK_BASE + arc_mem_sz); free_all_bootmem(); mem_init_print_info(NULL); } diff --git a/arch/arm/boot/dts/qcom-ipq8064.dtsi b/arch/arm/boot/dts/qcom-ipq8064.dtsi index 1bc5fdd0e4b3..9f727d8eadf6 100644 --- a/arch/arm/boot/dts/qcom-ipq8064.dtsi +++ b/arch/arm/boot/dts/qcom-ipq8064.dtsi @@ -61,6 +61,14 @@ }; }; + clocks { + sleep_clk: sleep_clk { + compatible = "fixed-clock"; + clock-frequency = <32768>; + #clock-cells = <0>; + }; + }; + soc: soc { #address-cells = <1>; #size-cells = <1>; @@ -105,10 +113,14 @@ compatible = "qcom,kpss-timer", "qcom,msm-timer"; interrupts = <1 1 0x301>, <1 2 0x301>, - <1 3 0x301>; + <1 3 0x301>, + <1 4 0x301>, + <1 5 0x301>; reg = <0x0200a000 0x100>; clock-frequency = <25000000>, <32768>; + clocks = <&sleep_clk>; + clock-names = "sleep"; cpu-offset = <0x80000>; }; diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 2499867dd0d8..df3f60cb1168 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -195,8 +195,14 @@ struct kvm_arch_memory_slot { #define KVM_ARM_IRQ_CPU_IRQ 0 #define KVM_ARM_IRQ_CPU_FIQ 1 -/* Highest supported SPI, from VGIC_NR_IRQS */ +/* + * This used to hold the 
highest supported SPI, but it is now obsolete + * and only here to provide source code level compatibility with older + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. + */ +#ifndef __KERNEL__ #define KVM_ARM_IRQ_GIC_MAX 127 +#endif /* One single KVM irqchip, ie. the VGIC */ #define KVM_NR_IRQCHIPS 1 diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index cc176b67c134..aebfbf79a1a3 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -80,9 +80,9 @@ ENTRY(stext) ldr r13, =__mmap_switched @ address to jump to after @ initialising sctlr adr lr, BSYM(1f) @ return (PIC) address - ARM( add pc, r10, #PROCINFO_INITFUNC ) - THUMB( add r12, r10, #PROCINFO_INITFUNC ) - THUMB( ret r12 ) + ldr r12, [r10, #PROCINFO_INITFUNC] + add r12, r12, r10 + ret r12 1: b __after_proc_init ENDPROC(stext) @@ -117,9 +117,9 @@ ENTRY(secondary_startup) adr lr, BSYM(__after_proc_init) @ return address mov r13, r12 @ __secondary_switched address - ARM( add pc, r10, #PROCINFO_INITFUNC ) - THUMB( add r12, r10, #PROCINFO_INITFUNC ) - THUMB( ret r12 ) + ldr r12, [r10, #PROCINFO_INITFUNC] + add r12, r12, r10 + ret r12 ENDPROC(secondary_startup) ENTRY(__secondary_switched) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 6f536451ab78..d9631ecddd56 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -671,8 +671,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, if (!irqchip_in_kernel(kvm)) return -ENXIO; - if (irq_num < VGIC_NR_PRIVATE_IRQS || - irq_num > KVM_ARM_IRQ_GIC_MAX) + if (irq_num < VGIC_NR_PRIVATE_IRQS) return -EINVAL; return kvm_vgic_inject_irq(kvm, 0, irq_num, level); diff --git a/arch/arm/mach-shmobile/board-armadillo800eva.c b/arch/arm/mach-shmobile/board-armadillo800eva.c index 36aaeb12e1a5..bf37e3c532f6 100644 --- a/arch/arm/mach-shmobile/board-armadillo800eva.c +++ b/arch/arm/mach-shmobile/board-armadillo800eva.c @@ -754,12 +754,12 @@ static struct platform_device vcc_sdhi1 = { }; /* SDHI0 */ -static struct sh_mobile_sdhi_info sdhi0_info = { - .dma_slave_tx = SHDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI0_RX, - .tmio_caps = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | +static struct tmio_mmc_data sdhi0_info = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI0_RX, + .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_POWER_OFF_CARD, - .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_USE_GPIO_CD, + .flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_USE_GPIO_CD, .cd_gpio = 167, }; @@ -796,12 +796,12 @@ static struct platform_device sdhi0_device = { }; /* SDHI1 */ -static struct sh_mobile_sdhi_info sdhi1_info = { - .dma_slave_tx = SHDMA_SLAVE_SDHI1_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI1_RX, - .tmio_caps = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | +static struct tmio_mmc_data sdhi1_info = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI1_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI1_RX, + .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_POWER_OFF_CARD, - .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_USE_GPIO_CD, + .flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_USE_GPIO_CD, /* Port72 cannot generate IRQs, will be used in polling mode. 
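 * The sh_mobile_sdhi_info -> tmio_mmc_data conversion in this and the
 * following board files is mechanical: dma_slave_tx/rx become
 * chan_priv_tx/rx (cast to void *), tmio_caps becomes capabilities,
 * tmio_ocr_mask becomes ocr_mask and tmio_flags becomes flags, with the
 * values themselves carried over unchanged.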
*/ .cd_gpio = 72, }; diff --git a/arch/arm/mach-shmobile/board-bockw.c b/arch/arm/mach-shmobile/board-bockw.c index f27b5a833bf0..25558d1f417f 100644 --- a/arch/arm/mach-shmobile/board-bockw.c +++ b/arch/arm/mach-shmobile/board-bockw.c @@ -201,12 +201,12 @@ static struct rcar_phy_platform_data usb_phy_platform_data __initdata = /* SDHI */ -static struct sh_mobile_sdhi_info sdhi0_info __initdata = { - .dma_slave_tx = HPBDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = HPBDMA_SLAVE_SDHI0_RX, - .tmio_caps = MMC_CAP_SD_HIGHSPEED, - .tmio_ocr_mask = MMC_VDD_165_195 | MMC_VDD_32_33 | MMC_VDD_33_34, - .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT, +static struct tmio_mmc_data sdhi0_info __initdata = { + .chan_priv_tx = (void *)HPBDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)HPBDMA_SLAVE_SDHI0_RX, + .capabilities = MMC_CAP_SD_HIGHSPEED, + .ocr_mask = MMC_VDD_165_195 | MMC_VDD_32_33 | MMC_VDD_33_34, + .flags = TMIO_MMC_HAS_IDLE_WAIT, }; static struct resource sdhi0_resources[] __initdata = { @@ -683,7 +683,7 @@ static void __init bockw_init(void) platform_device_register_resndata( NULL, "sh_mobile_sdhi", 0, sdhi0_resources, ARRAY_SIZE(sdhi0_resources), - &sdhi0_info, sizeof(struct sh_mobile_sdhi_info)); + &sdhi0_info, sizeof(struct tmio_mmc_data)); } /* for Audio */ diff --git a/arch/arm/mach-shmobile/board-kzm9g.c b/arch/arm/mach-shmobile/board-kzm9g.c index 7c9b63bdde9f..260d8319fd82 100644 --- a/arch/arm/mach-shmobile/board-kzm9g.c +++ b/arch/arm/mach-shmobile/board-kzm9g.c @@ -442,11 +442,11 @@ static struct platform_device vcc_sdhi2 = { }; /* SDHI */ -static struct sh_mobile_sdhi_info sdhi0_info = { - .dma_slave_tx = SHDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI0_RX, - .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT, - .tmio_caps = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | +static struct tmio_mmc_data sdhi0_info = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI0_RX, + .flags = TMIO_MMC_HAS_IDLE_WAIT, + .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_POWER_OFF_CARD, }; @@ -484,13 +484,13 @@ static struct platform_device sdhi0_device = { }; /* Micro SD */ -static struct sh_mobile_sdhi_info sdhi2_info = { - .dma_slave_tx = SHDMA_SLAVE_SDHI2_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI2_RX, - .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | +static struct tmio_mmc_data sdhi2_info = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI2_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI2_RX, + .flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_USE_GPIO_CD | TMIO_MMC_WRPROTECT_DISABLE, - .tmio_caps = MMC_CAP_SD_HIGHSPEED | MMC_CAP_POWER_OFF_CARD, + .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_POWER_OFF_CARD, .cd_gpio = 13, }; diff --git a/arch/arm/mach-shmobile/board-marzen.c b/arch/arm/mach-shmobile/board-marzen.c index 598f704f76ae..51db288f192a 100644 --- a/arch/arm/mach-shmobile/board-marzen.c +++ b/arch/arm/mach-shmobile/board-marzen.c @@ -122,11 +122,11 @@ static struct resource sdhi0_resources[] = { }, }; -static struct sh_mobile_sdhi_info sdhi0_platform_data = { - .dma_slave_tx = HPBDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = HPBDMA_SLAVE_SDHI0_RX, - .tmio_flags = TMIO_MMC_WRPROTECT_DISABLE | TMIO_MMC_HAS_IDLE_WAIT, - .tmio_caps = MMC_CAP_SD_HIGHSPEED, +static struct tmio_mmc_data sdhi0_platform_data = { + .chan_priv_tx = (void *)HPBDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)HPBDMA_SLAVE_SDHI0_RX, + .flags = TMIO_MMC_WRPROTECT_DISABLE | TMIO_MMC_HAS_IDLE_WAIT, + .capabilities = MMC_CAP_SD_HIGHSPEED, }; static struct platform_device sdhi0_device = { diff --git a/arch/arm/mm/Kconfig 
b/arch/arm/mm/Kconfig index b7644310236b..b4f92b9a13ac 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -827,7 +827,7 @@ config KUSER_HELPERS config VDSO bool "Enable VDSO for acceleration of some system calls" - depends on AEABI && MMU + depends on AEABI && MMU && CPU_V7 default y if ARM_ARCH_TIMER select GENERIC_TIME_VSYSCALL help diff --git a/arch/arm/vdso/.gitignore b/arch/arm/vdso/.gitignore index f8b69d84238e..6b47f6e0b032 100644 --- a/arch/arm/vdso/.gitignore +++ b/arch/arm/vdso/.gitignore @@ -1 +1,3 @@ vdso.lds +vdso.so.raw +vdsomunge diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index bab0a8be7924..8aa791051029 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -10,8 +10,8 @@ ccflags-y := -shared -fPIC -fno-common -fno-builtin -fno-stack-protector ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 -DDISABLE_BRANCH_PROFILING ccflags-y += -Wl,--no-undefined $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) -obj-y += vdso.o -extra-y += vdso.lds +obj-$(CONFIG_VDSO) += vdso.o +extra-$(CONFIG_VDSO) += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) CFLAGS_REMOVE_vdso.o = -pg diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index da5f20e8cc50..4269dba63cf1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1,5 +1,7 @@ config ARM64 def_bool y + select ACPI_GENERIC_GSI if ACPI + select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL @@ -758,6 +760,8 @@ source "drivers/Kconfig" source "drivers/firmware/Kconfig" +source "drivers/acpi/Kconfig" + source "fs/Kconfig" source "arch/arm64/kvm/Kconfig" diff --git a/arch/arm64/boot/dts/apm/apm-storm.dtsi b/arch/arm64/boot/dts/apm/apm-storm.dtsi index e74f6e0a208c..c8d3e0e86678 100644 --- a/arch/arm64/boot/dts/apm/apm-storm.dtsi +++ b/arch/arm64/boot/dts/apm/apm-storm.dtsi @@ -102,6 +102,7 @@ #address-cells = <2>; #size-cells = <2>; ranges; + dma-ranges = <0x0 0x0 0x0 0x0 0x400 0x0>; clocks { #address-cells = <2>; @@ -362,6 +363,15 @@ reg-names = "csr-reg"; clock-output-names = "pcie4clk"; }; + + dmaclk: dmaclk@1f27c000 { + compatible = "apm,xgene-device-clock"; + #clock-cells = <1>; + clocks = <&socplldiv2 0>; + reg = <0x0 0x1f27c000 0x0 0x1000>; + reg-names = "csr-reg"; + clock-output-names = "dmaclk"; + }; }; pcie0: pcie@1f2b0000 { @@ -684,5 +694,21 @@ interrupts = <0x0 0x41 0x4>; clocks = <&rngpkaclk 0>; }; + + dma: dma@1f270000 { + compatible = "apm,xgene-storm-dma"; + device_type = "dma"; + reg = <0x0 0x1f270000 0x0 0x10000>, + <0x0 0x1f200000 0x0 0x10000>, + <0x0 0x1b008000 0x0 0x2000>, + <0x0 0x1054a000 0x0 0x100>; + interrupts = <0x0 0x82 0x4>, + <0x0 0xb8 0x4>, + <0x0 0xb9 0x4>, + <0x0 0xba 0x4>, + <0x0 0xbb 0x4>; + dma-coherent; + clocks = <&dmaclk 0>; + }; }; }; diff --git a/arch/arm64/include/asm/acenv.h b/arch/arm64/include/asm/acenv.h new file mode 100644 index 000000000000..b49166fde7ea --- /dev/null +++ b/arch/arm64/include/asm/acenv.h @@ -0,0 +1,18 @@ +/* + * ARM64 specific ACPICA environments and implementation + * + * Copyright (C) 2014, Linaro Ltd. + * Author: Hanjun Guo <hanjun.guo@linaro.org> + * Author: Graeme Gregory <graeme.gregory@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _ASM_ACENV_H +#define _ASM_ACENV_H + +/* It is required unconditionally by ACPI core, update it when needed. 
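+ * Nothing needs to be defined here yet; should an override ever become
+ * necessary it would take the form of an environment macro, e.g. the
+ * (x86) definition
+ *
+ *	#define ACPI_FLUSH_CPU_CACHE()	wbinvd()
+ *
+ * from arch/x86/include/asm/acenv.h.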
*/ + +#endif /* _ASM_ACENV_H */ diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h new file mode 100644 index 000000000000..59c05d8ea4a0 --- /dev/null +++ b/arch/arm64/include/asm/acpi.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2013-2014, Linaro Ltd. + * Author: Al Stone <al.stone@linaro.org> + * Author: Graeme Gregory <graeme.gregory@linaro.org> + * Author: Hanjun Guo <hanjun.guo@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation; + */ + +#ifndef _ASM_ACPI_H +#define _ASM_ACPI_H + +#include <linux/mm.h> +#include <linux/irqchip/arm-gic-acpi.h> + +#include <asm/cputype.h> +#include <asm/smp_plat.h> + +/* Basic configuration for ACPI */ +#ifdef CONFIG_ACPI +/* ACPI table mapping after acpi_gbl_permanent_mmap is set */ +static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, + acpi_size size) +{ + if (!page_is_ram(phys >> PAGE_SHIFT)) + return ioremap(phys, size); + + return ioremap_cache(phys, size); +} +#define acpi_os_ioremap acpi_os_ioremap + +typedef u64 phys_cpuid_t; +#define PHYS_CPUID_INVALID INVALID_HWID + +#define acpi_strict 1 /* No out-of-spec workarounds on ARM64 */ +extern int acpi_disabled; +extern int acpi_noirq; +extern int acpi_pci_disabled; + +/* 1 to indicate PSCI 0.2+ is implemented */ +static inline bool acpi_psci_present(void) +{ + return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_COMPLIANT; +} + +/* 1 to indicate HVC must be used instead of SMC as the PSCI conduit */ +static inline bool acpi_psci_use_hvc(void) +{ + return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; +} + +static inline void disable_acpi(void) +{ + acpi_disabled = 1; + acpi_pci_disabled = 1; + acpi_noirq = 1; +} + +static inline void enable_acpi(void) +{ + acpi_disabled = 0; + acpi_pci_disabled = 0; + acpi_noirq = 0; +} + +/* + * The ACPI processor driver for ACPI core code needs this macro + * to find out whether this cpu has already been mapped (mapping from + * CPU hardware ID to CPU logical ID) or not. + */ +#define cpu_physical_id(cpu) cpu_logical_map(cpu) + +/* + * It is used by the ACPI core in kdump to boot a UP system with an SMP + * kernel; with this check the ACPI core will neither override the CPU + * index obtained from the GICC with 0 nor print a spurious error message. + * Since the MADT must provide at least one GICC structure for GIC + * initialization, a CPU is always available in the MADT on ARM64.
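+ * (On x86 the equivalent helper instead has to report whether LAPIC
+ * entries were actually parsed, since a kdump kernel's boot CPU may be
+ * missing from the MADT there; on ARM64 a constant true suffices.)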
+ */ +static inline bool acpi_has_cpu_in_madt(void) +{ + return true; +} + +static inline void arch_fix_phys_package_id(int num, u32 slot) { } +void __init acpi_init_cpus(void); + +#else +static inline bool acpi_psci_present(void) { return false; } +static inline bool acpi_psci_use_hvc(void) { return false; } +static inline void acpi_init_cpus(void) { } +#endif /* CONFIG_ACPI */ + +#endif /*_ASM_ACPI_H*/ diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h index da301ee7395c..5a31d6716914 100644 --- a/arch/arm64/include/asm/cpu_ops.h +++ b/arch/arm64/include/asm/cpu_ops.h @@ -66,5 +66,6 @@ struct cpu_operations { extern const struct cpu_operations *cpu_ops[NR_CPUS]; int __init cpu_read_ops(struct device_node *dn, int cpu); void __init cpu_read_bootcpu_ops(void); +const struct cpu_operations *cpu_get_ops(const char *name); #endif /* ifndef __ASM_CPU_OPS_H */ diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 926495686554..95e6b6dcbe37 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -62,6 +62,9 @@ void __init early_fixmap_init(void); #define __early_set_fixmap __set_fixmap +#define __late_set_fixmap __set_fixmap +#define __late_clear_fixmap(idx) __set_fixmap((idx), 0, FIXMAP_PAGE_CLEAR) + extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot); #include <asm-generic/fixmap.h> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 94c53674a31d..bbb251b14746 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,6 +1,8 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H +#include <linux/irqchip/arm-gic-acpi.h> + #include <asm-generic/irq.h> struct pt_regs; @@ -8,4 +10,15 @@ struct pt_regs; extern void migrate_irqs(void); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); +static inline void acpi_irq_init(void) +{ + /* + * Hardcode ACPI IRQ chip initialization to GICv2 for now. + * Proper irqchip infrastructure will be implemented along with + * incoming GICv2m|GICv3|ITS bits. + */ + acpi_gic_init(); +} +#define acpi_irq_init acpi_irq_init + #endif diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h index 872ba939fcb2..b008a72f8bc0 100644 --- a/arch/arm64/include/asm/pci.h +++ b/arch/arm64/include/asm/pci.h @@ -27,6 +27,12 @@ extern int isa_dma_bridge_buggy; #ifdef CONFIG_PCI +static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) +{ + /* no legacy IRQ on arm64 */ + return -ENODEV; +} + static inline int pci_proc_domain(struct pci_bus *bus) { return 1; diff --git a/arch/arm64/include/asm/psci.h b/arch/arm64/include/asm/psci.h index e5312ea0ec1a..2454bc59c916 100644 --- a/arch/arm64/include/asm/psci.h +++ b/arch/arm64/include/asm/psci.h @@ -14,6 +14,7 @@ #ifndef __ASM_PSCI_H #define __ASM_PSCI_H -int psci_init(void); +int psci_dt_init(void); +int psci_acpi_init(void); #endif /* __ASM_PSCI_H */ diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 780f82c827b6..bf22650b1a78 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -39,9 +39,10 @@ extern void show_ipi_list(struct seq_file *p, int prec); extern void handle_IPI(int ipinr, struct pt_regs *regs); /* - * Setup the set of possible CPUs (via set_cpu_possible) + * Discover the set of possible CPUs and determine their + * SMP operations. 
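+ * With the rename below this helper becomes DT-only; when booting via
+ * ACPI the same discovery is performed by acpi_init_cpus() instead (see
+ * the setup_arch() changes further down).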
*/ -extern void smp_init_cpus(void); +extern void of_smp_init_cpus(void); /* * Provide a function to raise an IPI cross call on CPUs in callmap. diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c154c0b7eb60..d26832022127 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -188,8 +188,14 @@ struct kvm_arch_memory_slot { #define KVM_ARM_IRQ_CPU_IRQ 0 #define KVM_ARM_IRQ_CPU_FIQ 1 -/* Highest supported SPI, from VGIC_NR_IRQS */ +/* + * This used to hold the highest supported SPI, but it is now obsolete + * and only here to provide source code level compatibility with older + * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. + */ +#ifndef __KERNEL__ #define KVM_ARM_IRQ_GIC_MAX 127 +#endif /* One single KVM irqchip, ie. the VGIC */ #define KVM_NR_IRQCHIPS 1 diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index b12e15b80516..426d0763c81b 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -35,6 +35,7 @@ arm64-obj-$(CONFIG_KGDB) += kgdb.o arm64-obj-$(CONFIG_EFI) += efi.o efi-stub.o efi-entry.o arm64-obj-$(CONFIG_PCI) += pci.o arm64-obj-$(CONFIG_ARMV8_DEPRECATED) += armv8_deprecated.o +arm64-obj-$(CONFIG_ACPI) += acpi.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c new file mode 100644 index 000000000000..8b839558838e --- /dev/null +++ b/arch/arm64/kernel/acpi.c @@ -0,0 +1,345 @@ +/* + * ARM64 Specific Low-Level ACPI Boot Support + * + * Copyright (C) 2013-2014, Linaro Ltd. + * Author: Al Stone <al.stone@linaro.org> + * Author: Graeme Gregory <graeme.gregory@linaro.org> + * Author: Hanjun Guo <hanjun.guo@linaro.org> + * Author: Tomasz Nowicki <tomasz.nowicki@linaro.org> + * Author: Naresh Bhat <naresh.bhat@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "ACPI: " fmt + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/memblock.h> +#include <linux/of_fdt.h> +#include <linux/smp.h> + +#include <asm/cputype.h> +#include <asm/cpu_ops.h> +#include <asm/smp_plat.h> + +int acpi_noirq = 1; /* skip ACPI IRQ initialization */ +int acpi_disabled = 1; +EXPORT_SYMBOL(acpi_disabled); + +int acpi_pci_disabled = 1; /* skip ACPI PCI scan and IRQ initialization */ +EXPORT_SYMBOL(acpi_pci_disabled); + +/* Processors with enabled flag and sane MPIDR */ +static int enabled_cpus; + +/* Boot CPU is valid or not in MADT */ +static bool bootcpu_valid __initdata; + +static bool param_acpi_off __initdata; +static bool param_acpi_force __initdata; + +static int __init parse_acpi(char *arg) +{ + if (!arg) + return -EINVAL; + + /* "acpi=off" disables both ACPI table parsing and interpreter */ + if (strcmp(arg, "off") == 0) + param_acpi_off = true; + else if (strcmp(arg, "force") == 0) /* force ACPI to be enabled */ + param_acpi_force = true; + else + return -EINVAL; /* Core will print when we return error */ + + return 0; +} +early_param("acpi", parse_acpi); + +static int __init dt_scan_depth1_nodes(unsigned long node, + const char *uname, int depth, + void *data) +{ + /* + * Return 1 as soon as we encounter a node at depth 1 that is + * not the /chosen node. 
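+ * A short sketch of the intended use, matching the call in
+ * acpi_boot_table_init() below: of_scan_flat_dt() walks every node and
+ * stops at the first nonzero return, so
+ *
+ *	of_scan_flat_dt(dt_scan_depth1_nodes, NULL)
+ *
+ * is nonzero exactly when the flattened device tree carries payload
+ * beyond the /chosen node.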
+ */ + if (depth == 1 && (strcmp(uname, "chosen") != 0)) + return 1; + return 0; +} + +/* + * __acpi_map_table() will be called before paging_init(), so early_ioremap() + * or early_memremap() should be called here for ACPI table mapping. + */ +char *__init __acpi_map_table(unsigned long phys, unsigned long size) +{ + if (!size) + return NULL; + + return early_memremap(phys, size); +} + +void __init __acpi_unmap_table(char *map, unsigned long size) +{ + if (!map || !size) + return; + + early_memunmap(map, size); +} + +/** + * acpi_map_gic_cpu_interface - generates a logical cpu number + * and maps it to the MPIDR represented by the GICC structure + */ +static void __init +acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor) +{ + int i; + u64 mpidr = processor->arm_mpidr & MPIDR_HWID_BITMASK; + bool enabled = !!(processor->flags & ACPI_MADT_ENABLED); + + if (mpidr == INVALID_HWID) { + pr_info("Skip MADT cpu entry with invalid MPIDR\n"); + return; + } + + total_cpus++; + if (!enabled) + return; + + if (enabled_cpus >= NR_CPUS) { + pr_warn("NR_CPUS limit of %d reached, Processor %d/0x%llx ignored.\n", + NR_CPUS, total_cpus, mpidr); + return; + } + + /* Check if GICC structure of boot CPU is available in the MADT */ + if (cpu_logical_map(0) == mpidr) { + if (bootcpu_valid) { + pr_err("Firmware bug, duplicate CPU MPIDR: 0x%llx in MADT\n", + mpidr); + return; + } + + bootcpu_valid = true; + } + + /* + * Duplicate MPIDRs are a recipe for disaster. Scan + * all initialized entries and check for + * duplicates. If any is found just ignore the CPU. + */ + for (i = 1; i < enabled_cpus; i++) { + if (cpu_logical_map(i) == mpidr) { + pr_err("Firmware bug, duplicate CPU MPIDR: 0x%llx in MADT\n", + mpidr); + return; + } + } + + if (!acpi_psci_present()) + return; + + cpu_ops[enabled_cpus] = cpu_get_ops("psci"); + /* CPU 0 was already initialized */ + if (enabled_cpus) { + if (!cpu_ops[enabled_cpus]) + return; + + if (cpu_ops[enabled_cpus]->cpu_init(NULL, enabled_cpus)) + return; + + /* map the logical cpu id to cpu MPIDR */ + cpu_logical_map(enabled_cpus) = mpidr; + } + + enabled_cpus++; +} + +static int __init +acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *processor; + + processor = (struct acpi_madt_generic_interrupt *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + acpi_map_gic_cpu_interface(processor); + return 0; +} + +/* Parse GIC cpu interface entries in MADT for SMP init */ +void __init acpi_init_cpus(void) +{ + int count, i; + + /* + * do a partial walk of MADT to determine how many CPUs + * we have including disabled CPUs, and get information + * we need for SMP init + */ + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + acpi_parse_gic_cpu_interface, 0); + + if (!count) { + pr_err("No GIC CPU interface entries present\n"); + return; + } else if (count < 0) { + pr_err("Error parsing GIC CPU interface entry\n"); + return; + } + + if (!bootcpu_valid) { + pr_err("MADT missing boot CPU MPIDR, not enabling secondaries\n"); + return; + } + + for (i = 0; i < enabled_cpus; i++) + set_cpu_possible(i, true); + + /* Make boot-up look pretty */ + pr_info("%d CPUs enabled, %d CPUs total\n", enabled_cpus, total_cpus); +} + +/* + * acpi_fadt_sanity_check() - Check FADT presence and carry out sanity + * checks on it + * + * Return 0 on success, <0 on failure + */ +static int __init acpi_fadt_sanity_check(void) +{ + struct
acpi_table_header *table; + struct acpi_table_fadt *fadt; + acpi_status status; + acpi_size tbl_size; + int ret = 0; + + /* + * FADT is required on arm64; retrieve it to check its presence + * and carry out revision and ACPI HW reduced compliance tests + */ + status = acpi_get_table_with_size(ACPI_SIG_FADT, 0, &table, &tbl_size); + if (ACPI_FAILURE(status)) { + const char *msg = acpi_format_exception(status); + + pr_err("Failed to get FADT table, %s\n", msg); + return -ENODEV; + } + + fadt = (struct acpi_table_fadt *)table; + + /* + * Revision in table header is the FADT Major revision, and there + * is a minor revision of FADT which was introduced by ACPI 5.1; + * we only deal with ACPI 5.1 or newer revisions to get GIC and SMP + * boot protocol configuration data. + */ + if (table->revision < 5 || + (table->revision == 5 && fadt->minor_revision < 1)) { + pr_err("Unsupported FADT revision %d.%d, should be 5.1+\n", + table->revision, fadt->minor_revision); + ret = -EINVAL; + goto out; + } + + if (!(fadt->flags & ACPI_FADT_HW_REDUCED)) { + pr_err("FADT not ACPI hardware reduced compliant\n"); + ret = -EINVAL; + } + +out: + /* + * acpi_get_table_with_size() creates an FADT table mapping that + * should be released after parsing and before resuming boot + */ + early_acpi_os_unmap_memory(table, tbl_size); + return ret; +} + +/* + * acpi_boot_table_init() called from setup_arch(), always. + * 1. find RSDP and get its address, and then find XSDT + * 2. extract all tables and checksum them all + * 3. check ACPI FADT revision + * 4. check ACPI FADT HW reduced flag + * + * We can parse ACPI boot-time tables such as MADT after + * this function is called. + * + * On return ACPI is enabled if either: + * + * - ACPI tables are initialized and sanity checks passed + * - acpi=force was passed in the command line and ACPI was not disabled + * explicitly through acpi=off command line parameter + * + * ACPI is disabled on function return otherwise + */ +void __init acpi_boot_table_init(void) +{ + /* + * Enable ACPI instead of device tree unless + * - ACPI has been disabled explicitly (acpi=off), or + * - the device tree is not empty (it has more than just a /chosen node) + * and ACPI has not been force enabled (acpi=force) + */ + if (param_acpi_off || + (!param_acpi_force && of_scan_flat_dt(dt_scan_depth1_nodes, NULL))) + return; + + /* + * ACPI is disabled at this point. Enable it in order to parse + * the ACPI tables and carry out sanity checks + */ + enable_acpi(); + + /* + * If ACPI tables are initialized and FADT sanity checks passed, + * leave ACPI enabled and carry on booting; otherwise disable ACPI + * on initialization error. + * If acpi=force was passed on the command line it forces ACPI + * to be enabled even if its initialization failed.
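+ * The same decision, laid out as a table (derived from the checks
+ * below):
+ *
+ *	acpi=off  non-trivial DT  acpi=force  outcome
+ *	  yes           -             -       ACPI stays disabled
+ *	  no           yes           no       ACPI stays disabled, DT used
+ *	  no           yes           yes      ACPI tables are tried
+ *	  no           no             -       ACPI tables are tried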
+ */ + if (acpi_table_init() || acpi_fadt_sanity_check()) { + pr_err("Failed to init ACPI tables\n"); + if (!param_acpi_force) + disable_acpi(); + } +} + +void __init acpi_gic_init(void) +{ + struct acpi_table_header *table; + acpi_status status; + acpi_size tbl_size; + int err; + + if (acpi_disabled) + return; + + status = acpi_get_table_with_size(ACPI_SIG_MADT, 0, &table, &tbl_size); + if (ACPI_FAILURE(status)) { + const char *msg = acpi_format_exception(status); + + pr_err("Failed to get MADT table, %s\n", msg); + return; + } + + err = gic_v2_acpi_init(table); + if (err) + pr_err("Failed to initialize GIC IRQ controller"); + + early_acpi_os_unmap_memory((char *)table, tbl_size); +} diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index cce952440c64..fb8ff9ba467a 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -35,7 +35,7 @@ static const struct cpu_operations *supported_cpu_ops[] __initconst = { NULL, }; -static const struct cpu_operations * __init cpu_get_ops(const char *name) +const struct cpu_operations * __init cpu_get_ops(const char *name) { const struct cpu_operations **ops = supported_cpu_ops; diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 6f93c24ca801..4095379dc069 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -10,6 +10,7 @@ * */ +#include <linux/acpi.h> #include <linux/init.h> #include <linux/io.h> #include <linux/kernel.h> @@ -46,3 +47,27 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } + +/* + * raw_pci_read/write - Platform-specific PCI config space access. + */ +int raw_pci_read(unsigned int domain, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *val) +{ + return -ENXIO; +} + +int raw_pci_write(unsigned int domain, unsigned int bus, + unsigned int devfn, int reg, int len, u32 val) +{ + return -ENXIO; +} + +#ifdef CONFIG_ACPI +/* Root bridge scanning */ +struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) +{ + /* TODO: Should be revisited when implementing PCI on ACPI */ + return NULL; +} +#endif diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 9b8a70ae64a1..ea18cb53921e 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -15,6 +15,7 @@ #define pr_fmt(fmt) "psci: " fmt +#include <linux/acpi.h> #include <linux/init.h> #include <linux/of.h> #include <linux/smp.h> @@ -24,6 +25,7 @@ #include <linux/slab.h> #include <uapi/linux/psci.h> +#include <asm/acpi.h> #include <asm/compiler.h> #include <asm/cpu_ops.h> #include <asm/errno.h> @@ -273,39 +275,8 @@ static void psci_sys_poweroff(void) invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0); } -/* - * PSCI Function IDs for v0.2+ are well defined so use - * standard values. - */ -static int __init psci_0_2_init(struct device_node *np) +static void __init psci_0_2_set_functions(void) { - int err, ver; - - err = get_set_conduit_method(np); - - if (err) - goto out_put_node; - - ver = psci_get_version(); - - if (ver == PSCI_RET_NOT_SUPPORTED) { - /* PSCI v0.2 mandates implementation of PSCI_ID_VERSION. 
*/ - pr_err("PSCI firmware does not comply with the v0.2 spec.\n"); - err = -EOPNOTSUPP; - goto out_put_node; - } else { - pr_info("PSCIv%d.%d detected in firmware.\n", - PSCI_VERSION_MAJOR(ver), - PSCI_VERSION_MINOR(ver)); - - if (PSCI_VERSION_MAJOR(ver) == 0 && - PSCI_VERSION_MINOR(ver) < 2) { - err = -EINVAL; - pr_err("Conflicting PSCI version detected.\n"); - goto out_put_node; - } - } - pr_info("Using standard PSCI v0.2 function IDs\n"); psci_function_id[PSCI_FN_CPU_SUSPEND] = PSCI_0_2_FN64_CPU_SUSPEND; psci_ops.cpu_suspend = psci_cpu_suspend; @@ -329,6 +300,60 @@ static int __init psci_0_2_init(struct device_node *np) arm_pm_restart = psci_sys_reset; pm_power_off = psci_sys_poweroff; +} + +/* + * Probe function for PSCI firmware versions >= 0.2 + */ +static int __init psci_probe(void) +{ + int ver = psci_get_version(); + + if (ver == PSCI_RET_NOT_SUPPORTED) { + /* + * PSCI versions >=0.2 mandates implementation of + * PSCI_VERSION. + */ + pr_err("PSCI firmware does not comply with the v0.2 spec.\n"); + return -EOPNOTSUPP; + } else { + pr_info("PSCIv%d.%d detected in firmware.\n", + PSCI_VERSION_MAJOR(ver), + PSCI_VERSION_MINOR(ver)); + + if (PSCI_VERSION_MAJOR(ver) == 0 && + PSCI_VERSION_MINOR(ver) < 2) { + pr_err("Conflicting PSCI version detected.\n"); + return -EINVAL; + } + } + + psci_0_2_set_functions(); + + return 0; +} + +/* + * PSCI init function for PSCI versions >=0.2 + * + * Probe based on PSCI PSCI_VERSION function + */ +static int __init psci_0_2_init(struct device_node *np) +{ + int err; + + err = get_set_conduit_method(np); + + if (err) + goto out_put_node; + /* + * Starting with v0.2, the PSCI specification introduced a call + * (PSCI_VERSION) that allows probing the firmware version, so + * that PSCI function IDs and version specific initialization + * can be carried out according to the specific version reported + * by firmware + */ + err = psci_probe(); out_put_node: of_node_put(np); @@ -381,7 +406,7 @@ static const struct of_device_id psci_of_match[] __initconst = { {}, }; -int __init psci_init(void) +int __init psci_dt_init(void) { struct device_node *np; const struct of_device_id *matched_np; @@ -396,6 +421,27 @@ int __init psci_init(void) return init_fn(np); } +/* + * We use PSCI 0.2+ when ACPI is deployed on ARM64 and it's + * explicitly clarified in SBBR + */ +int __init psci_acpi_init(void) +{ + if (!acpi_psci_present()) { + pr_info("is not implemented in ACPI.\n"); + return -EOPNOTSUPP; + } + + pr_info("probing for conduit method from ACPI.\n"); + + if (acpi_psci_use_hvc()) + invoke_psci_fn = __invoke_psci_fn_hvc; + else + invoke_psci_fn = __invoke_psci_fn_smc; + + return psci_probe(); +} + #ifdef CONFIG_SMP static int __init cpu_psci_cpu_init(struct device_node *dn, unsigned int cpu) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 51ef97274b52..74753132c3ac 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -17,6 +17,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +#include <linux/acpi.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/stddef.h> @@ -46,6 +47,7 @@ #include <linux/efi.h> #include <linux/personality.h> +#include <asm/acpi.h> #include <asm/fixmap.h> #include <asm/cpu.h> #include <asm/cputype.h> @@ -395,18 +397,27 @@ void __init setup_arch(char **cmdline_p) efi_init(); arm64_memblock_init(); + /* Parse the ACPI tables for possible boot-time configuration */ + acpi_boot_table_init(); + paging_init(); request_standard_resources(); early_ioremap_reset(); - unflatten_device_tree(); - - psci_init(); + if (acpi_disabled) { + unflatten_device_tree(); + psci_dt_init(); + cpu_read_bootcpu_ops(); +#ifdef CONFIG_SMP + of_smp_init_cpus(); +#endif + } else { + psci_acpi_init(); + acpi_init_cpus(); + } - cpu_read_bootcpu_ops(); #ifdef CONFIG_SMP - smp_init_cpus(); smp_build_mpidr_hash(); #endif diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 714411f62391..2cb008177252 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -323,7 +323,7 @@ void __init smp_prepare_boot_cpu(void) * cpu logical map array containing MPIDR values related to logical * cpus. Assumes that cpu_logical_map(0) has already been initialized. */ -void __init smp_init_cpus(void) +void __init of_smp_init_cpus(void) { struct device_node *dn = NULL; unsigned int i, cpu = 1; diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 1a7125c3099b..42f9195cf2f8 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -35,6 +35,7 @@ #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/clk-provider.h> +#include <linux/acpi.h> #include <clocksource/arm_arch_timer.h> @@ -72,6 +73,12 @@ void __init time_init(void) tick_setup_hrtimer_broadcast(); + /* + * Since only one of ACPI or FDT will be available in the system, + * we can use acpi_generic_timer_init() here safely + */ + acpi_generic_timer_init(); + arch_timer_rate = arch_timer_get_rate(); if (!arch_timer_rate) panic("Unable to initialise architected timer.\n"); diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig index 383007877b2b..99c00d835f47 100644 --- a/arch/blackfin/configs/BF518F-EZBRD_defconfig +++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig @@ -48,7 +48,6 @@ CONFIG_IP_PNP=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_JEDECPROBE=m CONFIG_MTD_RAM=y diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig index cd0636bb24a0..cdeb51856f26 100644 --- a/arch/blackfin/configs/BF527-TLL6527M_defconfig +++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig @@ -67,7 +67,6 @@ CONFIG_BFIN_SIR0=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig index 16273a922056..ed7d2c096739 100644 --- a/arch/blackfin/configs/BF533-EZKIT_defconfig +++ b/arch/blackfin/configs/BF533-EZKIT_defconfig @@ -50,7 +50,6 @@ CONFIG_IRTTY_SIR=m CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y -CONFIG_MTD_CHAR=m CONFIG_MTD_BLOCK=y CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_AMDSTD=y diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig index 0df2f921f7e5..0c241f4d28d7 100644 ---
a/arch/blackfin/configs/BF533-STAMP_defconfig +++ b/arch/blackfin/configs/BF533-STAMP_defconfig @@ -50,7 +50,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=m CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=m CONFIG_MTD_CFI_AMDSTD=m diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig index 91d3eda42742..e5360b30e39a 100644 --- a/arch/blackfin/configs/BF537-STAMP_defconfig +++ b/arch/blackfin/configs/BF537-STAMP_defconfig @@ -55,13 +55,14 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=m CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=m CONFIG_MTD_CFI_AMDSTD=m CONFIG_MTD_RAM=y CONFIG_MTD_ROM=m CONFIG_MTD_PHYSMAP=m +CONFIG_MTD_M25P80=y +CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_RAM=y CONFIG_NETDEVICES=y CONFIG_NET_BFIN=y diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig index be03be6ba543..60f6fb86125c 100644 --- a/arch/blackfin/configs/BF538-EZKIT_defconfig +++ b/arch/blackfin/configs/BF538-EZKIT_defconfig @@ -60,7 +60,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=m CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=m CONFIG_MTD_CFI_AMDSTD=m diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig index 802f9c421621..78f6bc79f910 100644 --- a/arch/blackfin/configs/BF561-ACVILON_defconfig +++ b/arch/blackfin/configs/BF561-ACVILON_defconfig @@ -50,7 +50,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_PLATRAM=y CONFIG_MTD_PHRAM=y diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig index e2a2fa5935ce..fac8bb578249 100644 --- a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig @@ -52,7 +52,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig index 680730eeaf23..2a2e4d0cebc1 100644 --- a/arch/blackfin/configs/BF561-EZKIT_defconfig +++ b/arch/blackfin/configs/BF561-EZKIT_defconfig @@ -54,7 +54,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig index fcec5ce71392..ba4267f658af 100644 --- a/arch/blackfin/configs/BF609-EZKIT_defconfig +++ b/arch/blackfin/configs/BF609-EZKIT_defconfig @@ -105,6 +105,7 @@ CONFIG_SPI=y CONFIG_SPI_ADI_V3=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_MCP23S08=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_BFIN_WDT=y diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig index 05108b85ab12..1902bb05d086 100644 --- a/arch/blackfin/configs/CM-BF527_defconfig +++ b/arch/blackfin/configs/CM-BF527_defconfig @@ -55,7 +55,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y 
CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/configs/CM-BF533_defconfig b/arch/blackfin/configs/CM-BF533_defconfig index 5e0db82b679e..9a5716d57ebc 100644 --- a/arch/blackfin/configs/CM-BF533_defconfig +++ b/arch/blackfin/configs/CM-BF533_defconfig @@ -37,7 +37,6 @@ CONFIG_UNIX=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/configs/CM-BF537E_defconfig b/arch/blackfin/configs/CM-BF537E_defconfig index 2e47df77490f..684592884349 100644 --- a/arch/blackfin/configs/CM-BF537E_defconfig +++ b/arch/blackfin/configs/CM-BF537E_defconfig @@ -52,7 +52,6 @@ CONFIG_IP_PNP=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/configs/CM-BF537U_defconfig b/arch/blackfin/configs/CM-BF537U_defconfig index 6da629ffc2f1..d9915e984787 100644 --- a/arch/blackfin/configs/CM-BF537U_defconfig +++ b/arch/blackfin/configs/CM-BF537U_defconfig @@ -48,7 +48,6 @@ CONFIG_INET=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/configs/CM-BF548_defconfig b/arch/blackfin/configs/CM-BF548_defconfig index 9ff79df6825c..92d8130cdb51 100644 --- a/arch/blackfin/configs/CM-BF548_defconfig +++ b/arch/blackfin/configs/CM-BF548_defconfig @@ -54,7 +54,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/configs/CM-BF561_defconfig b/arch/blackfin/configs/CM-BF561_defconfig index d6dd98e67146..fa8d91132a57 100644 --- a/arch/blackfin/configs/CM-BF561_defconfig +++ b/arch/blackfin/configs/CM-BF561_defconfig @@ -52,7 +52,6 @@ CONFIG_INET=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/configs/DNP5370_defconfig b/arch/blackfin/configs/DNP5370_defconfig index 2b58cb221283..88600593c731 100644 --- a/arch/blackfin/configs/DNP5370_defconfig +++ b/arch/blackfin/configs/DNP5370_defconfig @@ -36,7 +36,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_DEBUG=y CONFIG_MTD_DEBUG_VERBOSE=1 -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_NFTL=y CONFIG_NFTL_RW=y diff --git a/arch/blackfin/configs/IP0X_defconfig b/arch/blackfin/configs/IP0X_defconfig index 5adf0da58499..9e3ae4b36d20 100644 --- a/arch/blackfin/configs/IP0X_defconfig +++ b/arch/blackfin/configs/IP0X_defconfig @@ -43,7 +43,6 @@ CONFIG_IP_NF_TARGET_REJECT=y CONFIG_IP_NF_MANGLE=y # CONFIG_WIRELESS is not set CONFIG_MTD=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig index a6a7298962ed..c7926812971c 100644 --- a/arch/blackfin/configs/PNAV-10_defconfig +++ b/arch/blackfin/configs/PNAV-10_defconfig @@ -46,7 +46,6 @@ CONFIG_IP_PNP=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y -CONFIG_MTD_CHAR=m CONFIG_MTD_BLOCK=y CONFIG_MTD_RAM=y CONFIG_MTD_COMPLEX_MAPPINGS=y diff --git a/arch/blackfin/configs/SRV1_defconfig 
b/arch/blackfin/configs/SRV1_defconfig index bc216646fe18..23fdc57d657a 100644 --- a/arch/blackfin/configs/SRV1_defconfig +++ b/arch/blackfin/configs/SRV1_defconfig @@ -38,7 +38,6 @@ CONFIG_IRTTY_SIR=m # CONFIG_WIRELESS is not set # CONFIG_FW_LOADER is not set CONFIG_MTD=y -CONFIG_MTD_CHAR=m CONFIG_MTD_BLOCK=y CONFIG_MTD_JEDECPROBE=m CONFIG_MTD_RAM=y diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig index ea88158ab432..e28959479fe0 100644 --- a/arch/blackfin/configs/TCM-BF518_defconfig +++ b/arch/blackfin/configs/TCM-BF518_defconfig @@ -55,7 +55,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_ADV_OPTIONS=y diff --git a/arch/blackfin/configs/TCM-BF537_defconfig b/arch/blackfin/configs/TCM-BF537_defconfig index c1f45f15295c..39e85cce95d7 100644 --- a/arch/blackfin/configs/TCM-BF537_defconfig +++ b/arch/blackfin/configs/TCM-BF537_defconfig @@ -44,7 +44,6 @@ CONFIG_INET=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y diff --git a/arch/blackfin/include/asm/io.h b/arch/blackfin/include/asm/io.h index dccae26805b0..4e8ad0523118 100644 --- a/arch/blackfin/include/asm/io.h +++ b/arch/blackfin/include/asm/io.h @@ -11,27 +11,12 @@ #include <linux/types.h> #include <asm/byteorder.h> -#define DECLARE_BFIN_RAW_READX(size, type, asm, asm_sign) \ -static inline type __raw_read##size(const volatile void __iomem *addr) \ -{ \ - unsigned int val; \ - int tmp; \ - __asm__ __volatile__ ( \ - "cli %1;" \ - "NOP; NOP; SSYNC;" \ - "%0 = "#asm" [%2] "#asm_sign";" \ - "sti %1;" \ - : "=d"(val), "=d"(tmp) \ - : "a"(addr) \ - ); \ - return (type) val; \ -} -DECLARE_BFIN_RAW_READX(b, u8, b, (z)) -#define __raw_readb __raw_readb -DECLARE_BFIN_RAW_READX(w, u16, w, (z)) -#define __raw_readw __raw_readw -DECLARE_BFIN_RAW_READX(l, u32, , ) -#define __raw_readl __raw_readl +#define __raw_readb bfin_read8 +#define __raw_readw bfin_read16 +#define __raw_readl bfin_read32 +#define __raw_writeb(val, addr) bfin_write8(addr, val) +#define __raw_writew(val, addr) bfin_write16(addr, val) +#define __raw_writel(val, addr) bfin_write32(addr, val) extern void outsb(unsigned long port, const void *addr, unsigned long count); extern void outsw(unsigned long port, const void *addr, unsigned long count); @@ -50,14 +35,6 @@ extern void insl_16(unsigned long port, void *addr, unsigned long count); #define insw insw #define insl insl -extern void dma_outsb(unsigned long port, const void *addr, unsigned short count); -extern void dma_outsw(unsigned long port, const void *addr, unsigned short count); -extern void dma_outsl(unsigned long port, const void *addr, unsigned short count); - -extern void dma_insb(unsigned long port, void *addr, unsigned short count); -extern void dma_insw(unsigned long port, void *addr, unsigned short count); -extern void dma_insl(unsigned long port, void *addr, unsigned short count); - /** * I/O write barrier * diff --git a/arch/blackfin/include/uapi/asm/unistd.h b/arch/blackfin/include/uapi/asm/unistd.h index a4511649a864..0cb9078ef482 100644 --- a/arch/blackfin/include/uapi/asm/unistd.h +++ b/arch/blackfin/include/uapi/asm/unistd.h @@ -401,8 +401,18 @@ #define __NR_sendmmsg 380 #define __NR_process_vm_readv 381 #define __NR_process_vm_writev 382 +#define __NR_kcmp 383 +#define __NR_finit_module 384 +#define 
__NR_sched_setattr 385 +#define __NR_sched_getattr 386 +#define __NR_renameat2 387 +#define __NR_seccomp 388 +#define __NR_getrandom 389 +#define __NR_memfd_create 390 +#define __NR_bpf 391 +#define __NR_execveat 392 -#define __NR_syscall 383 +#define __NR_syscall 393 /* For internal using, not implemented */ #define NR_syscalls __NR_syscall /* Old optional stuff no one actually uses */ diff --git a/arch/blackfin/kernel/debug-mmrs.c b/arch/blackfin/kernel/debug-mmrs.c index 947ad0832338..86b1cd3a0309 100644 --- a/arch/blackfin/kernel/debug-mmrs.c +++ b/arch/blackfin/kernel/debug-mmrs.c @@ -1620,7 +1620,6 @@ static int __init bfin_debug_mmrs_init(void) D16(USB_APHY_CNTRL); D16(USB_APHY_CALIB); D16(USB_APHY_CNTRL2); - D16(USB_PHY_TEST); D16(USB_PLLOSC_CTRL); D16(USB_SRP_CLKDIV); D16(USB_EP_NI0_TXMAXP); diff --git a/arch/blackfin/kernel/kgdb.c b/arch/blackfin/kernel/kgdb.c index fa53faeeb0e9..cf773f0f1f30 100644 --- a/arch/blackfin/kernel/kgdb.c +++ b/arch/blackfin/kernel/kgdb.c @@ -330,9 +330,6 @@ static void bfin_disable_hw_debug(struct pt_regs *regs) } #ifdef CONFIG_SMP -extern void generic_exec_single(int cpu, struct call_single_data *data, int wait); -static struct call_single_data kgdb_smp_ipi_data[NR_CPUS]; - void kgdb_passive_cpu_callback(void *info) { kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); @@ -343,15 +340,14 @@ void kgdb_roundup_cpus(unsigned long flags) unsigned int cpu; for (cpu = cpumask_first(cpu_online_mask); cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, cpu_online_mask)) { - kgdb_smp_ipi_data[cpu].func = kgdb_passive_cpu_callback; - generic_exec_single(cpu, &kgdb_smp_ipi_data[cpu], 0); - } + cpu = cpumask_next(cpu, cpu_online_mask)) + smp_call_function_single(cpu, kgdb_passive_cpu_callback, + NULL, 0); } void kgdb_roundup_cpu(int cpu, unsigned long flags) { - generic_exec_single(cpu, &kgdb_smp_ipi_data[cpu], 0); + smp_call_function_single(cpu, kgdb_passive_cpu_callback, NULL, 0); } #endif @@ -359,19 +355,6 @@ void kgdb_roundup_cpu(int cpu, unsigned long flags) static unsigned long kgdb_arch_imask; #endif -void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - if (kgdb_single_step) - preempt_enable(); - -#ifdef CONFIG_IPIPE - if (kgdb_arch_imask) { - cpu_pda[raw_smp_processor_id()].ex_imask = kgdb_arch_imask; - kgdb_arch_imask = 0; - } -#endif -} - int kgdb_arch_handle_exception(int vector, int signo, int err_code, char *remcom_in_buffer, char *remcom_out_buffer, diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c index 4f424ae3b36d..ad82468bd94d 100644 --- a/arch/blackfin/kernel/setup.c +++ b/arch/blackfin/kernel/setup.c @@ -1464,5 +1464,5 @@ void __init cmdline_init(const char *r0) { early_shadow_stamp(); if (r0) - strncpy(command_line, r0, COMMAND_LINE_SIZE); + strlcpy(command_line, r0, COMMAND_LINE_SIZE); } diff --git a/arch/blackfin/mach-bf527/include/mach/cdefBF525.h b/arch/blackfin/mach-bf527/include/mach/cdefBF525.h index d90a85b6b6b9..bd045318a250 100644 --- a/arch/blackfin/mach-bf527/include/mach/cdefBF525.h +++ b/arch/blackfin/mach-bf527/include/mach/cdefBF525.h @@ -122,11 +122,6 @@ #define bfin_read_USB_APHY_CNTRL2() bfin_read16(USB_APHY_CNTRL2) #define bfin_write_USB_APHY_CNTRL2(val) bfin_write16(USB_APHY_CNTRL2, val) -/* (PHY_TEST is for ADI usage only) */ - -#define bfin_read_USB_PHY_TEST() bfin_read16(USB_PHY_TEST) -#define bfin_write_USB_PHY_TEST(val) bfin_write16(USB_PHY_TEST, val) - #define bfin_read_USB_PLLOSC_CTRL() bfin_read16(USB_PLLOSC_CTRL) #define bfin_write_USB_PLLOSC_CTRL(val) 
bfin_write16(USB_PLLOSC_CTRL, val) #define bfin_read_USB_SRP_CLKDIV() bfin_read16(USB_SRP_CLKDIV) diff --git a/arch/blackfin/mach-bf527/include/mach/defBF525.h b/arch/blackfin/mach-bf527/include/mach/defBF525.h index 71578d964d00..591e00ff620a 100644 --- a/arch/blackfin/mach-bf527/include/mach/defBF525.h +++ b/arch/blackfin/mach-bf527/include/mach/defBF525.h @@ -77,10 +77,6 @@ #define USB_APHY_CNTRL2 0xffc039e8 /* Register used to prevent re-enumeration once Moab goes into hibernate mode */ -/* (PHY_TEST is for ADI usage only) */ - -#define USB_PHY_TEST 0xffc039ec /* Used for reducing simulation time and simplifies FIFO testability */ - #define USB_PLLOSC_CTRL 0xffc039f0 /* Used to program different parameters for USB PLL and Oscillator */ #define USB_SRP_CLKDIV 0xffc039f4 /* Used to program clock divide value for the clock fed to the SRP detection logic */ diff --git a/arch/blackfin/mach-bf548/include/mach/cdefBF542.h b/arch/blackfin/mach-bf548/include/mach/cdefBF542.h index d09c19cd1b7b..916347901d5a 100644 --- a/arch/blackfin/mach-bf548/include/mach/cdefBF542.h +++ b/arch/blackfin/mach-bf548/include/mach/cdefBF542.h @@ -241,10 +241,6 @@ #define bfin_read_USB_APHY_CNTRL2() bfin_read16(USB_APHY_CNTRL2) #define bfin_write_USB_APHY_CNTRL2(val) bfin_write16(USB_APHY_CNTRL2, val) -/* (PHY_TEST is for ADI usage only) */ - -#define bfin_read_USB_PHY_TEST() bfin_read16(USB_PHY_TEST) -#define bfin_write_USB_PHY_TEST(val) bfin_write16(USB_PHY_TEST, val) #define bfin_read_USB_PLLOSC_CTRL() bfin_read16(USB_PLLOSC_CTRL) #define bfin_write_USB_PLLOSC_CTRL(val) bfin_write16(USB_PLLOSC_CTRL, val) #define bfin_read_USB_SRP_CLKDIV() bfin_read16(USB_SRP_CLKDIV) diff --git a/arch/blackfin/mach-bf548/include/mach/cdefBF547.h b/arch/blackfin/mach-bf548/include/mach/cdefBF547.h index bcb9726dea54..be83f645bba8 100644 --- a/arch/blackfin/mach-bf548/include/mach/cdefBF547.h +++ b/arch/blackfin/mach-bf548/include/mach/cdefBF547.h @@ -408,10 +408,6 @@ #define bfin_read_USB_APHY_CNTRL2() bfin_read16(USB_APHY_CNTRL2) #define bfin_write_USB_APHY_CNTRL2(val) bfin_write16(USB_APHY_CNTRL2, val) -/* (PHY_TEST is for ADI usage only) */ - -#define bfin_read_USB_PHY_TEST() bfin_read16(USB_PHY_TEST) -#define bfin_write_USB_PHY_TEST(val) bfin_write16(USB_PHY_TEST, val) #define bfin_read_USB_PLLOSC_CTRL() bfin_read16(USB_PLLOSC_CTRL) #define bfin_write_USB_PLLOSC_CTRL(val) bfin_write16(USB_PLLOSC_CTRL, val) #define bfin_read_USB_SRP_CLKDIV() bfin_read16(USB_SRP_CLKDIV) diff --git a/arch/blackfin/mach-bf548/include/mach/defBF542.h b/arch/blackfin/mach-bf548/include/mach/defBF542.h index 51161575a163..ae4b889e3606 100644 --- a/arch/blackfin/mach-bf548/include/mach/defBF542.h +++ b/arch/blackfin/mach-bf548/include/mach/defBF542.h @@ -140,9 +140,6 @@ #define USB_APHY_CALIB 0xffc03de4 /* Register used to set some calibration values */ #define USB_APHY_CNTRL2 0xffc03de8 /* Register used to prevent re-enumeration once Moab goes into hibernate mode */ -/* (PHY_TEST is for ADI usage only) */ - -#define USB_PHY_TEST 0xffc03dec /* Used for reducing simulation time and simplifies FIFO testability */ #define USB_PLLOSC_CTRL 0xffc03df0 /* Used to program different parameters for USB PLL and Oscillator */ #define USB_SRP_CLKDIV 0xffc03df4 /* Used to program clock divide value for the clock fed to the SRP detection logic */ diff --git a/arch/blackfin/mach-bf548/include/mach/defBF547.h b/arch/blackfin/mach-bf548/include/mach/defBF547.h index d55dcc0f5324..7cc7928a3c73 100644 --- a/arch/blackfin/mach-bf548/include/mach/defBF547.h +++ 
diff --git a/arch/blackfin/mach-bf548/include/mach/defBF547.h b/arch/blackfin/mach-bf548/include/mach/defBF547.h
index d55dcc0f5324..7cc7928a3c73 100644
--- a/arch/blackfin/mach-bf548/include/mach/defBF547.h
+++ b/arch/blackfin/mach-bf548/include/mach/defBF547.h
@@ -254,9 +254,6 @@
 #define USB_APHY_CALIB		0xffc03de4	/* Register used to set some calibration values */
 #define USB_APHY_CNTRL2		0xffc03de8	/* Register used to prevent re-enumeration once Moab goes into hibernate mode */

-/* (PHY_TEST is for ADI usage only) */
-
-#define USB_PHY_TEST		0xffc03dec	/* Used for reducing simulation time and simplifies FIFO testability */
 #define USB_PLLOSC_CTRL		0xffc03df0	/* Used to program different parameters for USB PLL and Oscillator */
 #define USB_SRP_CLKDIV		0xffc03df4	/* Used to program clock divide value for the clock fed to the SRP detection logic */
diff --git a/arch/blackfin/mach-bf609/boards/ezkit.c b/arch/blackfin/mach-bf609/boards/ezkit.c
index 7f9fc272ec30..2c61fc0c98f9 100644
--- a/arch/blackfin/mach-bf609/boards/ezkit.c
+++ b/arch/blackfin/mach-bf609/boards/ezkit.c
@@ -780,8 +780,8 @@ static struct adi_spi3_chip spidev_chip_info = {
 };
 #endif

-#if IS_ENABLED(CONFIG_SND_BF5XX_I2S)
-static struct platform_device bfin_i2s_pcm = {
+#if IS_ENABLED(CONFIG_SND_BF6XX_PCM)
+static struct platform_device bfin_pcm = {
 	.name = "bfin-i2s-pcm-audio",
 	.id = -1,
 };
@@ -1034,7 +1034,6 @@ static struct adv7842_platform_data adv7842_data = {
 	.i2c_infoframe = 0x48,
 	.i2c_cec = 0x49,
 	.i2c_avlink = 0x4a,
-	.i2c_ex = 0x26,
 };

 static struct bfin_capture_config bfin_capture_data = {
@@ -1104,7 +1103,6 @@ static struct disp_route adv7511_routes[] = {

 static struct adv7511_platform_data adv7511_data = {
 	.edid_addr = 0x7e,
-	.i2c_ex = 0x25,
 };

 static struct bfin_display_config bfin_display_data = {
@@ -1209,6 +1207,35 @@ static struct platform_device bfin_display_device = {
 };
 #endif

+#if defined(CONFIG_FB_BF609_NL8048) \
+	|| defined(CONFIG_FB_BF609_NL8048_MODULE)
+static struct resource nl8048_resources[] = {
+	{
+		.start = EPPI2_STAT,
+		.end = EPPI2_STAT,
+		.flags = IORESOURCE_MEM,
+	},
+	{
+		.start = CH_EPPI2_CH0,
+		.end = CH_EPPI2_CH0,
+		.flags = IORESOURCE_DMA,
+	},
+	{
+		.start = IRQ_EPPI2_STAT,
+		.end = IRQ_EPPI2_STAT,
+		.flags = IORESOURCE_IRQ,
+	},
+};
+static struct platform_device bfin_fb_device = {
+	.name = "bf609_nl8048",
+	.num_resources = ARRAY_SIZE(nl8048_resources),
+	.resource = nl8048_resources,
+	.dev = {
+		.platform_data = (void *)GPIO_PC15,
+	},
+};
+#endif
+
 #if defined(CONFIG_BFIN_CRC)
 #define BFIN_CRC_NAME "bfin-crc"

@@ -1862,6 +1889,29 @@ static struct platform_device i2c_bfin_twi1_device = {
 };
 #endif

+#if IS_ENABLED(CONFIG_GPIO_MCP23S08)
+#include <linux/spi/mcp23s08.h>
+static const struct mcp23s08_platform_data bfin_mcp23s08_soft_switch0 = {
+	.base = 120,
+};
+static const struct mcp23s08_platform_data bfin_mcp23s08_soft_switch1 = {
+	.base = 130,
+};
+static const struct mcp23s08_platform_data bfin_mcp23s08_soft_switch2 = {
+	.base = 140,
+};
+# if IS_ENABLED(CONFIG_VIDEO_ADV7842)
+static const struct mcp23s08_platform_data bfin_adv7842_soft_switch = {
+	.base = 150,
+};
+# endif
+# if IS_ENABLED(CONFIG_VIDEO_ADV7511) || IS_ENABLED(CONFIG_VIDEO_ADV7343)
+static const struct mcp23s08_platform_data bfin_adv7511_soft_switch = {
+	.base = 160,
+};
+# endif
+#endif
+
 static struct i2c_board_info __initdata bfin_i2c_board_info0[] = {
 #if IS_ENABLED(CONFIG_INPUT_ADXL34X_I2C)
 	{
@@ -1881,6 +1931,32 @@ static struct i2c_board_info __initdata bfin_i2c_board_info0[] = {
 		I2C_BOARD_INFO("ssm2602", 0x1b),
 	},
 #endif
+#if IS_ENABLED(CONFIG_GPIO_MCP23S08)
+	{
+		I2C_BOARD_INFO("mcp23017", 0x21),
+		.platform_data = (void *)&bfin_mcp23s08_soft_switch0
+	},
+	{
+		I2C_BOARD_INFO("mcp23017", 0x22),
+		.platform_data = (void *)&bfin_mcp23s08_soft_switch1
+	},
+	{
+		I2C_BOARD_INFO("mcp23017", 0x23),
+		.platform_data = (void *)&bfin_mcp23s08_soft_switch2
+	},
+# if IS_ENABLED(CONFIG_VIDEO_ADV7842)
+	{
+		I2C_BOARD_INFO("mcp23017", 0x26),
+		.platform_data = (void *)&bfin_adv7842_soft_switch
+	},
+# endif
+# if IS_ENABLED(CONFIG_VIDEO_ADV7511) || IS_ENABLED(CONFIG_VIDEO_ADV7343)
+	{
+		I2C_BOARD_INFO("mcp23017", 0x25),
+		.platform_data = (void *)&bfin_adv7511_soft_switch
+	},
+# endif
+#endif
 };

 static struct i2c_board_info __initdata bfin_i2c_board_info1[] = {
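In the mcp23s08 platform data above, .base anchors each I2C GPIO expander in the global gpiolib number space: pin N of the expander registered with .base = 120 shows up as global GPIO 120 + N. Once the mcp23017 devices probe, board or driver code can drive a soft switch through the ordinary legacy gpiolib calls; schematically (an assumed consumer, not part of the patch):

	#include <linux/gpio.h>

	static int drive_soft_switch(void)
	{
		unsigned int gpio = 120 + 3;	/* pin 3 of the expander at .base = 120 */
		int err;

		err = gpio_request(gpio, "soft-switch");
		if (err)
			return err;
		return gpio_direction_output(gpio, 1);
	}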
@@ -2023,8 +2099,8 @@ static struct platform_device *ezkit_devices[] __initdata = {
 #if IS_ENABLED(CONFIG_MTD_PHYSMAP)
 	&ezkit_flash_device,
 #endif
-#if IS_ENABLED(CONFIG_SND_BF5XX_I2S)
-	&bfin_i2s_pcm,
+#if IS_ENABLED(CONFIG_SND_BF6XX_PCM)
+	&bfin_pcm,
 #endif
 #if IS_ENABLED(CONFIG_SND_BF6XX_SOC_I2S)
 	&bfin_i2s,
@@ -2060,7 +2136,7 @@ static struct pinctrl_map __initdata bfin_pinmux_map[] = {
 	PIN_MAP_MUX_GROUP_DEFAULT("bfin-rotary", "pinctrl-adi2.0", NULL, "rotary"),
 	PIN_MAP_MUX_GROUP_DEFAULT("bfin_can.0", "pinctrl-adi2.0", NULL, "can0"),
 	PIN_MAP_MUX_GROUP_DEFAULT("physmap-flash.0", "pinctrl-adi2.0", NULL, "smc0"),
-	PIN_MAP_MUX_GROUP_DEFAULT("bf609_nl8048.2", "pinctrl-adi2.0", "ppi2_16bgrp", "ppi2"),
+	PIN_MAP_MUX_GROUP_DEFAULT("bf609_nl8048.0", "pinctrl-adi2.0", "ppi2_16bgrp", "ppi2"),
 	PIN_MAP_MUX_GROUP("bfin_display.0", "8bit", "pinctrl-adi2.0", "ppi2_8bgrp", "ppi2"),
 	PIN_MAP_MUX_GROUP_DEFAULT("bfin_display.0", "pinctrl-adi2.0", "ppi2_16bgrp", "ppi2"),
 	PIN_MAP_MUX_GROUP("bfin_display.0", "16bit", "pinctrl-adi2.0", "ppi2_16bgrp", "ppi2"),
diff --git a/arch/blackfin/mach-bf609/clock.c b/arch/blackfin/mach-bf609/clock.c
index 244fa4ab4c56..378305844b2c 100644
--- a/arch/blackfin/mach-bf609/clock.c
+++ b/arch/blackfin/mach-bf609/clock.c
@@ -363,6 +363,12 @@ static struct clk ethclk = {
 	.ops = &dummy_clk_ops,
 };

+static struct clk ethpclk = {
+	.name = "pclk",
+	.parent = &sclk0,
+	.ops = &dummy_clk_ops,
+};
+
 static struct clk spiclk = {
 	.name = "spi",
 	.parent = &sclk1,
@@ -381,6 +387,7 @@ static struct clk_lookup bf609_clks[] = {
 	CLK(dclk, NULL, "DCLK"),
 	CLK(oclk, NULL, "OCLK"),
 	CLK(ethclk, NULL, "stmmaceth"),
+	CLK(ethpclk, NULL, "pclk"),
 	CLK(spiclk, NULL, "spi"),
 };
diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S
index 86b5a095c5a1..8d9431e22e8c 100644
--- a/arch/blackfin/mach-common/entry.S
+++ b/arch/blackfin/mach-common/entry.S
@@ -1694,6 +1694,16 @@ ENTRY(_sys_call_table)
 	.long _sys_sendmmsg		/* 380 */
 	.long _sys_process_vm_readv
 	.long _sys_process_vm_writev
+	.long _sys_kcmp
+	.long _sys_finit_module
+	.long _sys_sched_setattr	/* 385 */
+	.long _sys_sched_getattr
+	.long _sys_renameat2
+	.long _sys_seccomp
+	.long _sys_getrandom
+	.long _sys_memfd_create		/* 390 */
+	.long _sys_bpf
+	.long _sys_execveat

 	.rept NR_syscalls-(.-_sys_call_table)/4
 	.long _sys_ni_syscall
diff --git a/arch/blackfin/mach-common/pm.c b/arch/blackfin/mach-common/pm.c
index 1387a94bcfd5..a66d979ec651 100644
--- a/arch/blackfin/mach-common/pm.c
+++ b/arch/blackfin/mach-common/pm.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/irq.h>
+#include <linux/delay.h>

 #include <asm/cplb.h>
 #include <asm/gpio.h>
@@ -180,6 +181,7 @@ int bfin_pm_suspend_mem_enter(void)
 #if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK)
 	flushinv_all_dcache();
+	udelay(1);
 #endif

 	_disable_dcplb();
 	_disable_icplb();
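The clock.c hunk above adds a second fixed lookup, "pclk", parented on SCLK0, so the stmmac Ethernet driver can resolve its peripheral bus clock by connection id. On the consumer side that is the usual clkdev lookup; a rough sketch of the pattern (hypothetical consumer code; this platform predates the common clock framework, hence plain clk_enable()):

	#include <linux/clk.h>
	#include <linux/device.h>
	#include <linux/err.h>

	static int eth_clk_setup(struct device *dev)
	{
		struct clk *pclk;

		/* Matches CLK(ethpclk, NULL, "pclk") in the bf609 table above. */
		pclk = clk_get(dev, "pclk");
		if (IS_ERR(pclk))
			return PTR_ERR(pclk);

		return clk_enable(pclk);
	}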
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 4a03911053ab..0314e325a669 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -46,12 +46,18 @@ config CRIS
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IOMAP
-	select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32
 	select GENERIC_CMOS_UPDATE
 	select MODULES_USE_ELF_RELA
 	select CLONE_BACKWARDS2
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
+	select ARCH_REQUIRE_GPIOLIB
+	select IRQ_DOMAIN if ETRAX_ARCH_V32
+	select OF if ETRAX_ARCH_V32
+	select OF_EARLY_FLATTREE if ETRAX_ARCH_V32
+	select CLKSRC_MMIO if ETRAX_ARCH_V32
+	select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
+	select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32

 config HZ
 	int
@@ -61,6 +67,10 @@ config NR_CPUS
 	int
 	default "1"

+config BUILTIN_DTB
+	string "DTB to build into the kernel image"
+	depends on OF
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
diff --git a/arch/cris/Makefile b/arch/cris/Makefile
index 39dc7d00083e..4a5404b3d0e4 100644
--- a/arch/cris/Makefile
+++ b/arch/cris/Makefile
@@ -40,6 +40,10 @@ else
 MACH :=
 endif

+ifneq ($(CONFIG_BUILTIN_DTB),"")
+core-$(CONFIG_OF) += arch/cris/boot/dts/
+endif
+
 LD = $(CROSS_COMPILE)ld -mcrislinux

 OBJCOPYFLAGS := -O binary -R .note -R .comment -S
diff --git a/arch/cris/arch-v32/kernel/Makefile b/arch/cris/arch-v32/kernel/Makefile
index 40358355d0cb..d9fc617ea253 100644
--- a/arch/cris/arch-v32/kernel/Makefile
+++ b/arch/cris/arch-v32/kernel/Makefile
@@ -9,7 +9,6 @@ obj-y := entry.o traps.o irq.o debugport.o \
 	   process.o ptrace.o setup.o signal.o traps.o time.o \
 	   cache.o cacheflush.o

-obj-$(CONFIG_SMP) += smp.o
 obj-$(CONFIG_ETRAX_KGDB) += kgdb.o kgdb_asm.o
 obj-$(CONFIG_ETRAX_FAST_TIMER) += fasttimer.o
 obj-$(CONFIG_MODULES) += crisksyms.o
diff --git a/arch/cris/arch-v32/kernel/entry.S b/arch/cris/arch-v32/kernel/entry.S
index 2f19ac6217aa..026a0b21b8f0 100644
--- a/arch/cris/arch-v32/kernel/entry.S
+++ b/arch/cris/arch-v32/kernel/entry.S
@@ -99,6 +99,8 @@ ret_from_kernel_thread:
 	.type	ret_from_intr,@function
 ret_from_intr:
+	moveq	0, $r9		; not a syscall
+
 	;; Check for resched if preemptive kernel, or if we're going back to
 	;; user-mode. This test matches the user_regs(regs) macro. Don't simply
 	;; test CCS since that doesn't necessarily reflect what mode we'll
@@ -145,7 +147,7 @@ system_call:
 	;; Stack-frame similar to the irq heads, which is reversed in
 	;; ret_from_sys_call.
-	sub.d	92, $sp		; Skip EXS and EDA.
+	sub.d	92, $sp		; Skip EDA.
 	movem	$r13, [$sp]
 	move.d	$sp, $r8
 	addq	14*4, $r8
@@ -156,8 +158,9 @@ system_call:
 	move	$ccs, $r4
 	move	$srp, $r5
 	move	$erp, $r6
+	move.d	$r9, $r7	; Store syscall number in EXS
 	subq	4, $sp
-	movem	$r6, [$r8]
+	movem	$r7, [$r8]
 	ei			; Enable interrupts while processing syscalls.
 	move.d	$r10, [$sp]
@@ -278,43 +281,14 @@ _syscall_exit_work:
 	.type	_work_pending,@function
 _work_pending:
 	addoq	+TI_flags, $r0, $acr
-	move.d	[$acr], $r10
-	btstq	TIF_NEED_RESCHED, $r10	; Need resched?
-	bpl	_work_notifysig		; No, must be signal/notify.
-	nop
-	.size	_work_pending, . - _work_pending
-
-	.type	_work_resched,@function
-_work_resched:
-	move.d	$r9, $r1		; Preserve R9.
-	jsr	schedule
-	nop
-	move.d	$r1, $r9
-	di
-
-	addoq	+TI_flags, $r0, $acr
-	move.d	[$acr], $r1
-	and.d	_TIF_WORK_MASK, $r1	; Ignore sycall trace counter.
-	beq	_Rexit
-	nop
-	btstq	TIF_NEED_RESCHED, $r1
-	bmi	_work_resched		; current->work.need_resched.
-	nop
-	.size	_work_resched, . - _work_resched
-
-	.type	_work_notifysig,@function
-_work_notifysig:
-	;; Deal with pending signals and notify-resume requests.
-
-	addoq	+TI_flags, $r0, $acr
 	move.d	[$acr], $r12		; The thread_info_flags parameter.
move.d $sp, $r11 ; The regs param. - jsr do_notify_resume - move.d $r9, $r10 ; do_notify_resume syscall/irq param. + jsr do_work_pending + move.d $r9, $r10 ; The syscall/irq param. ba _Rexit nop - .size _work_notifysig, . - _work_notifysig + .size _work_pending, . - _work_pending ;; We get here as a sidetrack when we've entered a syscall with the ;; trace-bit set. We need to call do_syscall_trace and then continue diff --git a/arch/cris/arch-v32/kernel/head.S b/arch/cris/arch-v32/kernel/head.S index 51e34165ece7..74a66e0e3777 100644 --- a/arch/cris/arch-v32/kernel/head.S +++ b/arch/cris/arch-v32/kernel/head.S @@ -52,11 +52,6 @@ tstart: GIO_INIT -#ifdef CONFIG_SMP -secondary_cpu_entry: /* Entry point for secondary CPUs */ - di -#endif - ;; Setup and enable the MMU. Use same configuration for both the data ;; and the instruction MMU. ;; @@ -164,33 +159,6 @@ secondary_cpu_entry: /* Entry point for secondary CPUs */ nop nop -#ifdef CONFIG_SMP - ;; Read CPU ID - move 0, $srs - nop - nop - nop - move $s12, $r0 - cmpq 0, $r0 - beq master_cpu - nop -slave_cpu: - ; Time to boot-up. Get stack location provided by master CPU. - move.d smp_init_current_idle_thread, $r1 - move.d [$r1], $sp - add.d 8192, $sp - move.d ebp_start, $r0 ; Defined in linker-script. - move $r0, $ebp - jsr smp_callin - nop -master_cpu: - /* Set up entry point for secondary CPUs. The boot ROM has set up - * EBP at start of internal memory. The CPU will get there - * later when we issue an IPI to them... */ - move.d MEM_INTMEM_START + IPI_INTR_VECT * 4, $r0 - move.d secondary_cpu_entry, $r1 - move.d $r1, [$r0] -#endif ; Check if starting from DRAM (network->RAM boot or unpacked ; compressed kernel), or directly from flash. lapcq ., $r0 diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c index 25437ae28128..6a881e0e92b4 100644 --- a/arch/cris/arch-v32/kernel/irq.c +++ b/arch/cris/arch-v32/kernel/irq.c @@ -10,6 +10,8 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/profile.h> +#include <linux/of.h> +#include <linux/of_irq.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/threads.h> @@ -56,9 +58,6 @@ struct cris_irq_allocation irq_allocations[NR_REAL_IRQS] = static unsigned long irq_regs[NR_CPUS] = { regi_irq, -#ifdef CONFIG_SMP - regi_irq2, -#endif }; #if NR_REAL_IRQS > 32 @@ -431,6 +430,19 @@ crisv32_do_multiple(struct pt_regs* regs) irq_exit(); } +static int crisv32_irq_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw_irq_num) +{ + irq_set_chip_and_handler(virq, &crisv32_irq_type, handle_simple_irq); + + return 0; +} + +static struct irq_domain_ops crisv32_irq_ops = { + .map = crisv32_irq_map, + .xlate = irq_domain_xlate_onecell, +}; + /* * This is called by start_kernel. It fixes the IRQ masks and setup the * interrupt vector table to point to bad_interrupt pointers. @@ -441,6 +453,8 @@ init_IRQ(void) int i; int j; reg_intr_vect_rw_mask vect_mask = {0}; + struct device_node *np; + struct irq_domain *domain; /* Clear all interrupts masks. */ for (i = 0; i < NBR_REGS; i++) @@ -449,10 +463,15 @@ init_IRQ(void) for (i = 0; i < 256; i++) etrax_irv->v[i] = weird_irq; - /* Point all IRQ's to bad handlers. 
*/ + np = of_find_compatible_node(NULL, NULL, "axis,crisv32-intc"); + domain = irq_domain_add_legacy(np, NR_IRQS - FIRST_IRQ, + FIRST_IRQ, FIRST_IRQ, + &crisv32_irq_ops, NULL); + BUG_ON(!domain); + irq_set_default_host(domain); + of_node_put(np); + for (i = FIRST_IRQ, j = 0; j < NR_IRQS; i++, j++) { - irq_set_chip_and_handler(j, &crisv32_irq_type, - handle_simple_irq); set_exception_vector(i, interrupt[j]); } diff --git a/arch/cris/arch-v32/kernel/setup.c b/arch/cris/arch-v32/kernel/setup.c index 81715c683baf..cd1865d68b2e 100644 --- a/arch/cris/arch-v32/kernel/setup.c +++ b/arch/cris/arch-v32/kernel/setup.c @@ -63,11 +63,6 @@ int show_cpuinfo(struct seq_file *m, void *v) info = &cpinfo[ARRAY_SIZE(cpinfo) - 1]; -#ifdef CONFIG_SMP - if (!cpu_online(cpu)) - return 0; -#endif - revision = rdvr(); for (i = 0; i < ARRAY_SIZE(cpinfo); i++) { diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 0c9ce9eac614..3a36ae6b79d5 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -72,6 +72,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) /* Make that the user-mode flag is set. */ regs->ccs |= (1 << (U_CCS_BITNR + CCS_SHIFT)); + /* Don't perform syscall restarting */ + regs->exs = -1; + /* Restore the old USP. */ err |= __get_user(old_usp, &sc->usp); wrusp(old_usp); @@ -425,6 +428,8 @@ do_signal(int canrestart, struct pt_regs *regs) { struct ksignal ksig; + canrestart = canrestart && ((int)regs->exs >= 0); + /* * The common case should go fast, which is why this point is * reached from kernel-mode. If that's the case, just return diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c deleted file mode 100644 index 0698582467ca..000000000000 --- a/arch/cris/arch-v32/kernel/smp.c +++ /dev/null @@ -1,358 +0,0 @@ -#include <linux/types.h> -#include <asm/delay.h> -#include <irq.h> -#include <hwregs/intr_vect.h> -#include <hwregs/intr_vect_defs.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <hwregs/asm/mmu_defs_asm.h> -#include <hwregs/supp_reg.h> -#include <linux/atomic.h> - -#include <linux/err.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/cpumask.h> -#include <linux/interrupt.h> -#include <linux/module.h> - -#define IPI_SCHEDULE 1 -#define IPI_CALL 2 -#define IPI_FLUSH_TLB 4 -#define IPI_BOOT 8 - -#define FLUSH_ALL (void*)0xffffffff - -/* Vector of locks used for various atomic operations */ -spinlock_t cris_atomic_locks[] = { - [0 ... 
LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(cris_atomic_locks) -}; - -/* CPU masks */ -cpumask_t phys_cpu_present_map = CPU_MASK_NONE; -EXPORT_SYMBOL(phys_cpu_present_map); - -/* Variables used during SMP boot */ -volatile int cpu_now_booting = 0; -volatile struct thread_info *smp_init_current_idle_thread; - -/* Variables used during IPI */ -static DEFINE_SPINLOCK(call_lock); -static DEFINE_SPINLOCK(tlbstate_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - int wait; -}; - -static struct call_data_struct * call_data; - -static struct mm_struct* flush_mm; -static struct vm_area_struct* flush_vma; -static unsigned long flush_addr; - -/* Mode registers */ -static unsigned long irq_regs[NR_CPUS] = { - regi_irq, - regi_irq2 -}; - -static irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id); -static int send_ipi(int vector, int wait, cpumask_t cpu_mask); -static struct irqaction irq_ipi = { - .handler = crisv32_ipi_interrupt, - .flags = 0, - .name = "ipi", -}; - -extern void cris_mmu_init(void); -extern void cris_timer_init(void); - -/* SMP initialization */ -void __init smp_prepare_cpus(unsigned int max_cpus) -{ - int i; - - /* From now on we can expect IPIs so set them up */ - setup_irq(IPI_INTR_VECT, &irq_ipi); - - /* Mark all possible CPUs as present */ - for (i = 0; i < max_cpus; i++) - cpumask_set_cpu(i, &phys_cpu_present_map); -} - -void smp_prepare_boot_cpu(void) -{ - /* PGD pointer has moved after per_cpu initialization so - * update the MMU. - */ - pgd_t **pgd; - pgd = (pgd_t**)&per_cpu(current_pgd, smp_processor_id()); - - SUPP_BANK_SEL(1); - SUPP_REG_WR(RW_MM_TLB_PGD, pgd); - SUPP_BANK_SEL(2); - SUPP_REG_WR(RW_MM_TLB_PGD, pgd); - - set_cpu_online(0, true); - cpumask_set_cpu(0, &phys_cpu_present_map); - set_cpu_possible(0, true); -} - -void __init smp_cpus_done(unsigned int max_cpus) -{ -} - -/* Bring one cpu online.*/ -static int __init -smp_boot_one_cpu(int cpuid, struct task_struct idle) -{ - unsigned timeout; - cpumask_t cpu_mask; - - cpumask_clear(&cpu_mask); - task_thread_info(idle)->cpu = cpuid; - - /* Information to the CPU that is about to boot */ - smp_init_current_idle_thread = task_thread_info(idle); - cpu_now_booting = cpuid; - - /* Kick it */ - set_cpu_online(cpuid, true); - cpumask_set_cpu(cpuid, &cpu_mask); - send_ipi(IPI_BOOT, 0, cpu_mask); - set_cpu_online(cpuid, false); - - /* Wait for CPU to come online */ - for (timeout = 0; timeout < 10000; timeout++) { - if(cpu_online(cpuid)) { - cpu_now_booting = 0; - smp_init_current_idle_thread = NULL; - return 0; /* CPU online */ - } - udelay(100); - barrier(); - } - - printk(KERN_CRIT "SMP: CPU:%d is stuck.\n", cpuid); - return -1; -} - -/* Secondary CPUs starts using C here. Here we need to setup CPU - * specific stuff such as the local timer and the MMU. */ -void __init smp_callin(void) -{ - int cpu = cpu_now_booting; - reg_intr_vect_rw_mask vect_mask = {0}; - - /* Initialise the idle task for this CPU */ - atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; - - /* Set up MMU */ - cris_mmu_init(); - __flush_tlb_all(); - - /* Setup local timer. 
*/ - cris_timer_init(); - - /* Enable IRQ and idle */ - REG_WR(intr_vect, irq_regs[cpu], rw_mask, vect_mask); - crisv32_unmask_irq(IPI_INTR_VECT); - crisv32_unmask_irq(TIMER0_INTR_VECT); - preempt_disable(); - notify_cpu_starting(cpu); - local_irq_enable(); - - set_cpu_online(cpu, true); - cpu_startup_entry(CPUHP_ONLINE); -} - -/* Stop execution on this CPU.*/ -void stop_this_cpu(void* dummy) -{ - local_irq_disable(); - asm volatile("halt"); -} - -/* Other calls */ -void smp_send_stop(void) -{ - smp_call_function(stop_this_cpu, NULL, 0); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - - -/* cache_decay_ticks is used by the scheduler to decide if a process - * is "hot" on one CPU. A higher value means a higher penalty to move - * a process to another CPU. Our cache is rather small so we report - * 1 tick. - */ -unsigned long cache_decay_ticks = 1; - -int __cpu_up(unsigned int cpu, struct task_struct *tidle) -{ - smp_boot_one_cpu(cpu, tidle); - return cpu_online(cpu) ? 0 : -ENOSYS; -} - -void smp_send_reschedule(int cpu) -{ - cpumask_t cpu_mask; - cpumask_clear(&cpu_mask); - cpumask_set_cpu(cpu, &cpu_mask); - send_ipi(IPI_SCHEDULE, 0, cpu_mask); -} - -/* TLB flushing - * - * Flush needs to be done on the local CPU and on any other CPU that - * may have the same mapping. The mm->cpu_vm_mask is used to keep track - * of which CPUs that a specific process has been executed on. - */ -void flush_tlb_common(struct mm_struct* mm, struct vm_area_struct* vma, unsigned long addr) -{ - unsigned long flags; - cpumask_t cpu_mask; - - spin_lock_irqsave(&tlbstate_lock, flags); - cpu_mask = (mm == FLUSH_ALL ? cpu_all_mask : *mm_cpumask(mm)); - cpumask_clear_cpu(smp_processor_id(), &cpu_mask); - flush_mm = mm; - flush_vma = vma; - flush_addr = addr; - send_ipi(IPI_FLUSH_TLB, 1, cpu_mask); - spin_unlock_irqrestore(&tlbstate_lock, flags); -} - -void flush_tlb_all(void) -{ - __flush_tlb_all(); - flush_tlb_common(FLUSH_ALL, FLUSH_ALL, 0); -} - -void flush_tlb_mm(struct mm_struct *mm) -{ - __flush_tlb_mm(mm); - flush_tlb_common(mm, FLUSH_ALL, 0); - /* No more mappings in other CPUs */ - cpumask_clear(mm_cpumask(mm)); - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); -} - -void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - __flush_tlb_page(vma, addr); - flush_tlb_common(vma->vm_mm, vma, addr); -} - -/* Inter processor interrupts - * - * The IPIs are used for: - * * Force a schedule on a CPU - * * FLush TLB on other CPUs - * * Call a function on other CPUs - */ - -int send_ipi(int vector, int wait, cpumask_t cpu_mask) -{ - int i = 0; - reg_intr_vect_rw_ipi ipi = REG_RD(intr_vect, irq_regs[i], rw_ipi); - int ret = 0; - - /* Calculate CPUs to send to. */ - cpumask_and(&cpu_mask, &cpu_mask, cpu_online_mask); - - /* Send the IPI. */ - for_each_cpu(i, &cpu_mask) - { - ipi.vector |= vector; - REG_WR(intr_vect, irq_regs[i], rw_ipi, ipi); - } - - /* Wait for IPI to finish on other CPUS */ - if (wait) { - for_each_cpu(i, &cpu_mask) { - int j; - for (j = 0 ; j < 1000; j++) { - ipi = REG_RD(intr_vect, irq_regs[i], rw_ipi); - if (!ipi.vector) - break; - udelay(100); - } - - /* Timeout? */ - if (ipi.vector) { - printk("SMP call timeout from %d to %d\n", smp_processor_id(), i); - ret = -ETIMEDOUT; - dump_stack(); - } - } - } - return ret; -} - -/* - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. 
- */ -int smp_call_function(void (*func)(void *info), void *info, int wait) -{ - cpumask_t cpu_mask; - struct call_data_struct data; - int ret; - - cpumask_setall(&cpu_mask); - cpumask_clear_cpu(smp_processor_id(), &cpu_mask); - - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - data.wait = wait; - - spin_lock(&call_lock); - call_data = &data; - ret = send_ipi(IPI_CALL, wait, cpu_mask); - spin_unlock(&call_lock); - - return ret; -} - -irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - reg_intr_vect_rw_ipi ipi; - - ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); - - if (ipi.vector & IPI_SCHEDULE) { - scheduler_ipi(); - } - if (ipi.vector & IPI_CALL) { - func(info); - } - if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) - __flush_tlb_mm(flush_mm); - else - __flush_tlb_page(flush_vma, flush_addr); - } - - ipi.vector = 0; - REG_WR(intr_vect, irq_regs[smp_processor_id()], rw_ipi, ipi); - - return IRQ_HANDLED; -} - diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c index c17b01abdc3b..4fce9f1f7cc0 100644 --- a/arch/cris/arch-v32/kernel/time.c +++ b/arch/cris/arch-v32/kernel/time.c @@ -8,12 +8,14 @@ #include <linux/timex.h> #include <linux/time.h> #include <linux/clocksource.h> +#include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/swap.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/threads.h> #include <linux/cpufreq.h> +#include <linux/sched_clock.h> #include <linux/mm.h> #include <asm/types.h> #include <asm/signal.h> @@ -36,33 +38,11 @@ /* Number of 763 counts before watchdog bites */ #define ETRAX_WD_CNT ((2*ETRAX_WD_HZ)/HZ + 1) -/* Register the continuos readonly timer available in FS and ARTPEC-3. */ -static cycle_t read_cont_rotime(struct clocksource *cs) -{ - return (u32)REG_RD(timer, regi_timer0, r_time); -} - -static struct clocksource cont_rotime = { - .name = "crisv32_rotime", - .rating = 300, - .read = read_cont_rotime, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static int __init etrax_init_cont_rotime(void) -{ - clocksource_register_khz(&cont_rotime, 100000); - return 0; -} -arch_initcall(etrax_init_cont_rotime); +#define CRISV32_TIMER_FREQ (100000000lu) unsigned long timer_regs[NR_CPUS] = { regi_timer0, -#ifdef CONFIG_SMP - regi_timer2 -#endif }; extern int set_rtc_mmss(unsigned long nowtime); @@ -189,81 +169,104 @@ void handle_watchdog_bite(struct pt_regs *regs) #endif } -/* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "xtime_update()" routine every clocktick. - */ -extern void cris_do_profile(struct pt_regs *regs); +extern void cris_profile_sample(struct pt_regs *regs); +static void __iomem *timer_base; -static inline irqreturn_t timer_interrupt(int irq, void *dev_id) +static void crisv32_clkevt_mode(enum clock_event_mode mode, + struct clock_event_device *dev) { - struct pt_regs *regs = get_irq_regs(); - int cpu = smp_processor_id(); - reg_timer_r_masked_intr masked_intr; - reg_timer_rw_ack_intr ack_intr = { 0 }; - - /* Check if the timer interrupt is for us (a tmr0 int) */ - masked_intr = REG_RD(timer, timer_regs[cpu], r_masked_intr); - if (!masked_intr.tmr0) - return IRQ_NONE; + reg_timer_rw_tmr0_ctrl ctrl = { + .op = regk_timer_hold, + .freq = regk_timer_f100, + }; - /* Acknowledge the timer irq. 
*/ - ack_intr.tmr0 = 1; - REG_WR(timer, timer_regs[cpu], rw_ack_intr, ack_intr); + REG_WR(timer, timer_base, rw_tmr0_ctrl, ctrl); +} - /* Reset watchdog otherwise it resets us! */ - reset_watchdog(); +static int crisv32_clkevt_next_event(unsigned long evt, + struct clock_event_device *dev) +{ + reg_timer_rw_tmr0_ctrl ctrl = { + .op = regk_timer_ld, + .freq = regk_timer_f100, + }; + + REG_WR(timer, timer_base, rw_tmr0_div, evt); + REG_WR(timer, timer_base, rw_tmr0_ctrl, ctrl); + + ctrl.op = regk_timer_run; + REG_WR(timer, timer_base, rw_tmr0_ctrl, ctrl); + + return 0; +} + +static irqreturn_t crisv32_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = dev_id; + reg_timer_rw_tmr0_ctrl ctrl = { + .op = regk_timer_hold, + .freq = regk_timer_f100, + }; + reg_timer_rw_ack_intr ack = { .tmr0 = 1 }; + reg_timer_r_masked_intr intr; + + intr = REG_RD(timer, timer_base, r_masked_intr); + if (!intr.tmr0) + return IRQ_NONE; - /* Update statistics. */ - update_process_times(user_mode(regs)); + REG_WR(timer, timer_base, rw_tmr0_ctrl, ctrl); + REG_WR(timer, timer_base, rw_ack_intr, ack); - cris_do_profile(regs); /* Save profiling information */ + reset_watchdog(); +#ifdef CONFIG_SYSTEM_PROFILER + cris_profile_sample(get_irq_regs()); +#endif - /* The master CPU is responsible for the time keeping. */ - if (cpu != 0) - return IRQ_HANDLED; + evt->event_handler(evt); - /* Call the real timer interrupt handler */ - xtime_update(1); return IRQ_HANDLED; } +static struct clock_event_device crisv32_clockevent = { + .name = "crisv32-timer", + .rating = 300, + .features = CLOCK_EVT_FEAT_ONESHOT, + .set_mode = crisv32_clkevt_mode, + .set_next_event = crisv32_clkevt_next_event, +}; + /* Timer is IRQF_SHARED so drivers can add stuff to the timer irq chain. */ static struct irqaction irq_timer = { - .handler = timer_interrupt, - .flags = IRQF_SHARED, - .name = "timer" + .handler = crisv32_timer_interrupt, + .flags = IRQF_TIMER | IRQF_SHARED, + .name = "crisv32-timer", + .dev_id = &crisv32_clockevent, }; -void __init cris_timer_init(void) +static u64 notrace crisv32_timer_sched_clock(void) { - int cpu = smp_processor_id(); - reg_timer_rw_tmr0_ctrl tmr0_ctrl = { 0 }; - reg_timer_rw_tmr0_div tmr0_div = TIMER0_DIV; - reg_timer_rw_intr_mask timer_intr_mask; + return REG_RD(timer, timer_base, r_time); +} - /* Setup the etrax timers. - * Base frequency is 100MHz, divider 1000000 -> 100 HZ - * We use timer0, so timer1 is free. - * The trig timer is used by the fasttimer API if enabled. - */ +static void __init crisv32_timer_init(void) +{ + reg_timer_rw_intr_mask timer_intr_mask; + reg_timer_rw_tmr0_ctrl ctrl = { + .op = regk_timer_hold, + .freq = regk_timer_f100, + }; - tmr0_ctrl.op = regk_timer_ld; - tmr0_ctrl.freq = regk_timer_f100; - REG_WR(timer, timer_regs[cpu], rw_tmr0_div, tmr0_div); - REG_WR(timer, timer_regs[cpu], rw_tmr0_ctrl, tmr0_ctrl); /* Load */ - tmr0_ctrl.op = regk_timer_run; - REG_WR(timer, timer_regs[cpu], rw_tmr0_ctrl, tmr0_ctrl); /* Start */ + REG_WR(timer, timer_base, rw_tmr0_ctrl, ctrl); - /* Enable the timer irq. */ - timer_intr_mask = REG_RD(timer, timer_regs[cpu], rw_intr_mask); + timer_intr_mask = REG_RD(timer, timer_base, rw_intr_mask); timer_intr_mask.tmr0 = 1; - REG_WR(timer, timer_regs[cpu], rw_intr_mask, timer_intr_mask); + REG_WR(timer, timer_base, rw_intr_mask, timer_intr_mask); } void __init time_init(void) { - reg_intr_vect_rw_mask intr_mask; + int irq; + int ret; /* Probe for the RTC and read it if it exists. 
* Before the RTC can be probed the loops_per_usec variable needs @@ -273,17 +276,28 @@ void __init time_init(void) */ loops_per_usec = 50; - /* Start CPU local timer. */ - cris_timer_init(); + irq = TIMER0_INTR_VECT; + timer_base = (void __iomem *) regi_timer0; + + crisv32_timer_init(); + + sched_clock_register(crisv32_timer_sched_clock, 32, + CRISV32_TIMER_FREQ); + + clocksource_mmio_init(timer_base + REG_RD_ADDR_timer_r_time, + "crisv32-timer", CRISV32_TIMER_FREQ, + 300, 32, clocksource_mmio_readl_up); + + crisv32_clockevent.cpumask = cpu_possible_mask; + crisv32_clockevent.irq = irq; - /* Enable the timer irq in global config. */ - intr_mask = REG_RD_VECT(intr_vect, regi_irq, rw_mask, 1); - intr_mask.timer0 = 1; - REG_WR_VECT(intr_vect, regi_irq, rw_mask, 1, intr_mask); + ret = setup_irq(irq, &irq_timer); + if (ret) + pr_warn("failed to setup irq %d\n", irq); - /* Now actually register the timer irq handler that calls - * timer_interrupt(). */ - setup_irq(TIMER0_INTR_VECT, &irq_timer); + clockevents_config_and_register(&crisv32_clockevent, + CRISV32_TIMER_FREQ, + 2, 0xffffffff); /* Enable watchdog if we should use one. */ diff --git a/arch/cris/arch-v32/lib/Makefile b/arch/cris/arch-v32/lib/Makefile index dd296b9db034..e91cf02f625d 100644 --- a/arch/cris/arch-v32/lib/Makefile +++ b/arch/cris/arch-v32/lib/Makefile @@ -3,5 +3,5 @@ # lib-y = checksum.o checksumcopy.o string.o usercopy.o memset.o \ - csumcpfruser.o spinlock.o delay.o strcmp.o + csumcpfruser.o delay.o strcmp.o diff --git a/arch/cris/arch-v32/lib/spinlock.S b/arch/cris/arch-v32/lib/spinlock.S deleted file mode 100644 index fe610b9d775f..000000000000 --- a/arch/cris/arch-v32/lib/spinlock.S +++ /dev/null @@ -1,40 +0,0 @@ -;; Core of the spinlock implementation -;; -;; Copyright (C) 2004 Axis Communications AB. -;; -;; Author: Mikael Starvik - - - .global cris_spin_lock - .type cris_spin_lock,@function - .global cris_spin_trylock - .type cris_spin_trylock,@function - - .text - -cris_spin_lock: - clearf p -1: test.b [$r10] - beq 1b - clearf p - ax - clear.b [$r10] - bcs 1b - clearf p - ret - nop - - .size cris_spin_lock, . - cris_spin_lock - -cris_spin_trylock: - clearf p -1: move.b [$r10], $r11 - ax - clear.b [$r10] - bcs 1b - clearf p - ret - movu.b $r11,$r10 - - .size cris_spin_trylock, . - cris_spin_trylock - diff --git a/arch/cris/arch-v32/mm/init.c b/arch/cris/arch-v32/mm/init.c index 3deca5253d91..f5438ca8122d 100644 --- a/arch/cris/arch-v32/mm/init.c +++ b/arch/cris/arch-v32/mm/init.c @@ -40,17 +40,6 @@ void __init cris_mmu_init(void) */ per_cpu(current_pgd, smp_processor_id()) = init_mm.pgd; -#ifdef CONFIG_SMP - { - pgd_t **pgd; - pgd = (pgd_t**)&per_cpu(current_pgd, smp_processor_id()); - SUPP_BANK_SEL(1); - SUPP_REG_WR(RW_MM_TLB_PGD, pgd); - SUPP_BANK_SEL(2); - SUPP_REG_WR(RW_MM_TLB_PGD, pgd); - } -#endif - /* Initialise the TLB. Function found in tlb.c. 
*/ tlb_init(); diff --git a/arch/cris/arch-v32/mm/mmu.S b/arch/cris/arch-v32/mm/mmu.S index 72727c1d8e60..c0981044eccb 100644 --- a/arch/cris/arch-v32/mm/mmu.S +++ b/arch/cris/arch-v32/mm/mmu.S @@ -115,11 +115,7 @@ move.d $r0, [$r1] ; last_refill_cause = rw_mm_cause 3: ; Probably not in a loop, continue normal processing -#ifdef CONFIG_SMP - move $s7, $acr ; PGD -#else move.d current_pgd, $acr ; PGD -#endif ; Look up PMD in PGD lsrq 24, $r0 ; Get PMD index into PGD (bit 24-31) move.d [$acr], $acr ; PGD for the current process diff --git a/arch/cris/boot/dts/Makefile b/arch/cris/boot/dts/Makefile new file mode 100644 index 000000000000..faf69fb9919f --- /dev/null +++ b/arch/cris/boot/dts/Makefile @@ -0,0 +1,6 @@ +BUILTIN_DTB := $(patsubst "%",%,$(CONFIG_BUILTIN_DTB)).dtb.o +ifneq ($(CONFIG_BUILTIN_DTB),"") +obj-$(CONFIG_OF) += $(BUILTIN_DTB) +endif + +clean-files := *.dtb.S diff --git a/arch/cris/boot/dts/dev88.dts b/arch/cris/boot/dts/dev88.dts new file mode 100644 index 000000000000..4fa5a3f9d0ec --- /dev/null +++ b/arch/cris/boot/dts/dev88.dts @@ -0,0 +1,18 @@ +/dts-v1/; + +/include/ "etraxfs.dtsi" + +/ { + model = "Axis 88 Developer Board"; + compatible = "axis,dev88"; + + aliases { + serial0 = &uart0; + }; + + soc { + uart0: serial@b00260000 { + status = "okay"; + }; + }; +}; diff --git a/arch/cris/boot/dts/etraxfs.dtsi b/arch/cris/boot/dts/etraxfs.dtsi new file mode 100644 index 000000000000..909bcedc3565 --- /dev/null +++ b/arch/cris/boot/dts/etraxfs.dtsi @@ -0,0 +1,38 @@ +/ { + #address-cells = <1>; + #size-cells = <1>; + interrupt-parent = <&intc>; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + model = "axis,crisv32"; + reg = <0>; + }; + }; + + soc { + compatible = "simple-bus"; + model = "etraxfs"; + #address-cells = <1>; + #size-cells = <1>; + ranges; + + intc: interrupt-controller { + compatible = "axis,crisv32-intc"; + reg = <0xb001c000 0x1000>; + interrupt-controller; + #interrupt-cells = <1>; + }; + + serial@b00260000 { + compatible = "axis,etraxfs-uart"; + reg = <0xb0026000 0x1000>; + interrupts = <68>; + status = "disabled"; + }; + }; +}; diff --git a/arch/cris/include/arch-v10/arch/atomic.h b/arch/cris/include/arch-v10/arch/atomic.h deleted file mode 100644 index 6ef5e7d09024..000000000000 --- a/arch/cris/include/arch-v10/arch/atomic.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_CRIS_ARCH_ATOMIC__ -#define __ASM_CRIS_ARCH_ATOMIC__ - -#define cris_atomic_save(addr, flags) local_irq_save(flags); -#define cris_atomic_restore(addr, flags) local_irq_restore(flags); - -#endif diff --git a/arch/cris/include/arch-v10/arch/system.h b/arch/cris/include/arch-v10/arch/system.h index 935fde34aa15..9b5580f58b96 100644 --- a/arch/cris/include/arch-v10/arch/system.h +++ b/arch/cris/include/arch-v10/arch/system.h @@ -36,12 +36,4 @@ static inline unsigned long _get_base(char * addr) return 0; } -#define nop() __asm__ __volatile__ ("nop"); - -#define xchg(ptr,x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) -#define tas(ptr) (xchg((ptr),1)) - -struct __xchg_dummy { unsigned long a[100]; }; -#define __xg(x) ((struct __xchg_dummy *)(x)) - #endif diff --git a/arch/cris/include/arch-v32/arch/atomic.h b/arch/cris/include/arch-v32/arch/atomic.h deleted file mode 100644 index 852ceff8013f..000000000000 --- a/arch/cris/include/arch-v32/arch/atomic.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef __ASM_CRIS_ARCH_ATOMIC__ -#define __ASM_CRIS_ARCH_ATOMIC__ - -#include <linux/spinlock_types.h> - -extern void cris_spin_unlock(void *l, int 
val); -extern void cris_spin_lock(void *l); -extern int cris_spin_trylock(void* l); - -#ifndef CONFIG_SMP -#define cris_atomic_save(addr, flags) local_irq_save(flags); -#define cris_atomic_restore(addr, flags) local_irq_restore(flags); -#else - -extern spinlock_t cris_atomic_locks[]; -#define LOCK_COUNT 128 -#define HASH_ADDR(a) (((int)a) & 127) - -#define cris_atomic_save(addr, flags) \ - local_irq_save(flags); \ - cris_spin_lock((void *)&cris_atomic_locks[HASH_ADDR(addr)].raw_lock.slock); - -#define cris_atomic_restore(addr, flags) \ - { \ - spinlock_t *lock = (void*)&cris_atomic_locks[HASH_ADDR(addr)]; \ - __asm__ volatile ("move.d %1,%0" \ - : "=m" (lock->raw_lock.slock) \ - : "r" (1) \ - : "memory"); \ - local_irq_restore(flags); \ - } - -#endif - -#endif - diff --git a/arch/cris/include/arch-v32/arch/processor.h b/arch/cris/include/arch-v32/arch/processor.h index a024b7d32fed..568759271ab5 100644 --- a/arch/cris/include/arch-v32/arch/processor.h +++ b/arch/cris/include/arch-v32/arch/processor.h @@ -25,8 +25,7 @@ struct thread_struct { */ #define TASK_SIZE (0xB0000000UL) -/* CCS I=1, enable interrupts. */ -#define INIT_THREAD { 0, 0, (1 << I_CCS_BITNR) } +#define INIT_THREAD { } #define KSTK_EIP(tsk) \ ({ \ diff --git a/arch/cris/include/arch-v32/arch/spinlock.h b/arch/cris/include/arch-v32/arch/spinlock.h deleted file mode 100644 index f13275522f4d..000000000000 --- a/arch/cris/include/arch-v32/arch/spinlock.h +++ /dev/null @@ -1,131 +0,0 @@ -#ifndef __ASM_ARCH_SPINLOCK_H -#define __ASM_ARCH_SPINLOCK_H - -#include <linux/spinlock_types.h> - -#define RW_LOCK_BIAS 0x01000000 - -extern void cris_spin_unlock(void *l, int val); -extern void cris_spin_lock(void *l); -extern int cris_spin_trylock(void *l); - -static inline int arch_spin_is_locked(arch_spinlock_t *x) -{ - return *(volatile signed char *)(&(x)->slock) <= 0; -} - -static inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __asm__ volatile ("move.d %1,%0" \ - : "=m" (lock->slock) \ - : "r" (1) \ - : "memory"); -} - -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - while (arch_spin_is_locked(lock)) - cpu_relax(); -} - -static inline int arch_spin_trylock(arch_spinlock_t *lock) -{ - return cris_spin_trylock((void *)&lock->slock); -} - -static inline void arch_spin_lock(arch_spinlock_t *lock) -{ - cris_spin_lock((void *)&lock->slock); -} - -static inline void -arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - arch_spin_lock(lock); -} - -/* - * Read-write spinlocks, allowing multiple readers - * but only one writer. - * - * NOTE! it is quite common to have readers in interrupts - * but no interrupt writers. For those circumstances we - * can "mix" irq-safe locks - any writer needs to get a - * irq-safe write-lock, but readers can get non-irqsafe - * read-locks. 
- * - */ - -static inline int arch_read_can_lock(arch_rwlock_t *x) -{ - return (int)(x)->lock > 0; -} - -static inline int arch_write_can_lock(arch_rwlock_t *x) -{ - return (x)->lock == RW_LOCK_BIAS; -} - -static inline void arch_read_lock(arch_rwlock_t *rw) -{ - arch_spin_lock(&rw->slock); - while (rw->lock == 0); - rw->lock--; - arch_spin_unlock(&rw->slock); -} - -static inline void arch_write_lock(arch_rwlock_t *rw) -{ - arch_spin_lock(&rw->slock); - while (rw->lock != RW_LOCK_BIAS); - rw->lock = 0; - arch_spin_unlock(&rw->slock); -} - -static inline void arch_read_unlock(arch_rwlock_t *rw) -{ - arch_spin_lock(&rw->slock); - rw->lock++; - arch_spin_unlock(&rw->slock); -} - -static inline void arch_write_unlock(arch_rwlock_t *rw) -{ - arch_spin_lock(&rw->slock); - while (rw->lock != RW_LOCK_BIAS); - rw->lock = RW_LOCK_BIAS; - arch_spin_unlock(&rw->slock); -} - -static inline int arch_read_trylock(arch_rwlock_t *rw) -{ - int ret = 0; - arch_spin_lock(&rw->slock); - if (rw->lock != 0) { - rw->lock--; - ret = 1; - } - arch_spin_unlock(&rw->slock); - return ret; -} - -static inline int arch_write_trylock(arch_rwlock_t *rw) -{ - int ret = 0; - arch_spin_lock(&rw->slock); - if (rw->lock == RW_LOCK_BIAS) { - rw->lock = 0; - ret = 1; - } - arch_spin_unlock(&rw->slock); - return ret; -} - -#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) -#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) - -#define arch_spin_relax(lock) cpu_relax() -#define arch_read_relax(lock) cpu_relax() -#define arch_write_relax(lock) cpu_relax() - -#endif /* __ASM_ARCH_SPINLOCK_H */ diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 889f2de050a3..057e51859b0a 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -1,16 +1,29 @@ - +generic-y += atomic.h generic-y += barrier.h generic-y += clkdev.h +generic-y += cmpxchg.h generic-y += cputime.h +generic-y += device.h +generic-y += div64.h generic-y += exec.h +generic-y += emergency-restart.h +generic-y += futex.h +generic-y += hardirq.h +generic-y += irq_regs.h generic-y += irq_work.h +generic-y += kdebug.h +generic-y += kmap_types.h generic-y += kvm_para.h generic-y += linkage.h +generic-y += local.h +generic-y += local64.h generic-y += mcs_spinlock.h generic-y += module.h +generic-y += percpu.h generic-y += preempt.h generic-y += scatterlist.h generic-y += sections.h +generic-y += topology.h generic-y += trace_clock.h generic-y += vga.h generic-y += xor.h diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h deleted file mode 100644 index 279766a70664..000000000000 --- a/arch/cris/include/asm/atomic.h +++ /dev/null @@ -1,149 +0,0 @@ -/* $Id: atomic.h,v 1.3 2001/07/25 16:15:19 bjornw Exp $ */ - -#ifndef __ASM_CRIS_ATOMIC__ -#define __ASM_CRIS_ATOMIC__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <asm/cmpxchg.h> -#include <arch/atomic.h> -#include <arch/system.h> -#include <asm/barrier.h> - -/* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - */ - -#define ATOMIC_INIT(i) { (i) } - -#define atomic_read(v) ACCESS_ONCE((v)->counter) -#define atomic_set(v,i) (((v)->counter) = (i)) - -/* These should be written in asm but we do it in C for now. 
*/ - -#define ATOMIC_OP(op, c_op) \ -static inline void atomic_##op(int i, volatile atomic_t *v) \ -{ \ - unsigned long flags; \ - cris_atomic_save(v, flags); \ - v->counter c_op i; \ - cris_atomic_restore(v, flags); \ -} \ - -#define ATOMIC_OP_RETURN(op, c_op) \ -static inline int atomic_##op##_return(int i, volatile atomic_t *v) \ -{ \ - unsigned long flags; \ - int retval; \ - cris_atomic_save(v, flags); \ - retval = (v->counter c_op i); \ - cris_atomic_restore(v, flags); \ - return retval; \ -} - -#define ATOMIC_OPS(op, c_op) ATOMIC_OP(op, c_op) ATOMIC_OP_RETURN(op, c_op) - -ATOMIC_OPS(add, +=) -ATOMIC_OPS(sub, -=) - -#undef ATOMIC_OPS -#undef ATOMIC_OP_RETURN -#undef ATOMIC_OP - -#define atomic_add_negative(a, v) (atomic_add_return((a), (v)) < 0) - -static inline int atomic_sub_and_test(int i, volatile atomic_t *v) -{ - int retval; - unsigned long flags; - cris_atomic_save(v, flags); - retval = (v->counter -= i) == 0; - cris_atomic_restore(v, flags); - return retval; -} - -static inline void atomic_inc(volatile atomic_t *v) -{ - unsigned long flags; - cris_atomic_save(v, flags); - (v->counter)++; - cris_atomic_restore(v, flags); -} - -static inline void atomic_dec(volatile atomic_t *v) -{ - unsigned long flags; - cris_atomic_save(v, flags); - (v->counter)--; - cris_atomic_restore(v, flags); -} - -static inline int atomic_inc_return(volatile atomic_t *v) -{ - unsigned long flags; - int retval; - cris_atomic_save(v, flags); - retval = ++(v->counter); - cris_atomic_restore(v, flags); - return retval; -} - -static inline int atomic_dec_return(volatile atomic_t *v) -{ - unsigned long flags; - int retval; - cris_atomic_save(v, flags); - retval = --(v->counter); - cris_atomic_restore(v, flags); - return retval; -} -static inline int atomic_dec_and_test(volatile atomic_t *v) -{ - int retval; - unsigned long flags; - cris_atomic_save(v, flags); - retval = --(v->counter) == 0; - cris_atomic_restore(v, flags); - return retval; -} - -static inline int atomic_inc_and_test(volatile atomic_t *v) -{ - int retval; - unsigned long flags; - cris_atomic_save(v, flags); - retval = ++(v->counter) == 0; - cris_atomic_restore(v, flags); - return retval; -} - -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) -{ - int ret; - unsigned long flags; - - cris_atomic_save(v, flags); - ret = v->counter; - if (likely(ret == old)) - v->counter = new; - cris_atomic_restore(v, flags); - return ret; -} - -#define atomic_xchg(v, new) (xchg(&((v)->counter), new)) - -static inline int __atomic_add_unless(atomic_t *v, int a, int u) -{ - int ret; - unsigned long flags; - - cris_atomic_save(v, flags); - ret = v->counter; - if (ret != u) - v->counter += a; - cris_atomic_restore(v, flags); - return ret; -} - -#endif diff --git a/arch/cris/include/asm/bitops.h b/arch/cris/include/asm/bitops.h index bd49a546f4f5..8062cb52d343 100644 --- a/arch/cris/include/asm/bitops.h +++ b/arch/cris/include/asm/bitops.h @@ -19,119 +19,10 @@ #endif #include <arch/bitops.h> -#include <linux/atomic.h> #include <linux/compiler.h> #include <asm/barrier.h> -/* - * set_bit - Atomically set a bit in memory - * @nr: the bit to set - * @addr: the address to start counting from - * - * This function is atomic and may not be reordered. See __set_bit() - * if you do not require the atomic guarantees. - * Note that @nr may be almost arbitrarily large; this function is not - * restricted to acting on a single-word quantity. 
- */ - -#define set_bit(nr, addr) (void)test_and_set_bit(nr, addr) - -/* - * clear_bit - Clears a bit in memory - * @nr: Bit to clear - * @addr: Address to start counting from - * - * clear_bit() is atomic and may not be reordered. However, it does - * not contain a memory barrier, so if it is used for locking purposes, - * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic() - * in order to ensure changes are visible on other processors. - */ - -#define clear_bit(nr, addr) (void)test_and_clear_bit(nr, addr) - -/* - * change_bit - Toggle a bit in memory - * @nr: Bit to change - * @addr: Address to start counting from - * - * change_bit() is atomic and may not be reordered. - * Note that @nr may be almost arbitrarily large; this function is not - * restricted to acting on a single-word quantity. - */ - -#define change_bit(nr, addr) (void)test_and_change_bit(nr, addr) - -/** - * test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. - */ - -static inline int test_and_set_bit(int nr, volatile unsigned long *addr) -{ - unsigned int mask, retval; - unsigned long flags; - unsigned int *adr = (unsigned int *)addr; - - adr += nr >> 5; - mask = 1 << (nr & 0x1f); - cris_atomic_save(addr, flags); - retval = (mask & *adr) != 0; - *adr |= mask; - cris_atomic_restore(addr, flags); - return retval; -} - -/** - * test_and_clear_bit - Clear a bit and return its old value - * @nr: Bit to clear - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. - */ - -static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) -{ - unsigned int mask, retval; - unsigned long flags; - unsigned int *adr = (unsigned int *)addr; - - adr += nr >> 5; - mask = 1 << (nr & 0x1f); - cris_atomic_save(addr, flags); - retval = (mask & *adr) != 0; - *adr &= ~mask; - cris_atomic_restore(addr, flags); - return retval; -} - -/** - * test_and_change_bit - Change a bit and return its old value - * @nr: Bit to change - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. 
- */ - -static inline int test_and_change_bit(int nr, volatile unsigned long *addr) -{ - unsigned int mask, retval; - unsigned long flags; - unsigned int *adr = (unsigned int *)addr; - adr += nr >> 5; - mask = 1 << (nr & 0x1f); - cris_atomic_save(addr, flags); - retval = (mask & *adr) != 0; - *adr ^= mask; - cris_atomic_restore(addr, flags); - return retval; -} - +#include <asm-generic/bitops/atomic.h> #include <asm-generic/bitops/non-atomic.h> /* diff --git a/arch/cris/include/asm/cmpxchg.h b/arch/cris/include/asm/cmpxchg.h deleted file mode 100644 index b756dac8aa3f..000000000000 --- a/arch/cris/include/asm/cmpxchg.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef __ASM_CRIS_CMPXCHG__ -#define __ASM_CRIS_CMPXCHG__ - -#include <linux/irqflags.h> - -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) -{ - /* since Etrax doesn't have any atomic xchg instructions, we need to disable - irq's (if enabled) and do it with move.d's */ - unsigned long flags,temp; - local_irq_save(flags); /* save flags, including irq enable bit and shut off irqs */ - switch (size) { - case 1: - *((unsigned char *)&temp) = x; - x = *(unsigned char *)ptr; - *(unsigned char *)ptr = *((unsigned char *)&temp); - break; - case 2: - *((unsigned short *)&temp) = x; - x = *(unsigned short *)ptr; - *(unsigned short *)ptr = *((unsigned short *)&temp); - break; - case 4: - temp = x; - x = *(unsigned long *)ptr; - *(unsigned long *)ptr = temp; - break; - } - local_irq_restore(flags); /* restore irq enable bit */ - return x; -} - -#define xchg(ptr,x) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) - -#define tas(ptr) (xchg((ptr),1)) - -#include <asm-generic/cmpxchg-local.h> - -/* - * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make - * them available. - */ -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr), (unsigned long)(o),\ - (unsigned long)(n), sizeof(*(ptr)))) -#define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) - -#ifndef CONFIG_SMP -#include <asm-generic/cmpxchg.h> -#endif - -#endif /* __ASM_CRIS_CMPXCHG__ */ diff --git a/arch/cris/include/asm/device.h b/arch/cris/include/asm/device.h deleted file mode 100644 index d8f9872b0e2d..000000000000 --- a/arch/cris/include/asm/device.h +++ /dev/null @@ -1,7 +0,0 @@ -/* - * Arch specific extensions to struct device - * - * This file is released under the GPLv2 - */ -#include <asm-generic/device.h> - diff --git a/arch/cris/include/asm/div64.h b/arch/cris/include/asm/div64.h deleted file mode 100644 index 6cd978cefb28..000000000000 --- a/arch/cris/include/asm/div64.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/div64.h> diff --git a/arch/cris/include/asm/elf.h b/arch/cris/include/asm/elf.h index 30ded8fbf592..c2a394ff55ff 100644 --- a/arch/cris/include/asm/elf.h +++ b/arch/cris/include/asm/elf.h @@ -71,7 +71,7 @@ typedef unsigned long elf_fpregset_t; the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. */ -#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) +#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. 
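The elf.h change just above is an overflow fix rather than cosmetics: with TASK_SIZE = 0xB0000000 and 32-bit unsigned arithmetic, the product 2 * TASK_SIZE wraps before the division, so dividing first keeps the intermediate value in range. A stand-alone demonstration (plain C, using unsigned int so the wrap is reproducible anywhere):

	#include <stdio.h>

	int main(void)
	{
		unsigned int task_size = 0xB0000000u;

		/* 2 * task_size wraps to 0x60000000, so the old form yields 0x20000000. */
		printf("old: %#x\n", 2 * task_size / 3);
		/* Divide first, then scale: about two thirds of TASK_SIZE, as intended. */
		printf("new: %#x\n", task_size / 3 * 2);
		return 0;
	}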
This could be done in user space, diff --git a/arch/cris/include/asm/emergency-restart.h b/arch/cris/include/asm/emergency-restart.h deleted file mode 100644 index 108d8c48e42e..000000000000 --- a/arch/cris/include/asm/emergency-restart.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_EMERGENCY_RESTART_H -#define _ASM_EMERGENCY_RESTART_H - -#include <asm-generic/emergency-restart.h> - -#endif /* _ASM_EMERGENCY_RESTART_H */ diff --git a/arch/cris/include/asm/futex.h b/arch/cris/include/asm/futex.h deleted file mode 100644 index 6a332a9f099c..000000000000 --- a/arch/cris/include/asm/futex.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_FUTEX_H -#define _ASM_FUTEX_H - -#include <asm-generic/futex.h> - -#endif diff --git a/arch/cris/include/asm/hardirq.h b/arch/cris/include/asm/hardirq.h deleted file mode 100644 index 04126f7bfab2..000000000000 --- a/arch/cris/include/asm/hardirq.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_HARDIRQ_H -#define __ASM_HARDIRQ_H - -#include <asm/irq.h> -#include <asm-generic/hardirq.h> - -#endif /* __ASM_HARDIRQ_H */ diff --git a/arch/cris/include/asm/irq_regs.h b/arch/cris/include/asm/irq_regs.h deleted file mode 100644 index 3dd9c0b70270..000000000000 --- a/arch/cris/include/asm/irq_regs.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/irq_regs.h> diff --git a/arch/cris/include/asm/kdebug.h b/arch/cris/include/asm/kdebug.h deleted file mode 100644 index 6ece1b037665..000000000000 --- a/arch/cris/include/asm/kdebug.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/kdebug.h> diff --git a/arch/cris/include/asm/kmap_types.h b/arch/cris/include/asm/kmap_types.h deleted file mode 100644 index d2d643c4ea59..000000000000 --- a/arch/cris/include/asm/kmap_types.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* Dummy header just to define km_type. None of this - * is actually used on cris. 
- */
-
-#include <asm-generic/kmap_types.h>
-
-#endif
diff --git a/arch/cris/include/asm/local.h b/arch/cris/include/asm/local.h
deleted file mode 100644
index c11c530f74d0..000000000000
--- a/arch/cris/include/asm/local.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local.h>
diff --git a/arch/cris/include/asm/local64.h b/arch/cris/include/asm/local64.h
deleted file mode 100644
index 36c93b5cc239..000000000000
--- a/arch/cris/include/asm/local64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/local64.h>
diff --git a/arch/cris/include/asm/percpu.h b/arch/cris/include/asm/percpu.h
deleted file mode 100644
index 6db9b43cf80a..000000000000
--- a/arch/cris/include/asm/percpu.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_PERCPU_H
-#define _CRIS_PERCPU_H
-
-#include <asm-generic/percpu.h>
-
-#endif /* _CRIS_PERCPU_H */
diff --git a/arch/cris/include/asm/smp.h b/arch/cris/include/asm/smp.h
deleted file mode 100644
index c615a06dd757..000000000000
--- a/arch/cris/include/asm/smp.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __ASM_SMP_H
-#define __ASM_SMP_H
-
-#include <linux/cpumask.h>
-
-extern cpumask_t phys_cpu_present_map;
-
-#define raw_smp_processor_id() (current_thread_info()->cpu)
-
-#endif
diff --git a/arch/cris/include/asm/spinlock.h b/arch/cris/include/asm/spinlock.h
deleted file mode 100644
index ed816b57face..000000000000
--- a/arch/cris/include/asm/spinlock.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <arch/spinlock.h>
diff --git a/arch/cris/include/asm/tlbflush.h b/arch/cris/include/asm/tlbflush.h
index 20697e7ef4f2..b424f43a9fd6 100644
--- a/arch/cris/include/asm/tlbflush.h
+++ b/arch/cris/include/asm/tlbflush.h
@@ -22,16 +22,9 @@ extern void __flush_tlb_mm(struct mm_struct *mm);
 extern void __flush_tlb_page(struct vm_area_struct *vma,
 			     unsigned long addr);

-#ifdef CONFIG_SMP
-extern void flush_tlb_all(void);
-extern void flush_tlb_mm(struct mm_struct *mm);
-extern void flush_tlb_page(struct vm_area_struct *vma,
-			   unsigned long addr);
-#else
 #define flush_tlb_all __flush_tlb_all
 #define flush_tlb_mm __flush_tlb_mm
 #define flush_tlb_page __flush_tlb_page
-#endif

 static inline void flush_tlb_range(struct vm_area_struct * vma,
 	unsigned long start, unsigned long end)
 {
diff --git a/arch/cris/include/asm/topology.h b/arch/cris/include/asm/topology.h
deleted file mode 100644
index 2ac613d32a89..000000000000
--- a/arch/cris/include/asm/topology.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_CRIS_TOPOLOGY_H
-#define _ASM_CRIS_TOPOLOGY_H
-
-#include <asm-generic/topology.h>
-
-#endif /* _ASM_CRIS_TOPOLOGY_H */
diff --git a/arch/cris/kernel/Makefile b/arch/cris/kernel/Makefile
index b45640b3e600..edef71f12bb8 100644
--- a/arch/cris/kernel/Makefile
+++ b/arch/cris/kernel/Makefile
@@ -7,6 +7,7 @@ CPPFLAGS_vmlinux.lds := -DDRAM_VIRTUAL_BASE=0x$(CONFIG_ETRAX_DRAM_VIRTUAL_BASE)
 extra-y := vmlinux.lds

 obj-y := process.o traps.o irq.o ptrace.o setup.o time.o sys_cris.o
+obj-y += devicetree.o

 obj-$(CONFIG_MODULES) += crisksyms.o
 obj-$(CONFIG_MODULES) += module.o
diff --git a/arch/cris/kernel/devicetree.c b/arch/cris/kernel/devicetree.c
new file mode 100644
index 000000000000..53ff8d73e7e1
--- /dev/null
+++ b/arch/cris/kernel/devicetree.c
@@ -0,0 +1,14 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/printk.h>
+
+void __init early_init_dt_add_memory_arch(u64 base, u64 size)
+{
+	pr_err("%s(%llx, %llx)\n",
+	       __func__, base, size);
+}
+
+void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
+{
+	return alloc_bootmem_align(size, align);
+}
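The new devicetree.c supplies the two architecture hooks the early flattened-DT scan calls: one to register each /memory range (here a log-only stub, RAM presumably having been configured by the boot loader already) and one to allocate memory for the unflattened tree, backed by bootmem on this platform. For contrast, an architecture that manages RAM with memblock would typically implement the first hook along these lines (generic illustration, not cris code):

	#include <linux/init.h>
	#include <linux/memblock.h>
	#include <linux/of_fdt.h>

	void __init early_init_dt_add_memory_arch(u64 base, u64 size)
	{
		/* Hand each memory range described in the DTB to the allocator. */
		memblock_add(base, size);
	}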
a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index 58d44ee1a71f..fd3427e563c5 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -42,3 +42,26 @@ void do_notify_resume(int canrestart, struct pt_regs *regs, tracehook_notify_resume(regs); } } + +void do_work_pending(int syscall, struct pt_regs *regs, + unsigned int thread_flags) +{ + do { + if (likely(thread_flags & _TIF_NEED_RESCHED)) { + schedule(); + } else { + if (unlikely(!user_mode(regs))) + return; + local_irq_enable(); + if (thread_flags & _TIF_SIGPENDING) { + do_signal(syscall, regs); + syscall = 0; + } else { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + } + local_irq_disable(); + thread_flags = current_thread_info()->flags; + } while (thread_flags & _TIF_WORK_MASK); +} diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c index 905b70ea9939..bb12aa93201d 100644 --- a/arch/cris/kernel/setup.c +++ b/arch/cris/kernel/setup.c @@ -19,6 +19,9 @@ #include <linux/utsname.h> #include <linux/pfn.h> #include <linux/cpu.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_platform.h> #include <asm/setup.h> #include <arch/system.h> @@ -64,6 +67,10 @@ void __init setup_arch(char **cmdline_p) unsigned long start_pfn, max_pfn; unsigned long memory_start; +#ifdef CONFIG_OF + early_init_dt_scan(__dtb_start); +#endif + /* register an initial console printing routine for printk's */ init_etrax_debug(); @@ -141,6 +148,8 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem(PFN_PHYS(start_pfn), bootmap_size, BOOTMEM_DEFAULT); + unflatten_and_copy_device_tree(); + /* paging_init() sets up the MMU and marks all pages as reserved */ paging_init(); @@ -204,3 +213,9 @@ static int __init topology_init(void) subsys_initcall(topology_init); +static int __init cris_of_init(void) +{ + of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); + return 0; +} +core_initcall(cris_of_init); diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index fe6acdabbc8d..7780d379522f 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -79,11 +79,13 @@ cris_do_profile(struct pt_regs* regs) #endif } +#ifndef CONFIG_GENERIC_SCHED_CLOCK unsigned long long sched_clock(void) { return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ) + get_ns_in_jiffie(); } +#endif static int __init init_udelay(void) diff --git a/arch/frv/include/asm/io.h b/arch/frv/include/asm/io.h index 99bb7efaf9b7..0b78bc89e840 100644 --- a/arch/frv/include/asm/io.h +++ b/arch/frv/include/asm/io.h @@ -342,6 +342,11 @@ static inline void iowrite32(u32 val, void __iomem *p) __flush_PCI_writes(); } +#define ioread16be(addr) be16_to_cpu(ioread16(addr)) +#define ioread32be(addr) be32_to_cpu(ioread32(addr)) +#define iowrite16be(v, addr) iowrite16(cpu_to_be16(v), (addr)) +#define iowrite32be(v, addr) iowrite32(cpu_to_be32(v), (addr)) + static inline void ioread8_rep(void __iomem *p, void *dst, unsigned long count) { io_insb((unsigned long) p, dst, count); diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 4f9a6661491b..76d25b2cfbbe 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -15,6 +15,7 @@ config IA64 select ARCH_MIGHT_HAVE_PC_SERIO select PCI if (!IA64_HP_SIM) select ACPI if (!IA64_HP_SIM) + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 35bf22cc71b7..b1698bc042c8 100644 --- a/arch/ia64/kernel/acpi.c +++ 
b/arch/ia64/kernel/acpi.c @@ -887,7 +887,7 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) } /* wrapper to silence section mismatch warning */ -int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu) +int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { return _acpi_map_lsapic(handle, physid, pcpu); } diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 5f4243f0acfa..60e02f7747ff 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2159,7 +2159,7 @@ static const struct file_operations pfm_file_ops = { static char *pfmfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pfm:[%lu]", - dentry->d_inode->i_ino); + d_inode(dentry)->i_ino); } static const struct dentry_operations pfmfs_dentry_operations = { diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 48cc65705db4..d4e162d35b34 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -240,15 +240,12 @@ static acpi_status resource_to_window(struct acpi_resource *resource, * We're only interested in _CRS descriptors that are * - address space descriptors for memory or I/O space * - non-zero size - * - producers, i.e., the address space is routed downstream, - * not consumed by the bridge itself */ status = acpi_resource_to_address64(resource, addr); if (ACPI_SUCCESS(status) && (addr->resource_type == ACPI_MEMORY_RANGE || addr->resource_type == ACPI_IO_RANGE) && - addr->address.address_length && - addr->producer_consumer == ACPI_PRODUCER) + addr->address.address_length) return AE_OK; return AE_ERROR; diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 483dff986a23..7f546183a0f0 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -174,8 +174,11 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, (unsigned long *)regs->ctx.AX[0].U0, regs); } +/* + * Copy architecture-specific thread state + */ int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *tsk) + unsigned long kthread_arg, struct task_struct *tsk) { struct pt_regs *childregs = task_pt_regs(tsk); void *kernel_context = ((void *) childregs + @@ -202,12 +205,13 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, global_base = __core_reg_get(A1GbP); childregs->ctx.AX[0].U1 = (unsigned long) global_base; childregs->ctx.AX[0].U0 = (unsigned long) kernel_context; - /* Set D1Ar1=arg and D1RtP=usp (fn) */ + /* Set D1Ar1=kthread_arg and D1RtP=usp (fn) */ childregs->ctx.DX[4].U1 = usp; - childregs->ctx.DX[3].U1 = arg; + childregs->ctx.DX[3].U1 = kthread_arg; tsk->thread.int_depth = 2; return 0; } + /* * Get a pointer to where the new child's register block should have * been pushed. 
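The frv io.h hunk earlier and the mn10300 io.h hunk that follows both fill in the missing big-endian MMIO accessors (ioread16be/ioread32be and the iowrite counterparts) by wrapping the native accessors in byte swaps. As a quick illustration of the pattern, here is a minimal, self-contained sketch in hosted C; mmio_read32(), my_ioread32be() and the fake device register are stand-ins invented for this example, not kernel API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for readl(): a raw native-endian 32-bit MMIO load. */
static uint32_t mmio_read32(const void *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	return v;
}

static uint32_t swab32(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000ff00) |
	       ((x << 8) & 0x00ff0000) | (x << 24);
}

/* The ioread32be() shape: byte-swap only on little-endian hosts. */
static uint32_t my_ioread32be(const void *p)
{
	const uint32_t one = 1;
	const int host_is_le = *(const uint8_t *)&one;

	return host_is_le ? swab32(mmio_read32(p)) : mmio_read32(p);
}

int main(void)
{
	/* A big-endian device register holding 0x12345678. */
	const uint8_t reg[4] = { 0x12, 0x34, 0x56, 0x78 };

	/* Prints 12345678 on both little- and big-endian hosts. */
	printf("%08x\n", my_ioread32be(reg));
	return 0;
}

The point of defining these accessors centrally, as both hunks do, is that drivers for big-endian devices stay endian-clean instead of open-coding the swap at every register access.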
diff --git a/arch/mn10300/include/asm/io.h b/arch/mn10300/include/asm/io.h index 897ba3c12b32..cc4a2ba9e228 100644 --- a/arch/mn10300/include/asm/io.h +++ b/arch/mn10300/include/asm/io.h @@ -197,6 +197,11 @@ static inline void outsl(unsigned long addr, const void *buffer, int count) #define iowrite16(v, addr) writew((v), (addr)) #define iowrite32(v, addr) writel((v), (addr)) +#define ioread16be(addr) be16_to_cpu(readw(addr)) +#define ioread32be(addr) be32_to_cpu(readl(addr)) +#define iowrite16be(v, addr) writew(cpu_to_be16(v), (addr)) +#define iowrite32be(v, addr) writel(cpu_to_be32(v), (addr)) + #define ioread8_rep(p, dst, count) \ insb((unsigned long) (p), (dst), (count)) #define ioread16_rep(p, dst, count) \ diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 01c75f36e8b3..24b3d8999ac7 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -46,7 +46,6 @@ generic-y += segment.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h -generic-y += shmparam.h generic-y += siginfo.h generic-y += signal.h generic-y += socket.h diff --git a/arch/nios2/include/asm/shmparam.h b/arch/nios2/include/asm/shmparam.h new file mode 100644 index 000000000000..60784294e407 --- /dev/null +++ b/arch/nios2/include/asm/shmparam.h @@ -0,0 +1,21 @@ +/* + * Copyright Altera Corporation (C) <2015>. All rights reserved + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ASM_NIOS2_SHMPARAM_H +#define _ASM_NIOS2_SHMPARAM_H + +#define SHMLBA CONFIG_NIOS2_DCACHE_SIZE + +#endif /* _ASM_NIOS2_SHMPARAM_H */ diff --git a/arch/nios2/include/uapi/asm/ptrace.h b/arch/nios2/include/uapi/asm/ptrace.h index eff00e67c0a2..1d35de90a977 100644 --- a/arch/nios2/include/uapi/asm/ptrace.h +++ b/arch/nios2/include/uapi/asm/ptrace.h @@ -14,6 +14,8 @@ #ifndef __ASSEMBLY__ +#include <linux/types.h> + /* * Register numbers used by 'ptrace' system call interface. 
*/ diff --git a/arch/nios2/kernel/entry.S b/arch/nios2/kernel/entry.S index 27b006c52e12..1e515ccd698e 100644 --- a/arch/nios2/kernel/entry.S +++ b/arch/nios2/kernel/entry.S @@ -92,35 +92,35 @@ exception_table: trap_table: .word handle_system_call /* 0 */ - .word instruction_trap /* 1 */ - .word instruction_trap /* 2 */ - .word instruction_trap /* 3 */ - .word instruction_trap /* 4 */ - .word instruction_trap /* 5 */ - .word instruction_trap /* 6 */ - .word instruction_trap /* 7 */ - .word instruction_trap /* 8 */ - .word instruction_trap /* 9 */ - .word instruction_trap /* 10 */ - .word instruction_trap /* 11 */ - .word instruction_trap /* 12 */ - .word instruction_trap /* 13 */ - .word instruction_trap /* 14 */ - .word instruction_trap /* 15 */ - .word instruction_trap /* 16 */ - .word instruction_trap /* 17 */ - .word instruction_trap /* 18 */ - .word instruction_trap /* 19 */ - .word instruction_trap /* 20 */ - .word instruction_trap /* 21 */ - .word instruction_trap /* 22 */ - .word instruction_trap /* 23 */ - .word instruction_trap /* 24 */ - .word instruction_trap /* 25 */ - .word instruction_trap /* 26 */ - .word instruction_trap /* 27 */ - .word instruction_trap /* 28 */ - .word instruction_trap /* 29 */ + .word handle_trap_1 /* 1 */ + .word handle_trap_2 /* 2 */ + .word handle_trap_3 /* 3 */ + .word handle_trap_reserved /* 4 */ + .word handle_trap_reserved /* 5 */ + .word handle_trap_reserved /* 6 */ + .word handle_trap_reserved /* 7 */ + .word handle_trap_reserved /* 8 */ + .word handle_trap_reserved /* 9 */ + .word handle_trap_reserved /* 10 */ + .word handle_trap_reserved /* 11 */ + .word handle_trap_reserved /* 12 */ + .word handle_trap_reserved /* 13 */ + .word handle_trap_reserved /* 14 */ + .word handle_trap_reserved /* 15 */ + .word handle_trap_reserved /* 16 */ + .word handle_trap_reserved /* 17 */ + .word handle_trap_reserved /* 18 */ + .word handle_trap_reserved /* 19 */ + .word handle_trap_reserved /* 20 */ + .word handle_trap_reserved /* 21 */ + .word handle_trap_reserved /* 22 */ + .word handle_trap_reserved /* 23 */ + .word handle_trap_reserved /* 24 */ + .word handle_trap_reserved /* 25 */ + .word handle_trap_reserved /* 26 */ + .word handle_trap_reserved /* 27 */ + .word handle_trap_reserved /* 28 */ + .word handle_trap_reserved /* 29 */ #ifdef CONFIG_KGDB .word handle_kgdb_breakpoint /* 30 KGDB breakpoint */ #else @@ -455,6 +455,19 @@ handle_kgdb_breakpoint: br ret_from_exception #endif +handle_trap_1: + call handle_trap_1_c + br ret_from_exception + +handle_trap_2: + call handle_trap_2_c + br ret_from_exception + +handle_trap_3: +handle_trap_reserved: + call handle_trap_3_c + br ret_from_exception + /* * Beware - when entering resume, prev (the current task) is * in r4, next (the new task) is in r5, don't change these diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index b7b97641a9a6..81f7da7b1d55 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -23,6 +23,17 @@ static DEFINE_SPINLOCK(die_lock); +static void _send_sig(int signo, int code, unsigned long addr) +{ + siginfo_t info; + + info.si_signo = signo; + info.si_errno = 0; + info.si_code = code; + info.si_addr = (void __user *) addr; + force_sig_info(signo, &info, current); +} + void die(const char *str, struct pt_regs *regs, long err) { console_verbose(); @@ -39,16 +50,10 @@ void die(const char *str, struct pt_regs *regs, long err) void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr) { - siginfo_t info; - if (!user_mode(regs)) 
die("Exception in kernel mode", regs, signo); - info.si_signo = signo; - info.si_errno = 0; - info.si_code = code; - info.si_addr = (void __user *) addr; - force_sig_info(signo, &info, current); + _send_sig(signo, code, addr); } /* @@ -183,3 +188,18 @@ asmlinkage void unhandled_exception(struct pt_regs *regs, int cause) pr_emerg("opcode: 0x%08lx\n", *(unsigned long *)(regs->ea)); } + +asmlinkage void handle_trap_1_c(struct pt_regs *fp) +{ + _send_sig(SIGUSR1, 0, fp->ea); +} + +asmlinkage void handle_trap_2_c(struct pt_regs *fp) +{ + _send_sig(SIGUSR2, 0, fp->ea); +} + +asmlinkage void handle_trap_3_c(struct pt_regs *fp) +{ + _send_sig(SIGILL, ILL_ILLTRP, fp->ea); +} diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 796642932e2e..223cdcc8203f 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -58,9 +58,6 @@ static void __invalidate_dcache(unsigned long start, unsigned long end) end += (cpuinfo.dcache_line_size - 1); end &= ~(cpuinfo.dcache_line_size - 1); - if (end > start + cpuinfo.dcache_size) - end = start + cpuinfo.dcache_size; - for (addr = start; addr < end; addr += cpuinfo.dcache_line_size) { __asm__ __volatile__ (" initda 0(%0)\n" : /* Outputs */ @@ -131,12 +128,14 @@ void flush_cache_dup_mm(struct mm_struct *mm) void flush_icache_range(unsigned long start, unsigned long end) { + __flush_dcache(start, end); __flush_icache(start, end); } void flush_dcache_range(unsigned long start, unsigned long end) { __flush_dcache(start, end); + __flush_icache(start, end); } EXPORT_SYMBOL(flush_dcache_range); @@ -159,6 +158,7 @@ void flush_icache_page(struct vm_area_struct *vma, struct page *page) unsigned long start = (unsigned long) page_address(page); unsigned long end = start + PAGE_SIZE; + __flush_dcache(start, end); __flush_icache(start, end); } @@ -173,6 +173,18 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, __flush_icache(start, end); } +void __flush_dcache_page(struct address_space *mapping, struct page *page) +{ + /* + * Writeback any data associated with the kernel mapping of this + * page. This ensures that data in the physical page is mutually + * coherent with the kernels mapping. 
+ */ + unsigned long start = (unsigned long)page_address(page); + + __flush_dcache_all(start, start + PAGE_SIZE); +} + void flush_dcache_page(struct page *page) { struct address_space *mapping; @@ -190,11 +202,12 @@ void flush_dcache_page(struct page *page) if (mapping && !mapping_mapped(mapping)) { clear_bit(PG_dcache_clean, &page->flags); } else { - unsigned long start = (unsigned long)page_address(page); - - __flush_dcache_all(start, start + PAGE_SIZE); - if (mapping) + __flush_dcache_page(mapping, page); + if (mapping) { + unsigned long start = (unsigned long)page_address(page); flush_aliases(mapping, page); + flush_icache_range(start, start + PAGE_SIZE); + } set_bit(PG_dcache_clean, &page->flags); } } @@ -205,6 +218,7 @@ void update_mmu_cache(struct vm_area_struct *vma, { unsigned long pfn = pte_pfn(*pte); struct page *page; + struct address_space *mapping; if (!pfn_valid(pfn)) return; @@ -217,16 +231,15 @@ void update_mmu_cache(struct vm_area_struct *vma, if (page == ZERO_PAGE(0)) return; - if (!PageReserved(page) && - !test_and_set_bit(PG_dcache_clean, &page->flags)) { - unsigned long start = page_to_virt(page); - struct address_space *mapping; - - __flush_dcache(start, start + PAGE_SIZE); - - mapping = page_mapping(page); - if (mapping) - flush_aliases(mapping, page); + mapping = page_mapping(page); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + __flush_dcache_page(mapping, page); + + if (mapping) + { + flush_aliases(mapping, page); + if (vma->vm_flags & VM_EXEC) + flush_icache_page(vma, page); } } @@ -234,15 +247,19 @@ void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, struct page *to) { __flush_dcache(vaddr, vaddr + PAGE_SIZE); + __flush_icache(vaddr, vaddr + PAGE_SIZE); copy_page(vto, vfrom); __flush_dcache((unsigned long)vto, (unsigned long)vto + PAGE_SIZE); + __flush_icache((unsigned long)vto, (unsigned long)vto + PAGE_SIZE); } void clear_user_page(void *addr, unsigned long vaddr, struct page *page) { __flush_dcache(vaddr, vaddr + PAGE_SIZE); + __flush_icache(vaddr, vaddr + PAGE_SIZE); clear_page(addr); __flush_dcache((unsigned long)addr, (unsigned long)addr + PAGE_SIZE); + __flush_icache((unsigned long)addr, (unsigned long)addr + PAGE_SIZE); } void copy_from_user_page(struct vm_area_struct *vma, struct page *page, @@ -251,7 +268,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, { flush_cache_page(vma, user_vaddr, page_to_pfn(page)); memcpy(dst, src, len); - __flush_dcache((unsigned long)src, (unsigned long)src + len); + __flush_dcache_all((unsigned long)src, (unsigned long)src + len); if (vma->vm_flags & VM_EXEC) __flush_icache((unsigned long)src, (unsigned long)src + len); } @@ -262,7 +279,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, { flush_cache_page(vma, user_vaddr, page_to_pfn(page)); memcpy(dst, src, len); - __flush_dcache((unsigned long)dst, (unsigned long)dst + len); + __flush_dcache_all((unsigned long)dst, (unsigned long)dst + len); if (vma->vm_flags & VM_EXEC) __flush_icache((unsigned long)dst, (unsigned long)dst + len); } diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index bde531103638..0cc6eedc4780 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -30,8 +30,6 @@ static inline int arch_has_random(void) return !!ppc_md.get_random_long; } -int powernv_get_random_long(unsigned long *v); - static inline int arch_get_random_seed_long(unsigned long *v) { return 0; @@ -47,4 +45,13 @@ static inline int 
arch_has_random_seed(void) #endif /* CONFIG_ARCH_RANDOM */ +#ifdef CONFIG_PPC_POWERNV +int powernv_hwrng_present(void); +int powernv_get_random_long(unsigned long *v); +int powernv_get_random_real_mode(unsigned long *v); +#else +static inline int powernv_hwrng_present(void) { return 0; } +static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; } +#endif + #endif /* _ASM_POWERPC_ARCHRANDOM_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 993090422690..b91e74a817d8 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -288,6 +288,9 @@ static inline bool kvmppc_supports_magic_page(struct kvm_vcpu *vcpu) return !is_kvmppc_hv_enabled(vcpu->kvm); } +extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu); +extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); + /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 14619a59ec09..3536d12eb798 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -85,6 +85,20 @@ static inline long try_lock_hpte(__be64 *hpte, unsigned long bits) return old == 0; } +static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v) +{ + hpte_v &= ~HPTE_V_HVLOCK; + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hpte[0] = cpu_to_be64(hpte_v); +} + +/* Without barrier */ +static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v) +{ + hpte_v &= ~HPTE_V_HVLOCK; + hpte[0] = cpu_to_be64(hpte_v); +} + static inline int __hpte_actual_psize(unsigned int lp, int psize) { int i, shift; @@ -281,16 +295,17 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) /* * If it's present and writable, atomically set dirty and referenced bits and - * return the PTE, otherwise return 0. If we find a transparent hugepage - * and if it is marked splitting we return 0; + * return the PTE, otherwise return 0. 
*/ -static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing, - unsigned int hugepage) +static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) { pte_t old_pte, new_pte = __pte(0); while (1) { - old_pte = *ptep; + /* + * Make sure we don't reload from ptep + */ + old_pte = READ_ONCE(*ptep); /* * wait until _PAGE_BUSY is clear then set it atomically */ @@ -298,12 +313,6 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing, cpu_relax(); continue; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* If hugepage and is trans splitting return None */ - if (unlikely(hugepage && - pmd_trans_splitting(pte_pmd(old_pte)))) - return __pte(0); -#endif /* If pte is not present return None */ if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT))) return __pte(0); @@ -424,6 +433,10 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) return rcu_dereference_raw_notrace(kvm->memslots); } +extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); + +extern void kvmhv_rm_send_ipi(int cpu); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c610961720c7..a193a13cf08b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -227,10 +227,8 @@ struct kvm_arch { unsigned long host_sdr1; int tlbie_lock; unsigned long lpcr; - unsigned long rmor; - struct kvm_rma_info *rma; unsigned long vrma_slb_v; - int rma_setup_done; + int hpte_setup_done; u32 hpt_order; atomic_t vcpus_running; u32 online_vcores; @@ -239,6 +237,8 @@ struct kvm_arch { atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; int hpt_cma_alloc; + struct dentry *debugfs_dir; + struct dentry *htab_dentry; #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE struct mutex hpt_mutex; @@ -263,18 +263,15 @@ struct kvm_arch { /* * Struct for a virtual core. - * Note: entry_exit_count combines an entry count in the bottom 8 bits - * and an exit count in the next 8 bits. This is so that we can - * atomically increment the entry count iff the exit count is 0 - * without taking the lock. + * Note: entry_exit_map combines a bitmap of threads that have entered + * in the bottom 8 bits and a bitmap of threads that have exited in the + * next 8 bits. This is so that we can atomically set the entry bit + * iff the exit map is 0 without taking a lock. 
*/ struct kvmppc_vcore { int n_runnable; - int n_busy; int num_threads; - int entry_exit_count; - int n_woken; - int nap_count; + int entry_exit_map; int napping_threads; int first_vcpuid; u16 pcpu; @@ -299,13 +296,14 @@ struct kvmppc_vcore { ulong conferring_threads; }; -#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) -#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) +#define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff) +#define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8) +#define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0) /* Values for vcore_state */ #define VCORE_INACTIVE 0 #define VCORE_SLEEPING 1 -#define VCORE_STARTING 2 +#define VCORE_PREEMPT 2 #define VCORE_RUNNING 3 #define VCORE_EXITING 4 @@ -368,6 +366,14 @@ struct kvmppc_slb { u8 base_page_size; /* MMU_PAGE_xxx */ }; +/* Struct used to accumulate timing information in HV real mode code */ +struct kvmhv_tb_accumulator { + u64 seqcount; /* used to synchronize access, also count * 2 */ + u64 tb_total; /* total time in timebase ticks */ + u64 tb_min; /* min time */ + u64 tb_max; /* max time */ +}; + # ifdef CONFIG_PPC_FSL_BOOK3E #define KVMPPC_BOOKE_IAC_NUM 2 #define KVMPPC_BOOKE_DAC_NUM 2 @@ -656,6 +662,19 @@ struct kvm_vcpu_arch { u32 emul_inst; #endif + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + struct kvmhv_tb_accumulator *cur_activity; /* What we're timing */ + u64 cur_tb_start; /* when it started */ + struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */ + struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */ + struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */ + struct kvmhv_tb_accumulator guest_time; /* guest execution */ + struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ + + struct dentry *debugfs_dir; + struct dentry *debugfs_timings; +#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ }; #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET] diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 46bf652c9169..b8475daad884 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -302,6 +302,8 @@ static inline bool is_kvmppc_hv_enabled(struct kvm *kvm) return kvm->arch.kvm_ops == kvmppc_hv_ops; } +extern int kvmppc_hwrng_present(void); + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. 
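The kvm_host.h change above replaces the old entry/exit counters with entry_exit_map, two 8-bit thread bitmaps packed into one word, so that a thread can atomically mark itself as entered iff no thread has started exiting. A compare-and-swap loop is all it takes. The following sketch in portable C11 atomics is illustrative only; try_enter() and mark_exit() are invented names, and the real logic lives in the HV guest entry and exit paths:

#include <stdatomic.h>
#include <stdbool.h>

/* Low 8 bits: threads that have entered; next 8 bits: threads that have exited. */
#define ENTRY_MAP(x)	((x) & 0xff)
#define EXIT_MAP(x)	(((x) >> 8) & 0xff)

/*
 * Atomically set the entry bit for thread 'tid' (0..7), but only
 * while no thread has set an exit bit -- no lock required.
 */
static bool try_enter(atomic_uint *entry_exit_map, int tid)
{
	unsigned int old = atomic_load(entry_exit_map);

	do {
		if (EXIT_MAP(old))
			return false;	/* the vcore is already exiting */
	} while (!atomic_compare_exchange_weak(entry_exit_map, &old,
					       old | (1u << tid)));
	return true;
}

/* Exiting just ORs in the thread's bit in the upper byte. */
static void mark_exit(atomic_uint *entry_exit_map, int tid)
{
	atomic_fetch_or(entry_exit_map, 1u << (tid + 8));
}

int main(void)
{
	atomic_uint entry_exit_map = 0;

	try_enter(&entry_exit_map, 0);	/* map becomes 0x001 */
	mark_exit(&entry_exit_map, 0);	/* map becomes 0x101 */

	/* A late thread must now be refused: the exit byte is non-zero. */
	return try_enter(&entry_exit_map, 1) ? 1 : 0;
}

Compared with the old entry_exit_count pair of counters, the bitmaps keep the same single-atomic-operation test-and-join while also recording which threads are in each state.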
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9835ac4173b7..11a38635dd65 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -247,28 +247,16 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, #define pmd_large(pmd) 0 #define has_transparent_hugepage() 0 #endif -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, +pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift); - -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva, - unsigned long *pte_sizep) +static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) { - pte_t *ptep; - unsigned long ps = *pte_sizep; - unsigned int shift; - - ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); - if (!ptep) - return NULL; - if (shift) - *pte_sizep = 1ul << shift; - else - *pte_sizep = PAGE_SIZE; - - if (ps > *pte_sizep) - return NULL; - - return ptep; + if (!arch_irqs_disabled()) { + pr_info("%s called with irq enabled\n", __func__); + dump_stack(); + } + return __find_linux_pte_or_hugepte(pgdir, ea, shift); } #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 03cbada59d3a..10fc784a2ad4 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -211,5 +211,8 @@ extern void secondary_cpu_time_init(void); DECLARE_PER_CPU(u64, decrementers_next_tb); +/* Convert timebase ticks to nanoseconds */ +unsigned long long tb_to_ns(unsigned long long tb_ticks); + #endif /* __KERNEL__ */ #endif /* __POWERPC_TIME_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4717859fdd04..0034b6b3556a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -37,6 +37,7 @@ #include <asm/thread_info.h> #include <asm/rtas.h> #include <asm/vdso_datapage.h> +#include <asm/dbell.h> #ifdef CONFIG_PPC64 #include <asm/paca.h> #include <asm/lppaca.h> @@ -459,6 +460,19 @@ int main(void) DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); #endif +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry)); + DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr)); + DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit)); + DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time)); + DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time)); + DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity)); + DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start)); + DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount)); + DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total)); + DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min)); + DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max)); +#endif DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3)); DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); @@ -492,7 +506,6 @@ int main(void) DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); - DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 
DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); @@ -550,8 +563,7 @@ int main(void) DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1)); - DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); - DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); + DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm)); @@ -748,5 +760,7 @@ int main(void) offsetof(struct paca_struct, subcore_sibling_mask)); #endif + DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); + return 0; } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index a4c62eb0ee48..44b480e3a5af 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -334,9 +334,11 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) int hugepage_shift; /* - * We won't find hugepages here, iomem + * We won't find hugepages here (this is iomem). Hence we are not + * worried about _PAGE_SPLITTING/collapse. Also we will not hit + * page table free, because of init_mm. */ - ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); + ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); if (!ptep) return token; WARN_ON(hugepage_shift); diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 24b968f8e4d8..63d9cc4d7366 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -71,15 +71,15 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) vaddr = (unsigned long)PCI_FIX_ADDR(addr); if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - - ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr, + /* + * We won't find huge pages here (iomem). Also can't hit + * a page table free due to init_mm + */ + ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr, &hugepage_shift); if (ptep == NULL) paddr = 0; else { - /* - * we don't have hugepages backing iomem - */ WARN_ON(hugepage_shift); paddr = pte_pfn(*ptep) << PAGE_SHIFT; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2d7b33fab953..56f44848b044 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -608,6 +608,12 @@ void arch_suspend_enable_irqs(void) } #endif +unsigned long long tb_to_ns(unsigned long long ticks) +{ + return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift; +} +EXPORT_SYMBOL_GPL(tb_to_ns); + /* * Scheduler clock - returns current time in nanosec units. * diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 11850f310fb4..3caec2c42105 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -75,7 +75,7 @@ config KVM_BOOK3S_64 config KVM_BOOK3S_64_HV tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" - depends on KVM_BOOK3S_64 + depends on KVM_BOOK3S_64 && PPC_POWERNV select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER select CMA @@ -110,6 +110,20 @@ config KVM_BOOK3S_64_PR processor, including emulating 32-bit processors on a 64-bit host. 
+config KVM_BOOK3S_HV_EXIT_TIMING + bool "Detailed timing for hypervisor real-mode code" + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS + ---help--- + Calculate time taken for each vcpu in the real-mode guest entry, + exit, and interrupt handling code, plus time spent in the guest + and in nap mode due to idle (cede) while other threads are still + in the guest. The total, minimum and maximum times in nanoseconds + together with the number of executions are reported in debugfs in + kvm/vm#/vcpu#/timings. The overhead is of the order of 30 - 40 + ns per exit on POWER8. + + If unsure, say N. + config KVM_BOOKE_HV bool diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index cfbcdc654201..453a8a47a467 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -821,6 +821,82 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) #endif } +int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) +{ + unsigned long size = kvmppc_get_gpr(vcpu, 4); + unsigned long addr = kvmppc_get_gpr(vcpu, 5); + u64 buf; + int ret; + + if (!is_power_of_2(size) || (size > sizeof(buf))) + return H_TOO_HARD; + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf); + if (ret != 0) + return H_TOO_HARD; + + switch (size) { + case 1: + kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf); + break; + + case 2: + kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf)); + break; + + case 4: + kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf)); + break; + + case 8: + kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf)); + break; + + default: + BUG(); + } + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load); + +int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) +{ + unsigned long size = kvmppc_get_gpr(vcpu, 4); + unsigned long addr = kvmppc_get_gpr(vcpu, 5); + unsigned long val = kvmppc_get_gpr(vcpu, 6); + u64 buf; + int ret; + + switch (size) { + case 1: + *(u8 *)&buf = val; + break; + + case 2: + *(__be16 *)&buf = cpu_to_be16(val); + break; + + case 4: + *(__be32 *)&buf = cpu_to_be32(val); + break; + + case 8: + *(__be64 *)&buf = cpu_to_be64(val); + break; + + default: + return H_TOO_HARD; + } + + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf); + if (ret != 0) + return H_TOO_HARD; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store); + int kvmppc_core_check_processor_compat(void) { /* diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 534acb3c6c3d..1a4acf8bf4f4 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -27,6 +27,7 @@ #include <linux/srcu.h> #include <linux/anon_inodes.h> #include <linux/file.h> +#include <linux/debugfs.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -116,12 +117,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) long order; mutex_lock(&kvm->lock); - if (kvm->arch.rma_setup_done) { - kvm->arch.rma_setup_done = 0; - /* order rma_setup_done vs. vcpus_running */ + if (kvm->arch.hpte_setup_done) { + kvm->arch.hpte_setup_done = 0; + /* order hpte_setup_done vs. 
vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; goto out; } } @@ -338,9 +339,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; gr = kvm->arch.revmap[index].guest_rpte; - /* Unlock the HPTE */ - asm volatile("lwsync" : : : "memory"); - hptep[0] = cpu_to_be64(v); + unlock_hpte(hptep, v); preempt_enable(); gpte->eaddr = eaddr; @@ -469,8 +468,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; hpte[1] = be64_to_cpu(hptep[1]); hpte[2] = r = rev->guest_rpte; - asm volatile("lwsync" : : : "memory"); - hptep[0] = cpu_to_be64(hpte[0]); + unlock_hpte(hptep, hpte[0]); preempt_enable(); if (hpte[0] != vcpu->arch.pgfault_hpte[0] || @@ -537,23 +535,21 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } /* if the guest wants write access, see if that is OK */ if (!writing && hpte_is_writable(r)) { - unsigned int hugepage_shift; pte_t *ptep, pte; - + unsigned long flags; /* * We need to protect against page table destruction - * while looking up and updating the pte. + * and against hugepage split and collapse. */ - rcu_read_lock_sched(); + local_irq_save(flags); ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, &hugepage_shift); + hva, NULL); if (ptep) { - pte = kvmppc_read_update_linux_pte(ptep, 1, - hugepage_shift); + pte = kvmppc_read_update_linux_pte(ptep, 1); if (pte_write(pte)) write_ok = 1; } - rcu_read_unlock_sched(); + local_irq_restore(flags); } } @@ -621,7 +617,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, hptep[1] = cpu_to_be64(r); eieio(); - hptep[0] = cpu_to_be64(hpte[0]); + __unlock_hpte(hptep, hpte[0]); asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) @@ -642,7 +638,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; out_unlock: - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); preempt_enable(); goto out_put; } @@ -771,7 +767,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, } } unlock_rmap(rmapp); - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } return 0; } @@ -857,7 +853,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, } ret = 1; } - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } while ((i = j) != head); unlock_rmap(rmapp); @@ -974,8 +970,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) /* Now check and modify the HPTE */ if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { - /* unlock and continue */ - hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); continue; } @@ -996,9 +991,9 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) npages_dirty = n; eieio(); } - v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); + v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; - hptep[0] = cpu_to_be64(v); + __unlock_hpte(hptep, v); } while ((i = j) != head); unlock_rmap(rmapp); @@ -1218,8 +1213,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp, r &= ~HPTE_GR_MODIFIED; revp->guest_rpte = r; } - asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); - hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + unlock_hpte(hptp, be64_to_cpu(hptp[0])); preempt_enable(); if (!(valid == want_valid 
&& (first_pass || dirty))) ok = 0; @@ -1339,20 +1333,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, unsigned long tmp[2]; ssize_t nb; long int err, ret; - int rma_setup; + int hpte_setup; if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; /* lock out vcpus from running while we're doing this */ mutex_lock(&kvm->lock); - rma_setup = kvm->arch.rma_setup_done; - if (rma_setup) { - kvm->arch.rma_setup_done = 0; /* temporarily */ - /* order rma_setup_done vs. vcpus_running */ + hpte_setup = kvm->arch.hpte_setup_done; + if (hpte_setup) { + kvm->arch.hpte_setup_done = 0; /* temporarily */ + /* order hpte_setup_done vs. vcpus_running */ smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) { - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; mutex_unlock(&kvm->lock); return -EBUSY; } @@ -1405,7 +1399,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, "r=%lx\n", ret, i, v, r); goto out; } - if (!rma_setup && is_vrma_hpte(v)) { + if (!hpte_setup && is_vrma_hpte(v)) { unsigned long psize = hpte_base_page_size(v, r); unsigned long senc = slb_pgsize_encoding(psize); unsigned long lpcr; @@ -1414,7 +1408,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, (VRMA_VSID << SLB_VSID_SHIFT_1T); lpcr = senc << (LPCR_VRMASD_SH - 4); kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - rma_setup = 1; + hpte_setup = 1; } ++i; hptp += 2; @@ -1430,9 +1424,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, } out: - /* Order HPTE updates vs. rma_setup_done */ + /* Order HPTE updates vs. hpte_setup_done */ smp_wmb(); - kvm->arch.rma_setup_done = rma_setup; + kvm->arch.hpte_setup_done = hpte_setup; mutex_unlock(&kvm->lock); if (err) @@ -1495,6 +1489,141 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) return ret; } +struct debugfs_htab_state { + struct kvm *kvm; + struct mutex mutex; + unsigned long hpt_index; + int chars_left; + int buf_index; + char buf[64]; +}; + +static int debugfs_htab_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + struct debugfs_htab_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(kvm); + p->kvm = kvm; + mutex_init(&p->mutex); + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_htab_release(struct inode *inode, struct file *file) +{ + struct debugfs_htab_state *p = file->private_data; + + kvm_put_kvm(p->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_htab_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_htab_state *p = file->private_data; + ssize_t ret, r; + unsigned long i, n; + unsigned long v, hr, gr; + struct kvm *kvm; + __be64 *hptp; + + ret = mutex_lock_interruptible(&p->mutex); + if (ret) + return ret; + + if (p->chars_left) { + n = p->chars_left; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf + p->buf_index, n); + n -= r; + p->chars_left -= n; + p->buf_index += n; + buf += n; + len -= n; + ret = n; + if (r) { + if (!n) + ret = -EFAULT; + goto out; + } + } + + kvm = p->kvm; + i = p->hpt_index; + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { + if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) + continue; + + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + cpu_relax(); + v = be64_to_cpu(hptp[0]) & 
~HPTE_V_HVLOCK; + hr = be64_to_cpu(hptp[1]); + gr = kvm->arch.revmap[i].guest_rpte; + unlock_hpte(hptp, v); + preempt_enable(); + + if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) + continue; + + n = scnprintf(p->buf, sizeof(p->buf), + "%6lx %.16lx %.16lx %.16lx\n", + i, v, hr, gr); + p->chars_left = n; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf, n); + n -= r; + p->chars_left -= n; + p->buf_index = n; + buf += n; + len -= n; + ret += n; + if (r) { + if (!ret) + ret = -EFAULT; + goto out; + } + } + p->hpt_index = i; + + out: + mutex_unlock(&p->mutex); + return ret; +} + +ssize_t debugfs_htab_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_htab_fops = { + .owner = THIS_MODULE, + .open = debugfs_htab_open, + .release = debugfs_htab_release, + .read = debugfs_htab_read, + .write = debugfs_htab_write, + .llseek = generic_file_llseek, +}; + +void kvmppc_mmu_debugfs_init(struct kvm *kvm) +{ + kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, + kvm->arch.debugfs_dir, kvm, + &debugfs_htab_fops); +} + void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index de747563d29d..48d3c5d2ecc9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -32,6 +32,7 @@ #include <linux/page-flags.h> #include <linux/srcu.h> #include <linux/miscdevice.h> +#include <linux/debugfs.h> #include <asm/reg.h> #include <asm/cputable.h> @@ -50,6 +51,7 @@ #include <asm/hvcall.h> #include <asm/switch_to.h> #include <asm/smp.h> +#include <asm/dbell.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> @@ -83,9 +85,35 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static bool kvmppc_ipi_thread(int cpu) +{ + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + preempt_disable(); + if (cpu_first_thread_sibling(cpu) == + cpu_first_thread_sibling(smp_processor_id())) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + msg |= cpu_thread_in_core(cpu); + smp_mb(); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + preempt_enable(); + return true; + } + preempt_enable(); + } + +#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) + if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) { + xics_wake_cpu(cpu); + return true; + } +#endif + + return false; +} + static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { - int me; int cpu = vcpu->cpu; wait_queue_head_t *wqp; @@ -95,20 +123,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } - me = get_cpu(); + if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid)) + return; /* CPU points to the first thread of the core */ - if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) { -#ifdef CONFIG_PPC_ICP_NATIVE - int real_cpu = cpu + vcpu->arch.ptid; - if (paca[real_cpu].kvm_hstate.xics_phys) - xics_wake_cpu(real_cpu); - else -#endif - if (cpu_online(cpu)) - smp_send_reschedule(cpu); - } - put_cpu(); + if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu)) + smp_send_reschedule(cpu); } /* @@ -706,6 +726,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) /* Send the error out to userspace via KVM_RUN */ return rc; + case H_LOGICAL_CI_LOAD: + ret = 
kvmppc_h_logical_ci_load(vcpu); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_LOGICAL_CI_STORE: + ret = kvmppc_h_logical_ci_store(vcpu); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; case H_SET_MODE: ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), @@ -740,6 +770,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd) case H_CONFER: case H_REGISTER_VPA: case H_SET_MODE: + case H_LOGICAL_CI_LOAD: + case H_LOGICAL_CI_STORE: #ifdef CONFIG_KVM_XICS case H_XIRR: case H_CPPR: @@ -1410,6 +1442,154 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) return vcore; } +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +static struct debugfs_timings_element { + const char *name; + size_t offset; +} timings[] = { + {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, + {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, + {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, + {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, + {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, +}; + +#define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) + +struct debugfs_timings_state { + struct kvm_vcpu *vcpu; + unsigned int buflen; + char buf[N_TIMINGS * 100]; +}; + +static int debugfs_timings_open(struct inode *inode, struct file *file) +{ + struct kvm_vcpu *vcpu = inode->i_private; + struct debugfs_timings_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(vcpu->kvm); + p->vcpu = vcpu; + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_timings_release(struct inode *inode, struct file *file) +{ + struct debugfs_timings_state *p = file->private_data; + + kvm_put_kvm(p->vcpu->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_timings_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_timings_state *p = file->private_data; + struct kvm_vcpu *vcpu = p->vcpu; + char *s, *buf_end; + struct kvmhv_tb_accumulator tb; + u64 count; + loff_t pos; + ssize_t n; + int i, loops; + bool ok; + + if (!p->buflen) { + s = p->buf; + buf_end = s + sizeof(p->buf); + for (i = 0; i < N_TIMINGS; ++i) { + struct kvmhv_tb_accumulator *acc; + + acc = (struct kvmhv_tb_accumulator *) + ((unsigned long)vcpu + timings[i].offset); + ok = false; + for (loops = 0; loops < 1000; ++loops) { + count = acc->seqcount; + if (!(count & 1)) { + smp_rmb(); + tb = *acc; + smp_rmb(); + if (count == acc->seqcount) { + ok = true; + break; + } + } + udelay(1); + } + if (!ok) + snprintf(s, buf_end - s, "%s: stuck\n", + timings[i].name); + else + snprintf(s, buf_end - s, + "%s: %llu %llu %llu %llu\n", + timings[i].name, count / 2, + tb_to_ns(tb.tb_total), + tb_to_ns(tb.tb_min), + tb_to_ns(tb.tb_max)); + s += strlen(s); + } + p->buflen = s - p->buf; + } + + pos = *ppos; + if (pos >= p->buflen) + return 0; + if (len > p->buflen - pos) + len = p->buflen - pos; + n = copy_to_user(buf, p->buf + pos, len); + if (n) { + if (n == len) + return -EFAULT; + len -= n; + } + *ppos = pos + len; + return len; +} + +static ssize_t debugfs_timings_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_timings_ops = { + .owner = THIS_MODULE, + .open = debugfs_timings_open, + .release = debugfs_timings_release, + .read = debugfs_timings_read, + .write = debugfs_timings_write, + .llseek = generic_file_llseek, +}; + +/* Create a debugfs directory for the vcpu */ +static void 
debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +{ + char buf[16]; + struct kvm *kvm = vcpu->kvm; + + snprintf(buf, sizeof(buf), "vcpu%u", id); + if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) + return; + vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); + if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) + return; + vcpu->arch.debugfs_timings = + debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, + vcpu, &debugfs_timings_ops); +} + +#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ +static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +{ +} +#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ + static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id) { @@ -1479,6 +1659,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); + debugfs_vcpu_init(vcpu, id); + return vcpu; free_vcpu: @@ -1566,8 +1748,10 @@ static int kvmppc_grab_hwthread(int cpu) tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ - tpaca->kvm_hstate.hwthread_req = 1; tpaca->kvm_hstate.kvm_vcpu = NULL; + tpaca->kvm_hstate.napping = 0; + smp_wmb(); + tpaca->kvm_hstate.hwthread_req = 1; /* * If the thread is already executing in the kernel (e.g. handling @@ -1610,35 +1794,41 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) } cpu = vc->pcpu + vcpu->arch.ptid; tpaca = &paca[cpu]; - tpaca->kvm_hstate.kvm_vcpu = vcpu; tpaca->kvm_hstate.kvm_vcore = vc; tpaca->kvm_hstate.ptid = vcpu->arch.ptid; vcpu->cpu = vc->pcpu; + /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */ smp_wmb(); -#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) - if (cpu != smp_processor_id()) { - xics_wake_cpu(cpu); - if (vcpu->arch.ptid) - ++vc->n_woken; - } -#endif + tpaca->kvm_hstate.kvm_vcpu = vcpu; + if (cpu != smp_processor_id()) + kvmppc_ipi_thread(cpu); } -static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) +static void kvmppc_wait_for_nap(void) { - int i; + int cpu = smp_processor_id(); + int i, loops; - HMT_low(); - i = 0; - while (vc->nap_count < vc->n_woken) { - if (++i >= 1000000) { - pr_err("kvmppc_wait_for_nap timeout %d %d\n", - vc->nap_count, vc->n_woken); - break; + for (loops = 0; loops < 1000000; ++loops) { + /* + * Check if all threads are finished. + * We set the vcpu pointer when starting a thread + * and the thread clears it when finished, so we look + * for any threads that still have a non-NULL vcpu ptr. 
+ */ + for (i = 1; i < threads_per_subcore; ++i) + if (paca[cpu + i].kvm_hstate.kvm_vcpu) + break; + if (i == threads_per_subcore) { + HMT_medium(); + return; } - cpu_relax(); + HMT_low(); } HMT_medium(); + for (i = 1; i < threads_per_subcore; ++i) + if (paca[cpu + i].kvm_hstate.kvm_vcpu) + pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); } /* @@ -1700,63 +1890,103 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc) mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE); } +static void prepare_threads(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu, *vnext; + + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + if (signal_pending(vcpu->arch.run_task)) + vcpu->arch.ret = -EINTR; + else if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + vcpu->arch.ret = RESUME_GUEST; + else + continue; + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } +} + +static void post_guest_process(struct kvmppc_vcore *vc) +{ + u64 now; + long ret; + struct kvm_vcpu *vcpu, *vnext; + + now = get_tb(); + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + /* cancel pending dec exception if dec is positive */ + if (now < vcpu->arch.dec_expires && + kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + trace_kvm_guest_exit(vcpu); + + ret = RESUME_GUEST; + if (vcpu->arch.trap) + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); + + vcpu->arch.ret = ret; + vcpu->arch.trap = 0; + + if (vcpu->arch.ceded) { + if (!is_kvmppc_resume_guest(ret)) + kvmppc_end_cede(vcpu); + else + kvmppc_set_timer(vcpu); + } + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } + } +} + /* * Run a set of guest threads on a physical core. * Called with vc->lock held. */ -static void kvmppc_run_core(struct kvmppc_vcore *vc) +static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; - long ret; - u64 now; - int i, need_vpa_update; + struct kvm_vcpu *vcpu; + int i; int srcu_idx; - struct kvm_vcpu *vcpus_to_update[threads_per_core]; - /* don't start if any threads have a signal pending */ - need_vpa_update = 0; - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - if (signal_pending(vcpu->arch.run_task)) - return; - if (vcpu->arch.vpa.update_pending || - vcpu->arch.slb_shadow.update_pending || - vcpu->arch.dtl.update_pending) - vcpus_to_update[need_vpa_update++] = vcpu; - } + /* + * Remove from the list any threads that have a signal pending + * or need a VPA update done + */ + prepare_threads(vc); + + /* if the runner is no longer runnable, let the caller pick a new one */ + if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) + return; /* - * Initialize *vc, in particular vc->vcore_state, so we can - * drop the vcore lock if necessary. + * Initialize *vc. */ - vc->n_woken = 0; - vc->nap_count = 0; - vc->entry_exit_count = 0; + vc->entry_exit_map = 0; vc->preempt_tb = TB_NIL; - vc->vcore_state = VCORE_STARTING; vc->in_guest = 0; vc->napping_threads = 0; vc->conferring_threads = 0; /* - * Updating any of the vpas requires calling kvmppc_pin_guest_page, - * which can't be called with any spinlocks held. 
- */ - if (need_vpa_update) { - spin_unlock(&vc->lock); - for (i = 0; i < need_vpa_update; ++i) - kvmppc_update_vpas(vcpus_to_update[i]); - spin_lock(&vc->lock); - } - - /* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. */ if ((threads_per_core > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { vcpu->arch.ret = -EBUSY; + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } goto out; } @@ -1797,8 +2027,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) vcpu->cpu = -1; /* wait for secondary threads to finish writing their state to memory */ - if (vc->nap_count < vc->n_woken) - kvmppc_wait_for_nap(vc); + kvmppc_wait_for_nap(); for (i = 0; i < threads_per_subcore; ++i) kvmppc_release_hwthread(vc->pcpu + i); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ @@ -1812,44 +2041,12 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) kvm_guest_exit(); preempt_enable(); - cond_resched(); spin_lock(&vc->lock); - now = get_tb(); - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - /* cancel pending dec exception if dec is positive */ - if (now < vcpu->arch.dec_expires && - kvmppc_core_pending_dec(vcpu)) - kvmppc_core_dequeue_dec(vcpu); - - trace_kvm_guest_exit(vcpu); - - ret = RESUME_GUEST; - if (vcpu->arch.trap) - ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); - - vcpu->arch.ret = ret; - vcpu->arch.trap = 0; - - if (vcpu->arch.ceded) { - if (!is_kvmppc_resume_guest(ret)) - kvmppc_end_cede(vcpu); - else - kvmppc_set_timer(vcpu); - } - } + post_guest_process(vc); out: vc->vcore_state = VCORE_INACTIVE; - list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, - arch.run_list) { - if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { - kvmppc_remove_runnable(vc, vcpu); - wake_up(&vcpu->arch.cpu_run); - } - } - trace_kvmppc_run_core(vc, 1); } @@ -1939,8 +2136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * this thread straight away and have it join in. 
*/ if (!signal_pending(current)) { - if (vc->vcore_state == VCORE_RUNNING && - VCORE_EXIT_COUNT(vc) == 0) { + if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); trace_kvm_guest_enter(vcpu); @@ -1971,7 +2167,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) break; - vc->runner = vcpu; n_ceded = 0; list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { if (!v->arch.pending_exceptions) @@ -1979,10 +2174,17 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) else v->arch.ceded = 0; } - if (n_ceded == vc->n_runnable) + vc->runner = vcpu; + if (n_ceded == vc->n_runnable) { kvmppc_vcore_blocked(vc); - else + } else if (should_resched()) { + vc->vcore_state = VCORE_PREEMPT; + /* Let something else run */ + cond_resched_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; + } else { kvmppc_run_core(vc); + } vc->runner = NULL; } @@ -2032,11 +2234,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) } atomic_inc(&vcpu->kvm->arch.vcpus_running); - /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */ + /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */ smp_mb(); /* On the first time here, set up HTAB and VRMA */ - if (!vcpu->kvm->arch.rma_setup_done) { + if (!vcpu->kvm->arch.hpte_setup_done) { r = kvmppc_hv_setup_htab_rma(vcpu); if (r) goto out; @@ -2238,7 +2440,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) int srcu_idx; mutex_lock(&kvm->lock); - if (kvm->arch.rma_setup_done) + if (kvm->arch.hpte_setup_done) goto out; /* another vcpu beat us to it */ /* Allocate hashed page table (if not done already) and reset it */ @@ -2289,9 +2491,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); - /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ + /* Order updates to kvm->arch.lpcr etc. vs. 
hpte_setup_done */ smp_wmb(); - kvm->arch.rma_setup_done = 1; + kvm->arch.hpte_setup_done = 1; err = 0; out_srcu: srcu_read_unlock(&kvm->srcu, srcu_idx); @@ -2307,6 +2509,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; + char buf[32]; /* Allocate the guest's logical partition ID */ @@ -2347,6 +2550,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) */ kvm_hv_vm_activated(); + /* + * Create a debugfs directory for the VM + */ + snprintf(buf, sizeof(buf), "vm%d", current->pid); + kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); + if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) + kvmppc_mmu_debugfs_init(kvm); + return 0; } @@ -2367,6 +2578,8 @@ static void kvmppc_free_vcores(struct kvm *kvm) static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { + debugfs_remove_recursive(kvm->arch.debugfs_dir); + kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 1f083ff8a61a..ed2589d4593f 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -21,6 +21,10 @@ #include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#include <asm/archrandom.h> +#include <asm/xics.h> +#include <asm/dbell.h> +#include <asm/cputhreads.h> #define KVM_CMA_CHUNK_ORDER 18 @@ -114,11 +118,11 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, int rv = H_SUCCESS; /* => don't yield */ set_bit(vcpu->arch.ptid, &vc->conferring_threads); - while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) { - threads_running = VCORE_ENTRY_COUNT(vc); - threads_ceded = hweight32(vc->napping_threads); - threads_conferring = hweight32(vc->conferring_threads); - if (threads_ceded + threads_conferring >= threads_running) { + while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) { + threads_running = VCORE_ENTRY_MAP(vc); + threads_ceded = vc->napping_threads; + threads_conferring = vc->conferring_threads; + if ((threads_ceded | threads_conferring) == threads_running) { rv = H_TOO_HARD; /* => do yield */ break; } @@ -169,3 +173,89 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd) return 0; } EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); + +int kvmppc_hwrng_present(void) +{ + return powernv_hwrng_present(); +} +EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); + +long kvmppc_h_random(struct kvm_vcpu *vcpu) +{ + if (powernv_get_random_real_mode(&vcpu->arch.gpr[4])) + return H_SUCCESS; + + return H_HARDWARE; +} + +static inline void rm_writeb(unsigned long paddr, u8 val) +{ + __asm__ __volatile__("stbcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +/* + * Send an interrupt or message to another CPU. + * This can only be called in real mode. + * The caller needs to include any barrier needed to order writes + * to memory vs. the IPI/message. 
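+ * As an illustrative sketch (not code from this patch), a caller is
+ * expected to look like:
+ *
+ *	update the state the target thread must observe;
+ *	smp_mb();		order that update vs. the msgsnd/IPI
+ *	kvmhv_rm_send_ipi(cpu);
+ *
+ * which is exactly what icp_rm_set_vcpu_irq() and
+ * kvmhv_interrupt_vcore() below do.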
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+	unsigned long xics_phys;
+
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+	    cpu_first_thread_sibling(cpu) ==
+	    cpu_first_thread_sibling(raw_smp_processor_id())) {
+		unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+		msg |= cpu_thread_in_core(cpu);
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return;
+	}
+
+	/* Else poke the target with an IPI */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+	int cpu = vc->pcpu;
+
+	/* Order setting of exit map vs. msgsnd/IPI */
+	smp_mb();
+	for (; active; active >>= 1, ++cpu)
+		if (active & 1)
+			kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+	int ptid = local_paca->kvm_hstate.ptid;
+	int me, ee;
+
+	/* Set our bit in the threads-exiting-guest map in the 0xff00
+	   bits of vcore->entry_exit_map */
+	me = 0x100 << ptid;
+	do {
+		ee = vc->entry_exit_map;
+	} while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+	/* Are we the first here? */
+	if ((ee >> 8) != 0)
+		return;
+
+	/*
+	 * Trigger the other threads in this vcore to exit the guest.
+	 * If this is a hypervisor decrementer interrupt then they
+	 * will already be on their way out of the guest.
+	 */
+	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e4d3b0..b027a89737b6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -26,11 +26,14 @@ static void *real_vmalloc_addr(void *x)
 {
 	unsigned long addr = (unsigned long) x;
 	pte_t *p;
-
-	p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+	/*
+	 * Assume we don't have huge pages in vmalloc space...
+	 * So don't worry about THP collapse/split. Called
+	 * only in real mode, hence we won't need irq_save/restore.
+	 */
+	p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
 	if (!p || !pte_present(*p))
 		return NULL;
-	/* assume we don't have huge pages in vmalloc space...
*/
 	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
 	return __va(addr);
 }
@@ -131,31 +134,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	unlock_rmap(rmap);
 }
 
-static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
-			int writing, unsigned long *pte_sizep)
-{
-	pte_t *ptep;
-	unsigned long ps = *pte_sizep;
-	unsigned int hugepage_shift;
-
-	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
-	if (!ptep)
-		return __pte(0);
-	if (hugepage_shift)
-		*pte_sizep = 1ul << hugepage_shift;
-	else
-		*pte_sizep = PAGE_SIZE;
-	if (ps > *pte_sizep)
-		return __pte(0);
-	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
-}
-
-static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
-{
-	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-	hpte[0] = cpu_to_be64(hpte_v);
-}
-
 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 		       long pte_index, unsigned long pteh, unsigned long ptel,
 		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
@@ -166,13 +144,13 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 	struct revmap_entry *rev;
 	unsigned long g_ptel;
 	struct kvm_memory_slot *memslot;
-	unsigned long pte_size;
+	unsigned hpage_shift;
 	unsigned long is_io;
 	unsigned long *rmap;
-	pte_t pte;
+	pte_t *ptep;
 	unsigned int writing;
 	unsigned long mmu_seq;
-	unsigned long rcbits;
+	unsigned long rcbits, irq_flags = 0;
 
 	psize = hpte_page_size(pteh, ptel);
 	if (!psize)
@@ -208,22 +186,46 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	/* Translate to host virtual address */
 	hva = __gfn_to_hva_memslot(memslot, gfn);
-
-	/* Look up the Linux PTE for the backing page */
-	pte_size = psize;
-	pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
-	if (pte_present(pte) && !pte_protnone(pte)) {
-		if (writing && !pte_write(pte))
-			/* make the actual HPTE be read-only */
-			ptel = hpte_make_readonly(ptel);
-		is_io = hpte_cache_bits(pte_val(pte));
-		pa = pte_pfn(pte) << PAGE_SHIFT;
-		pa |= hva & (pte_size - 1);
-		pa |= gpa & ~PAGE_MASK;
+	/*
+	 * If we had a page table change after the lookup, we would
+	 * retry via mmu_notifier_retry.
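+	 * (The usual KVM sequence, sketched here only for reference;
+	 * the mmu_seq snapshot itself is taken near the top of this
+	 * function:
+	 *
+	 *	mmu_seq = kvm->mmu_notifier_seq;
+	 *	smp_rmb();
+	 *	... walk the page tables and build the HPTE ...
+	 *	if (mmu_notifier_retry(kvm, mmu_seq))
+	 *		back out and let the guest retry the hcall
+	 * )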
+	 */
+	if (realmode)
+		ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+	else {
+		local_irq_save(irq_flags);
+		ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
 	}
+	if (ptep) {
+		pte_t pte;
+		unsigned int host_pte_size;
 
-	if (pte_size < psize)
-		return H_PARAMETER;
+		if (hpage_shift)
+			host_pte_size = 1ul << hpage_shift;
+		else
+			host_pte_size = PAGE_SIZE;
+		/*
+		 * We should always find that the guest page size is <=
+		 * the host page size, if the host is using hugepages.
+		 */
+		if (host_pte_size < psize) {
+			if (!realmode)
+				local_irq_restore(irq_flags);
+			return H_PARAMETER;
+		}
+		pte = kvmppc_read_update_linux_pte(ptep, writing);
+		if (pte_present(pte) && !pte_protnone(pte)) {
+			if (writing && !pte_write(pte))
+				/* make the actual HPTE be read-only */
+				ptel = hpte_make_readonly(ptel);
+			is_io = hpte_cache_bits(pte_val(pte));
+			pa = pte_pfn(pte) << PAGE_SHIFT;
+			pa |= hva & (host_pte_size - 1);
+			pa |= gpa & ~PAGE_MASK;
+		}
+	}
+	if (!realmode)
+		local_irq_restore(irq_flags);
 
 	ptel &= ~(HPTE_R_PP0 - psize);
 	ptel |= pa;
@@ -271,10 +273,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 			u64 pte;
 			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 				cpu_relax();
-			pte = be64_to_cpu(*hpte);
+			pte = be64_to_cpu(hpte[0]);
 			if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
 				break;
-			*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+			__unlock_hpte(hpte, pte);
 			hpte += 2;
 		}
 		if (i == 8)
@@ -290,9 +292,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 		while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 			cpu_relax();
-		pte = be64_to_cpu(*hpte);
+		pte = be64_to_cpu(hpte[0]);
 		if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
-			*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+			__unlock_hpte(hpte, pte);
 			return H_PTEG_FULL;
 		}
 	}
@@ -331,7 +333,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
 	eieio();
-	hpte[0] = cpu_to_be64(pteh);
+	__unlock_hpte(hpte, pteh);
 	asm volatile("ptesync" : : : "memory");
 
 	*pte_idx_ret = pte_index;
@@ -412,7 +414,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
 	    ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
-		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hpte, pte);
 		return H_NOT_FOUND;
 	}
 
@@ -548,7 +550,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 				be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
 			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
 			args[j] |= rcbits << (56 - 5);
-			hp[0] = 0;
+			__unlock_hpte(hp, 0);
 		}
 	}
 
@@ -574,7 +576,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	pte = be64_to_cpu(hpte[0]);
 	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
-		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hpte, pte);
 		return H_NOT_FOUND;
 	}
 
@@ -755,8 +757,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 			/* Return with the HPTE still locked */
 			return (hash << 3) + (i >> 1);
 
-		/* Unlock and move on */
-		hpte[i] = cpu_to_be64(v);
+		__unlock_hpte(&hpte[i], v);
 	}
 
 	if (val & HPTE_V_SECONDARY)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 7c22997de906..00e45b6d4f24 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -23,17 +23,37 @@
 
 #define DEBUG_PASSUP
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+ u32 new_irq); + +/* -- ICS routines -- */ +static void ics_rm_check_resend(struct kvmppc_xics *xics, + struct kvmppc_ics *ics, struct kvmppc_icp *icp) { - __asm__ __volatile__("sync; stbcix %0,0,%1" - : : "r" (val), "r" (paddr) : "memory"); + int i; + + arch_spin_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *state = &ics->irq_state[i]; + + if (!state->resend) + continue; + + arch_spin_unlock(&ics->lock); + icp_rm_deliver_irq(xics, icp, state->number); + arch_spin_lock(&ics->lock); + } + + arch_spin_unlock(&ics->lock); } +/* -- ICP routines -- */ + static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu) { struct kvmppc_icp *this_icp = this_vcpu->arch.icp; - unsigned long xics_phys; int cpu; /* Mark the target VCPU as having an interrupt pending */ @@ -56,9 +76,8 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, /* In SMT cpu will always point to thread 0, we adjust it */ cpu += vcpu->arch.ptid; - /* Not too hard, then poke the target */ - xics_phys = paca[cpu].kvm_hstate.xics_phys; - rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); + smp_mb(); + kvmhv_rm_send_ipi(cpu); } static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) @@ -116,6 +135,180 @@ static inline int check_too_hard(struct kvmppc_xics *xics, return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS; } +static void icp_rm_check_resend(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + u32 icsid; + + /* Order this load with the test for need_resend in the caller */ + smp_rmb(); + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!test_and_clear_bit(icsid, icp->resend_map)) + continue; + if (!ics) + continue; + ics_rm_check_resend(xics, ics, icp); + } +} + +static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, + u32 *reject) +{ + union kvmppc_icp_state old_state, new_state; + bool success; + + do { + old_state = new_state = READ_ONCE(icp->state); + + *reject = 0; + + /* See if we can deliver */ + success = new_state.cppr > priority && + new_state.mfrr > priority && + new_state.pending_pri > priority; + + /* + * If we can, check for a rejection and perform the + * delivery + */ + if (success) { + *reject = new_state.xisr; + new_state.xisr = irq; + new_state.pending_pri = priority; + } else { + /* + * If we failed to deliver we set need_resend + * so a subsequent CPPR state change causes us + * to try a new delivery. + */ + new_state.need_resend = true; + } + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + return success; +} + +static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u32 reject; + u16 src; + + /* + * This is used both for initial delivery of an interrupt and + * for subsequent rejection. + * + * Rejection can be racy vs. resends. We have evaluated the + * rejection in an atomic ICP transaction which is now complete, + * so potentially the ICP can already accept the interrupt again. + * + * So we need to retry the delivery. Essentially the reject path + * boils down to a failed delivery. Always. 
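+ * (Concretely: a successful icp_rm_try_to_deliver() may hand back a
+ * previously-accepted interrupt in *reject; that source is fed back
+ * into this function as new_irq and we restart from the "again:"
+ * label below.)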
+ * + * Now the interrupt could also have moved to a different target, + * thus we may need to re-do the ICP lookup as well + */ + + again: + /* Get the ICS state and lock it */ + ics = kvmppc_xics_find_ics(xics, new_irq, &src); + if (!ics) { + /* Unsafe increment, but this does not need to be accurate */ + xics->err_noics++; + return; + } + state = &ics->irq_state[src]; + + /* Get a lock on the ICS */ + arch_spin_lock(&ics->lock); + + /* Get our server */ + if (!icp || state->server != icp->server_num) { + icp = kvmppc_xics_find_server(xics->kvm, state->server); + if (!icp) { + /* Unsafe increment again*/ + xics->err_noicp++; + goto out; + } + } + + /* Clear the resend bit of that interrupt */ + state->resend = 0; + + /* + * If masked, bail out + * + * Note: PAPR doesn't mention anything about masked pending + * when doing a resend, only when doing a delivery. + * + * However that would have the effect of losing a masked + * interrupt that was rejected and isn't consistent with + * the whole masked_pending business which is about not + * losing interrupts that occur while masked. + * + * I don't differentiate normal deliveries and resends, this + * implementation will differ from PAPR and not lose such + * interrupts. + */ + if (state->priority == MASKED) { + state->masked_pending = 1; + goto out; + } + + /* + * Try the delivery, this will set the need_resend flag + * in the ICP as part of the atomic transaction if the + * delivery is not possible. + * + * Note that if successful, the new delivery might have itself + * rejected an interrupt that was "delivered" before we took the + * ics spin lock. + * + * In this case we do the whole sequence all over again for the + * new guy. We cannot assume that the rejected interrupt is less + * favored than the new one, and thus doesn't need to be delivered, + * because by the time we exit icp_rm_try_to_deliver() the target + * processor may well have already consumed & completed it, and thus + * the rejected interrupt might actually be already acceptable. + */ + if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) { + /* + * Delivery was successful, did we reject somebody else ? + */ + if (reject && reject != XICS_IPI) { + arch_spin_unlock(&ics->lock); + new_irq = reject; + goto again; + } + } else { + /* + * We failed to deliver the interrupt we need to set the + * resend map bit and mark the ICS state as needing a resend + */ + set_bit(ics->icsid, icp->resend_map); + state->resend = 1; + + /* + * If the need_resend flag got cleared in the ICP some time + * between icp_rm_try_to_deliver() atomic update and now, then + * we know it might have missed the resend_map bit. So we + * retry + */ + smp_mb(); + if (!icp->state.need_resend) { + arch_spin_unlock(&ics->lock); + goto again; + } + } + out: + arch_spin_unlock(&ics->lock); +} + static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u8 new_cppr) { @@ -184,8 +377,8 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * separately here as well. 
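 * (With this patch the resend no longer bounces out to virtual mode
 * via XICS_RM_CHECK_RESEND; it is handled right here in real mode by
 * icp_rm_check_resend(), with icp->n_check_resend counting how often
 * that happens.)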
*/ if (resend) { - icp->rm_action |= XICS_RM_CHECK_RESEND; - icp->rm_resend_icp = icp; + icp->n_check_resend++; + icp_rm_check_resend(xics, icp); } } @@ -300,16 +493,16 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, } } while (!icp_rm_try_update(icp, old_state, new_state)); - /* Pass rejects to virtual mode */ + /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { - this_icp->rm_action |= XICS_RM_REJECT; - this_icp->rm_reject = reject; + this_icp->n_reject++; + icp_rm_deliver_irq(xics, icp, reject); } - /* Pass resends to virtual mode */ + /* Handle resends in real mode */ if (resend) { - this_icp->rm_action |= XICS_RM_CHECK_RESEND; - this_icp->rm_resend_icp = icp; + this_icp->n_check_resend++; + icp_rm_check_resend(xics, icp); } return check_too_hard(xics, this_icp); @@ -365,10 +558,13 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) } while (!icp_rm_try_update(icp, old_state, new_state)); - /* Pass rejects to virtual mode */ + /* + * Check for rejects. They are handled by doing a new delivery + * attempt (see comments in icp_rm_deliver_irq). + */ if (reject && reject != XICS_IPI) { - icp->rm_action |= XICS_RM_REJECT; - icp->rm_reject = reject; + icp->n_reject++; + icp_rm_deliver_irq(xics, icp, reject); } bail: return check_too_hard(xics, icp); @@ -416,10 +612,10 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) goto bail; state = &ics->irq_state[src]; - /* Still asserted, resend it, we make it look like a reject */ + /* Still asserted, resend it */ if (state->asserted) { - icp->rm_action |= XICS_RM_REJECT; - icp->rm_reject = irq; + icp->n_reject++; + icp_rm_deliver_irq(xics, icp, irq); } if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6cbf1630cb70..4d70df26c402 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -172,6 +172,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ + /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ + mfspr r3, SPRN_HDEC + mtspr SPRN_DEC, r3 + /* + * Make sure the primary has finished the MMU switch. + * We should never get here on a secondary thread, but + * check it for robustness' sake. + */ + ld r5, HSTATE_KVM_VCORE(r13) +65: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 65b + /* Set LPCR. */ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync /* set our bit in napping_threads */ ld r5, HSTATE_KVM_VCORE(r13) lbz r7, HSTATE_PTID(r13) @@ -182,7 +198,7 @@ kvmppc_primary_no_guest: or r3, r3, r0 stwcx. 
r3, 0, r6 bne 1b - /* order napping_threads update vs testing entry_exit_count */ + /* order napping_threads update vs testing entry_exit_map */ isync li r12, 0 lwz r7, VCORE_ENTRY_EXIT(r5) @@ -191,6 +207,7 @@ kvmppc_primary_no_guest: li r3, NAPPING_NOVCPU stb r3, HSTATE_NAPPING(r13) + li r3, 0 /* Don't wake on privileged (OS) doorbell */ b kvm_do_nap kvm_novcpu_wakeup: @@ -202,7 +219,7 @@ kvm_novcpu_wakeup: /* check the wake reason */ bl kvmppc_check_wake_reason - + /* see if any other thread is already exiting */ lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 @@ -222,13 +239,37 @@ kvm_novcpu_wakeup: cmpdi r3, 0 bge kvm_novcpu_exit + /* See if our timeslice has expired (HDEC is negative) */ + mfspr r0, SPRN_HDEC + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER + cmpwi r0, 0 + blt kvm_novcpu_exit + /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ ld r4, HSTATE_KVM_VCPU(r13) cmpdi r4, 0 - bne kvmppc_got_guest + beq kvmppc_primary_no_guest + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMENTRY + bl kvmhv_start_timing +#endif + b kvmppc_got_guest kvm_novcpu_exit: - b hdec_soon +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + beq 13f + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +#endif +13: mr r3, r12 + stw r12, 112-4(r1) + bl kvmhv_commence_exit + nop + lwz r12, 112-4(r1) + b kvmhv_switch_to_host /* * We come in here when wakened from nap mode. @@ -239,9 +280,9 @@ kvm_novcpu_exit: kvm_start_guest: /* Set runlatch bit the minute you wake up from nap */ - mfspr r1, SPRN_CTRLF - ori r1, r1, 1 - mtspr SPRN_CTRLT, r1 + mfspr r0, SPRN_CTRLF + ori r0, r0, 1 + mtspr SPRN_CTRLT, r0 ld r2,PACATOC(r13) @@ -286,26 +327,21 @@ kvm_secondary_got_guest: ld r6, PACA_DSCR(r13) std r6, HSTATE_DSCR(r13) + /* Order load of vcore, ptid etc. after load of vcpu */ + lwsync bl kvmppc_hv_entry /* Back from the guest, go back to nap */ /* Clear our vcpu pointer so we don't come back in early */ li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) /* - * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing - * the nap_count, because once the increment to nap_count is - * visible we could be given another vcpu. + * Once we clear HSTATE_KVM_VCPU(r13), the code in + * kvmppc_run_core() is going to assume that all our vcpu + * state is visible in memory. This lwsync makes sure + * that that is true. */ lwsync - - /* increment the nap count and then go to nap mode */ - ld r4, HSTATE_KVM_VCORE(r13) - addi r4, r4, VCORE_NAP_COUNT -51: lwarx r3, 0, r4 - addi r3, r3, 1 - stwcx. r3, 0, r4 - bne 51b + std r0, HSTATE_KVM_VCPU(r13) /* * At this point we have finished executing in the guest. @@ -376,6 +412,14 @@ kvmppc_hv_entry: li r6, KVM_GUEST_MODE_HOST_HV stb r6, HSTATE_IN_GUEST(r13) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Store initial timestamp */ + cmpdi r4, 0 + beq 1f + addi r3, r4, VCPU_TB_RMENTRY + bl kvmhv_start_timing +1: +#endif /* Clear out SLB */ li r6,0 slbmte r6,r6 @@ -387,21 +431,23 @@ kvmppc_hv_entry: * We don't have to lock against concurrent tlbies, * but we do have to coordinate across hardware threads. */ - /* Increment entry count iff exit count is zero. */ - ld r5,HSTATE_KVM_VCORE(r13) - addi r9,r5,VCORE_ENTRY_EXIT -21: lwarx r3,0,r9 - cmpwi r3,0x100 /* any threads starting to exit? */ + /* Set bit in entry map iff exit map is zero. */ + ld r5, HSTATE_KVM_VCORE(r13) + li r7, 1 + lbz r6, HSTATE_PTID(r13) + sld r7, r7, r6 + addi r9, r5, VCORE_ENTRY_EXIT +21: lwarx r3, 0, r9 + cmpwi r3, 0x100 /* any threads starting to exit? 
*/ bge secondary_too_late /* if so we're too late to the party */ - addi r3,r3,1 - stwcx. r3,0,r9 + or r3, r3, r7 + stwcx. r3, 0, r9 bne 21b /* Primary thread switches to guest partition. */ ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ - lbz r6,HSTATE_PTID(r13) cmpwi r6,0 - bne 20f + bne 10f ld r6,KVM_SDR1(r9) lwz r7,KVM_LPID(r9) li r0,LPID_RSVD /* switch to reserved LPID */ @@ -472,28 +518,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ - b 10f - - /* Secondary threads wait for primary to have done partition switch */ -20: lbz r0,VCORE_IN_GUEST(r5) - cmpwi r0,0 - beq 20b - - /* Set LPCR and RMOR. */ -10: ld r8,VCORE_LPCR(r5) - mtspr SPRN_LPCR,r8 - ld r8,KVM_RMOR(r9) - mtspr SPRN_RMOR,r8 - isync - - /* Check if HDEC expires soon */ - mfspr r3,SPRN_HDEC - cmpwi r3,512 /* 1 microsecond */ - li r12,BOOK3S_INTERRUPT_HV_DECREMENTER - blt hdec_soon /* Do we have a guest vcpu to run? */ - cmpdi r4, 0 +10: cmpdi r4, 0 beq kvmppc_primary_no_guest kvmppc_got_guest: @@ -818,6 +845,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) clrrdi r6,r6,1 mtspr SPRN_CTRLT,r6 4: + /* Secondary threads wait for primary to have done partition switch */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r6, HSTATE_PTID(r13) + cmpwi r6, 0 + beq 21f + lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + bne 21f + HMT_LOW +20: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 20b + HMT_MEDIUM +21: + /* Set LPCR. */ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync + + /* Check if HDEC expires soon */ + mfspr r3, SPRN_HDEC + cmpwi r3, 512 /* 1 microsecond */ + blt hdec_soon + ld r6, VCPU_CTR(r4) lwz r7, VCPU_XER(r4) @@ -880,6 +931,12 @@ fast_guest_return: li r9, KVM_GUEST_MODE_GUEST_HV stb r9, HSTATE_IN_GUEST(r13) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Accumulate timing */ + addi r3, r4, VCPU_TB_GUEST + bl kvmhv_accumulate_time +#endif + /* Enter guest */ BEGIN_FTR_SECTION @@ -917,6 +974,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) hrfid b . +secondary_too_late: + li r12, 0 + cmpdi r4, 0 + beq 11f + stw r12, VCPU_TRAP(r4) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +#endif +11: b kvmhv_switch_to_host + +hdec_soon: + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER + stw r12, VCPU_TRAP(r4) + mr r9, r4 +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +#endif + b guest_exit_cont + /****************************************************************************** * * * Exit code * @@ -1002,6 +1080,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) stw r12,VCPU_TRAP(r9) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMINTR + mr r4, r9 + bl kvmhv_accumulate_time + ld r5, VCPU_GPR(R5)(r9) + ld r6, VCPU_GPR(R6)(r9) + ld r7, VCPU_GPR(R7)(r9) + ld r8, VCPU_GPR(R8)(r9) +#endif + /* Save HEIR (HV emulation assist reg) in emul_inst if this is an HEI (HV emulation interrupt, e40) */ li r3,KVM_INST_FETCH_FAILED @@ -1028,34 +1116,37 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) bne 2f mfspr r3,SPRN_HDEC cmpwi r3,0 - bge ignore_hdec + mr r4,r9 + bge fast_guest_return 2: /* See if this is an hcall we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode + /* Hypervisor doorbell - exit only if host IPI flag set */ + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + bne 3f + lbz r0, HSTATE_HOST_IPI(r13) + beq 4f + b guest_exit_cont +3: /* External interrupt ? 
*/ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - bne+ ext_interrupt_to_host + bne+ guest_exit_cont /* External interrupt, first check for host_ipi. If this is * set, we know the host wants us out so let's do it now */ bl kvmppc_read_intr cmpdi r3, 0 - bgt ext_interrupt_to_host + bgt guest_exit_cont /* Check if any CPU is heading out to the host, if so head out too */ - ld r5, HSTATE_KVM_VCORE(r13) +4: ld r5, HSTATE_KVM_VCORE(r13) lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 - bge ext_interrupt_to_host - - /* Return to guest after delivering any pending interrupt */ mr r4, r9 - b deliver_guest_interrupt - -ext_interrupt_to_host: + blt deliver_guest_interrupt guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save more register state */ @@ -1065,7 +1156,7 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ stw r7, VCPU_DSISR(r9) /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq 6f + beq mc_cont std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) @@ -1073,9 +1164,20 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK beq machine_check_realmode mc_cont: +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMEXIT + mr r4, r9 + bl kvmhv_accumulate_time +#endif + + /* Increment exit count, poke other threads to exit */ + bl kvmhv_commence_exit + nop + ld r9, HSTATE_KVM_VCPU(r13) + lwz r12, VCPU_TRAP(r9) /* Save guest CTRL register, set runlatch to 1 */ -6: mfspr r6,SPRN_CTRLF + mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -1417,68 +1519,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbia ptesync -hdec_soon: /* r12 = trap, r13 = paca */ /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do * have to coordinate the hardware threads. */ - /* Increment the threads-exiting-guest count in the 0xff00 - bits of vcore->entry_exit_count */ - ld r5,HSTATE_KVM_VCORE(r13) - addi r6,r5,VCORE_ENTRY_EXIT -41: lwarx r3,0,r6 - addi r0,r3,0x100 - stwcx. r0,0,r6 - bne 41b - isync /* order stwcx. vs. reading napping_threads */ - - /* - * At this point we have an interrupt that we have to pass - * up to the kernel or qemu; we can't handle it in real mode. - * Thus we have to do a partition switch, so we have to - * collect the other threads, if we are the first thread - * to take an interrupt. To do this, we set the HDEC to 0, - * which causes an HDEC interrupt in all threads within 2ns - * because the HDEC register is shared between all 4 threads. - * However, we don't need to bother if this is an HDEC - * interrupt, since the other threads will already be on their - * way here in that case. - */ - cmpwi r3,0x100 /* Are we the first here? */ - bge 43f - cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER - beq 40f - li r0,0 - mtspr SPRN_HDEC,r0 -40: - /* - * Send an IPI to any napping threads, since an HDEC interrupt - * doesn't wake CPUs up from nap. - */ - lwz r3,VCORE_NAPPING_THREADS(r5) - lbz r4,HSTATE_PTID(r13) - li r0,1 - sld r0,r0,r4 - andc. r3,r3,r0 /* no sense IPI'ing ourselves */ - beq 43f - /* Order entry/exit update vs. IPIs */ - sync - mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ - subf r6,r4,r13 -42: andi. r0,r3,1 - beq 44f - ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ - li r0,IPI_PRIORITY - li r7,XICS_MFRR - stbcix r0,r7,r8 /* trigger the IPI */ -44: srdi. 
r3,r3,1 - addi r6,r6,PACA_SIZE - bne 42b - -secondary_too_late: +kvmhv_switch_to_host: /* Secondary threads wait for primary to do partition switch */ -43: ld r5,HSTATE_KVM_VCORE(r13) + ld r5,HSTATE_KVM_VCORE(r13) ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ lbz r3,HSTATE_PTID(r13) cmpwi r3,0 @@ -1562,6 +1610,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 1: addi r8,r8,16 .endr +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Finish timing, if we have a vcpu */ + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + li r3, 0 + beq 2f + bl kvmhv_accumulate_time +2: +#endif /* Unset guest mode */ li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) @@ -1696,8 +1753,10 @@ kvmppc_hisi: * Returns to the guest if we handle it, or continues on up to * the kernel if we can't (i.e. if we don't have a handler for * it, or if the handler returns H_TOO_HARD). + * + * r5 - r8 contain hcall args, + * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca */ - .globl hcall_try_real_mode hcall_try_real_mode: ld r3,VCPU_GPR(R3)(r9) andi. r0,r11,MSR_PR @@ -1839,13 +1898,124 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table + .long 0 /* 0x138 */ + .long 0 /* 0x13c */ + .long 0 /* 0x140 */ + .long 0 /* 0x144 */ + .long 0 /* 0x148 */ + .long 0 /* 0x14c */ + .long 0 /* 0x150 */ + .long 0 /* 0x154 */ + .long 0 /* 0x158 */ + .long 0 /* 0x15c */ + .long 0 /* 0x160 */ + .long 0 /* 0x164 */ + .long 0 /* 0x168 */ + .long 0 /* 0x16c */ + .long 0 /* 0x170 */ + .long 0 /* 0x174 */ + .long 0 /* 0x178 */ + .long 0 /* 0x17c */ + .long 0 /* 0x180 */ + .long 0 /* 0x184 */ + .long 0 /* 0x188 */ + .long 0 /* 0x18c */ + .long 0 /* 0x190 */ + .long 0 /* 0x194 */ + .long 0 /* 0x198 */ + .long 0 /* 0x19c */ + .long 0 /* 0x1a0 */ + .long 0 /* 0x1a4 */ + .long 0 /* 0x1a8 */ + .long 0 /* 0x1ac */ + .long 0 /* 0x1b0 */ + .long 0 /* 0x1b4 */ + .long 0 /* 0x1b8 */ + .long 0 /* 0x1bc */ + .long 0 /* 0x1c0 */ + .long 0 /* 0x1c4 */ + .long 0 /* 0x1c8 */ + .long 0 /* 0x1cc */ + .long 0 /* 0x1d0 */ + .long 0 /* 0x1d4 */ + .long 0 /* 0x1d8 */ + .long 0 /* 0x1dc */ + .long 0 /* 0x1e0 */ + .long 0 /* 0x1e4 */ + .long 0 /* 0x1e8 */ + .long 0 /* 0x1ec */ + .long 0 /* 0x1f0 */ + .long 0 /* 0x1f4 */ + .long 0 /* 0x1f8 */ + .long 0 /* 0x1fc */ + .long 0 /* 0x200 */ + .long 0 /* 0x204 */ + .long 0 /* 0x208 */ + .long 0 /* 0x20c */ + .long 0 /* 0x210 */ + .long 0 /* 0x214 */ + .long 0 /* 0x218 */ + .long 0 /* 0x21c */ + .long 0 /* 0x220 */ + .long 0 /* 0x224 */ + .long 0 /* 0x228 */ + .long 0 /* 0x22c */ + .long 0 /* 0x230 */ + .long 0 /* 0x234 */ + .long 0 /* 0x238 */ + .long 0 /* 0x23c */ + .long 0 /* 0x240 */ + .long 0 /* 0x244 */ + .long 0 /* 0x248 */ + .long 0 /* 0x24c */ + .long 0 /* 0x250 */ + .long 0 /* 0x254 */ + .long 0 /* 0x258 */ + .long 0 /* 0x25c */ + .long 0 /* 0x260 */ + .long 0 /* 0x264 */ + .long 0 /* 0x268 */ + .long 0 /* 0x26c */ + .long 0 /* 0x270 */ + .long 0 /* 0x274 */ + .long 0 /* 0x278 */ + .long 0 /* 0x27c */ + .long 0 /* 0x280 */ + .long 0 /* 0x284 */ + .long 0 /* 0x288 */ + .long 0 /* 0x28c */ + .long 0 /* 0x290 */ + .long 0 /* 0x294 */ + .long 0 /* 0x298 */ + .long 0 /* 0x29c */ + .long 0 /* 0x2a0 */ + .long 0 /* 0x2a4 */ + .long 0 /* 0x2a8 */ + .long 0 /* 0x2ac */ + .long 0 /* 0x2b0 */ + .long 0 /* 0x2b4 */ + .long 0 /* 0x2b8 */ + .long 0 /* 0x2bc */ + .long 0 /* 0x2c0 */ + .long 0 /* 0x2c4 */ + .long 0 /* 0x2c8 */ + .long 0 /* 0x2cc */ + .long 0 /* 0x2d0 */ + .long 0 /* 0x2d4 */ + .long 0 /* 0x2d8 */ + .long 0 /* 0x2dc */ + .long 0 /* 0x2e0 */ + .long 0 /* 0x2e4 */ 
+ .long 0 /* 0x2e8 */ + .long 0 /* 0x2ec */ + .long 0 /* 0x2f0 */ + .long 0 /* 0x2f4 */ + .long 0 /* 0x2f8 */ + .long 0 /* 0x2fc */ + .long DOTSYM(kvmppc_h_random) - hcall_real_table .globl hcall_real_table_end hcall_real_table_end: -ignore_hdec: - mr r4,r9 - b fast_guest_return - _GLOBAL(kvmppc_h_set_xdabr) andi. r0, r5, DABRX_USER | DABRX_KERNEL beq 6f @@ -1884,7 +2054,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, 0 blr -_GLOBAL(kvmppc_h_cede) +_GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ ori r11,r11,MSR_EE std r11,VCPU_MSR(r3) li r0,1 @@ -1893,8 +2063,8 @@ _GLOBAL(kvmppc_h_cede) lbz r5,VCPU_PRODDED(r3) cmpwi r5,0 bne kvm_cede_prodded - li r0,0 /* set trap to 0 to say hcall is handled */ - stw r0,VCPU_TRAP(r3) + li r12,0 /* set trap to 0 to say hcall is handled */ + stw r12,VCPU_TRAP(r3) li r0,H_SUCCESS std r0,VCPU_GPR(R3)(r3) @@ -1912,12 +2082,11 @@ _GLOBAL(kvmppc_h_cede) addi r6,r5,VCORE_NAPPING_THREADS 31: lwarx r4,0,r6 or r4,r4,r0 - PPC_POPCNTW(R7,R4) - cmpw r7,r8 - bge kvm_cede_exit + cmpw r4,r8 + beq kvm_cede_exit stwcx. r4,0,r6 bne 31b - /* order napping_threads update vs testing entry_exit_count */ + /* order napping_threads update vs testing entry_exit_map */ isync li r0,NAPPING_CEDE stb r0,HSTATE_NAPPING(r13) @@ -1955,21 +2124,52 @@ _GLOBAL(kvmppc_h_cede) bl kvmppc_save_fp /* + * Set DEC to the smaller of DEC and HDEC, so that we wake + * no later than the end of our timeslice (HDEC interrupts + * don't wake us from nap). + */ + mfspr r3, SPRN_DEC + mfspr r4, SPRN_HDEC + mftb r5 + cmpw r3, r4 + ble 67f + mtspr SPRN_DEC, r4 +67: + /* save expiry time of guest decrementer */ + extsw r3, r3 + add r3, r3, r5 + ld r4, HSTATE_KVM_VCPU(r13) + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_TB_OFFSET(r5) + subf r3, r6, r3 /* convert to host TB value */ + std r3, VCPU_DEC_EXPIRES(r4) + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + ld r4, HSTATE_KVM_VCPU(r13) + addi r3, r4, VCPU_TB_CEDE + bl kvmhv_accumulate_time +#endif + + lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */ + + /* * Take a nap until a decrementer or external or doobell interrupt - * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the - * runlatch bit before napping. + * occurs, with PECE1 and PECE0 set in LPCR. + * On POWER8, set PECEDH, and if we are ceding, also set PECEDP. + * Also clear the runlatch bit before napping. 
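+ * (The wake-on-doorbell choice comes in via r3: kvmppc_h_cede loads
+ * LPCR_PECEDP there, while the no-vcpu entry path loads 0, and
+ *
+ *	rlwimi	r5, r3, 0, LPCR_PECEDP
+ *
+ * below merges exactly that one bit into the LPCR image.)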
*/ kvm_do_nap: - mfspr r2, SPRN_CTRLF - clrrdi r2, r2, 1 - mtspr SPRN_CTRLT, r2 + mfspr r0, SPRN_CTRLF + clrrdi r0, r0, 1 + mtspr SPRN_CTRLT, r0 li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION - oris r5,r5,LPCR_PECEDP@h + ori r5, r5, LPCR_PECEDH + rlwimi r5, r3, 0, LPCR_PECEDP END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_LPCR,r5 isync @@ -1994,9 +2194,23 @@ kvm_end_cede: /* Woken by external or decrementer interrupt */ ld r1, HSTATE_HOST_R1(r13) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMINTR + bl kvmhv_accumulate_time +#endif + /* load up FP state */ bl kvmppc_load_fp + /* Restore guest decrementer */ + ld r3, VCPU_DEC_EXPIRES(r4) + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_TB_OFFSET(r5) + add r3, r3, r6 /* convert host TB to guest TB value */ + mftb r7 + subf r3, r7, r3 + mtspr SPRN_DEC, r3 + /* Load NV GPRS */ ld r14, VCPU_GPR(R14)(r4) ld r15, VCPU_GPR(R15)(r4) @@ -2057,7 +2271,8 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: - b hcall_real_fallback + ld r9, HSTATE_KVM_VCPU(r13) + b guest_exit_cont /* Try to handle a machine check in real mode */ machine_check_realmode: @@ -2089,13 +2304,14 @@ machine_check_realmode: /* * Check the reason we woke from nap, and take appropriate action. - * Returns: + * Returns (in r3): * 0 if nothing needs to be done * 1 if something happened that needs to be handled by the host - * -1 if there was a guest wakeup (IPI) + * -1 if there was a guest wakeup (IPI or msgsnd) * * Also sets r12 to the interrupt vector for any interrupt that needs * to be handled now by the host (0x500 for external interrupt), or zero. + * Modifies r0, r6, r7, r8. */ kvmppc_check_wake_reason: mfspr r6, SPRN_SRR1 @@ -2122,7 +2338,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* hypervisor doorbell */ 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + /* see if it's a host IPI */ li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bnelr + /* if not, clear it and return -1 */ + lis r6, (PPC_DBELL_SERVER << (63-36))@h + PPC_MSGCLR(6) + li r3, -1 blr /* @@ -2131,6 +2355,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * 0 if no interrupt is pending * 1 if an interrupt is pending that needs to be handled by the host * -1 if there was a guest wakeup IPI (which has now been cleared) + * Modifies r0, r6, r7, r8, returns value in r3. */ kvmppc_read_intr: /* see if a host IPI is pending */ @@ -2185,6 +2410,7 @@ kvmppc_read_intr: bne- 43f /* OK, it's an IPI for us */ + li r12, 0 li r3, -1 1: blr @@ -2314,3 +2540,62 @@ kvmppc_fix_pmao: mtspr SPRN_PMC6, r3 isync blr + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +/* + * Start timing an activity + * r3 = pointer to time accumulation struct, r4 = vcpu + */ +kvmhv_start_timing: + ld r5, HSTATE_KVM_VCORE(r13) + lbz r6, VCORE_IN_GUEST(r5) + cmpwi r6, 0 + beq 5f /* if in guest, need to */ + ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ +5: mftb r5 + subf r5, r6, r5 + std r3, VCPU_CUR_ACTIVITY(r4) + std r5, VCPU_ACTIVITY_START(r4) + blr + +/* + * Accumulate time to one activity and start another. 
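+ * The accumulation struct is published like a seqlock: TAS_SEQCOUNT
+ * is stored odd before TAS_TOTAL/TAS_MIN/TAS_MAX are updated and even
+ * again afterwards, with lwsync ordering the stores. A host-side
+ * reader would retry in the usual way (sketch only, assuming the
+ * C-side struct exposes seqcount/total fields):
+ *
+ *	do {
+ *		seq = acc->seqcount;
+ *		smp_rmb();
+ *		total = acc->total;
+ *		smp_rmb();
+ *	} while (acc->seqcount != seq || (seq & 1));
+ *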
+ * r3 = pointer to new time accumulation struct, r4 = vcpu + */ +kvmhv_accumulate_time: + ld r5, HSTATE_KVM_VCORE(r13) + lbz r8, VCORE_IN_GUEST(r5) + cmpwi r8, 0 + beq 4f /* if in guest, need to */ + ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ +4: ld r5, VCPU_CUR_ACTIVITY(r4) + ld r6, VCPU_ACTIVITY_START(r4) + std r3, VCPU_CUR_ACTIVITY(r4) + mftb r7 + subf r7, r8, r7 + std r7, VCPU_ACTIVITY_START(r4) + cmpdi r5, 0 + beqlr + subf r3, r6, r7 + ld r8, TAS_SEQCOUNT(r5) + cmpdi r8, 0 + addi r8, r8, 1 + std r8, TAS_SEQCOUNT(r5) + lwsync + ld r7, TAS_TOTAL(r5) + add r7, r7, r3 + std r7, TAS_TOTAL(r5) + ld r6, TAS_MIN(r5) + ld r7, TAS_MAX(r5) + beq 3f + cmpd r3, r6 + bge 1f +3: std r3, TAS_MIN(r5) +1: cmpd r3, r7 + ble 2f + std r3, TAS_MAX(r5) +2: lwsync + addi r8, r8, 1 + std r8, TAS_SEQCOUNT(r5) + blr +#endif diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index ce3c893d509b..f2c75a1e0536 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -258,6 +258,28 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu) +{ + long rc; + + rc = kvmppc_h_logical_ci_load(vcpu); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) +{ + long rc; + + rc = kvmppc_h_logical_ci_store(vcpu); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { long rc = kvmppc_xics_hcall(vcpu, cmd); @@ -290,6 +312,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) clear_bit(KVM_REQ_UNHALT, &vcpu->requests); vcpu->stat.halt_wakeup++; return EMULATE_DONE; + case H_LOGICAL_CI_LOAD: + return kvmppc_h_pr_logical_ci_load(vcpu); + case H_LOGICAL_CI_STORE: + return kvmppc_h_pr_logical_ci_store(vcpu); case H_XIRR: case H_CPPR: case H_EOI: @@ -323,6 +349,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd) case H_BULK_REMOVE: case H_PUT_TCE: case H_CEDE: + case H_LOGICAL_CI_LOAD: + case H_LOGICAL_CI_STORE: #ifdef CONFIG_KVM_XICS case H_XIRR: case H_CPPR: diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index a4a8d9f0dcb7..8f3e6cc54d95 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -20,6 +20,7 @@ #include <asm/xics.h> #include <asm/debug.h> #include <asm/time.h> +#include <asm/spinlock.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -39,7 +40,7 @@ * LOCKING * ======= * - * Each ICS has a mutex protecting the information about the IRQ + * Each ICS has a spin lock protecting the information about the IRQ * sources and avoiding simultaneous deliveries if the same interrupt. 
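 *
 * (The ICS lock is an arch_spinlock_t taken with interrupts disabled,
 * since it can also be taken in real mode.  The pattern used
 * throughout this file is:
 *
 *	local_irq_save(flags);
 *	arch_spin_lock(&ics->lock);
 *	... examine or update ics->irq_state[src] ...
 *	arch_spin_unlock(&ics->lock);
 *	local_irq_restore(flags);
 * )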
* * ICP operations are done via a single compare & swap transaction @@ -109,7 +110,10 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, { int i; - mutex_lock(&ics->lock); + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&ics->lock); for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; @@ -120,12 +124,15 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, XICS_DBG("resend %#x prio %#x\n", state->number, state->priority); - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); icp_deliver_irq(xics, icp, state->number); - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); } static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -133,8 +140,10 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, u32 server, u32 priority, u32 saved_priority) { bool deliver; + unsigned long flags; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); state->server = server; state->priority = priority; @@ -145,7 +154,8 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, deliver = true; } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); return deliver; } @@ -186,6 +196,7 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) struct kvmppc_ics *ics; struct ics_irq_state *state; u16 src; + unsigned long flags; if (!xics) return -ENODEV; @@ -195,10 +206,12 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) return -EINVAL; state = &ics->irq_state[src]; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); *server = state->server; *priority = state->priority; - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); return 0; } @@ -365,6 +378,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, struct kvmppc_ics *ics; u32 reject; u16 src; + unsigned long flags; /* * This is used both for initial delivery of an interrupt and @@ -391,7 +405,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, state = &ics->irq_state[src]; /* Get a lock on the ICS */ - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); /* Get our server */ if (!icp || state->server != icp->server_num) { @@ -434,7 +449,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * * Note that if successful, the new delivery might have itself * rejected an interrupt that was "delivered" before we took the - * icp mutex. + * ics spin lock. * * In this case we do the whole sequence all over again for the * new guy. We cannot assume that the rejected interrupt is less @@ -448,7 +463,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * Delivery was successful, did we reject somebody else ? 
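 * (If so, the rejected source simply becomes the new new_irq and the
 * whole lookup/lock/deliver sequence restarts from the "again:" label
 * above.)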
*/ if (reject && reject != XICS_IPI) { - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); new_irq = reject; goto again; } @@ -468,12 +484,14 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); goto again; } } out: - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); } static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, @@ -802,14 +820,22 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); - if (icp->rm_action & XICS_RM_KICK_VCPU) + if (icp->rm_action & XICS_RM_KICK_VCPU) { + icp->n_rm_kick_vcpu++; kvmppc_fast_vcpu_kick(icp->rm_kick_target); - if (icp->rm_action & XICS_RM_CHECK_RESEND) + } + if (icp->rm_action & XICS_RM_CHECK_RESEND) { + icp->n_rm_check_resend++; icp_check_resend(xics, icp->rm_resend_icp); - if (icp->rm_action & XICS_RM_REJECT) + } + if (icp->rm_action & XICS_RM_REJECT) { + icp->n_rm_reject++; icp_deliver_irq(xics, icp, icp->rm_reject); - if (icp->rm_action & XICS_RM_NOTIFY_EOI) + } + if (icp->rm_action & XICS_RM_NOTIFY_EOI) { + icp->n_rm_notify_eoi++; kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); + } icp->rm_action = 0; @@ -872,10 +898,21 @@ static int xics_debug_show(struct seq_file *m, void *private) struct kvm *kvm = xics->kvm; struct kvm_vcpu *vcpu; int icsid, i; + unsigned long flags; + unsigned long t_rm_kick_vcpu, t_rm_check_resend; + unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_reject, t_check_resend; if (!kvm) return 0; + t_rm_kick_vcpu = 0; + t_rm_notify_eoi = 0; + t_rm_check_resend = 0; + t_rm_reject = 0; + t_check_resend = 0; + t_reject = 0; + seq_printf(m, "=========\nICP state\n=========\n"); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -890,8 +927,19 @@ static int xics_debug_show(struct seq_file *m, void *private) icp->server_num, state.xisr, state.pending_pri, state.cppr, state.mfrr, state.out_ee, state.need_resend); + t_rm_kick_vcpu += icp->n_rm_kick_vcpu; + t_rm_notify_eoi += icp->n_rm_notify_eoi; + t_rm_check_resend += icp->n_rm_check_resend; + t_rm_reject += icp->n_rm_reject; + t_check_resend += icp->n_check_resend; + t_reject += icp->n_reject; } + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + t_rm_kick_vcpu, t_rm_check_resend, + t_rm_reject, t_rm_notify_eoi); + seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", + t_check_resend, t_reject); for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { struct kvmppc_ics *ics = xics->ics[icsid]; @@ -901,7 +949,8 @@ static int xics_debug_show(struct seq_file *m, void *private) seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", icsid); - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *irq = &ics->irq_state[i]; @@ -912,7 +961,8 @@ static int xics_debug_show(struct seq_file *m, void *private) irq->resend, irq->masked_pending); } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); } return 0; } @@ -965,7 +1015,6 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, if (!ics) goto out; - mutex_init(&ics->lock); ics->icsid = icsid; for (i = 0; i < 
KVMPPC_XICS_IRQ_PER_ICS; i++) { @@ -1107,13 +1156,15 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) u64 __user *ubufp = (u64 __user *) addr; u16 idx; u64 val, prio; + unsigned long flags; ics = kvmppc_xics_find_ics(xics, irq, &idx); if (!ics) return -ENOENT; irqp = &ics->irq_state[idx]; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); ret = -ENOENT; if (irqp->exists) { val = irqp->server; @@ -1129,7 +1180,8 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) val |= KVM_XICS_PENDING; ret = 0; } - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); if (!ret && put_user(val, ubufp)) ret = -EFAULT; @@ -1146,6 +1198,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) u64 val; u8 prio; u32 server; + unsigned long flags; if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) return -ENOENT; @@ -1166,7 +1219,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) kvmppc_xics_find_server(xics->kvm, server) == NULL) return -EINVAL; - mutex_lock(&ics->lock); + local_irq_save(flags); + arch_spin_lock(&ics->lock); irqp->server = server; irqp->saved_priority = prio; if (val & KVM_XICS_MASKED) @@ -1178,7 +1232,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) irqp->asserted = 1; irqp->exists = 1; - mutex_unlock(&ics->lock); + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); if (val & KVM_XICS_PENDING) icp_deliver_irq(xics, NULL, irqp->number); diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 73f0f2723c07..56ea44f9867f 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -78,13 +78,22 @@ struct kvmppc_icp { u32 rm_reject; u32 rm_eoied_irq; + /* Counters for each reason we exited real mode */ + unsigned long n_rm_kick_vcpu; + unsigned long n_rm_check_resend; + unsigned long n_rm_reject; + unsigned long n_rm_notify_eoi; + /* Counters for handling ICP processing in real mode */ + unsigned long n_check_resend; + unsigned long n_reject; + /* Debug stuff for real mode */ union kvmppc_icp_state rm_dbgstate; struct kvm_vcpu *rm_dbgtgt; }; struct kvmppc_ics { - struct mutex lock; + arch_spinlock_t lock; u16 icsid; struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; }; @@ -96,6 +105,8 @@ struct kvmppc_xics { u32 max_icsid; bool real_mode; bool real_mode_dbg; + u32 err_noics; + u32 err_noicp; struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1]; }; diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index cc536d4a75ef..4d33e199edcc 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -338,6 +338,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pte_t *ptep; unsigned int wimg = 0; pgd_t *pgdir; + unsigned long flags; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_notifier_seq; @@ -468,15 +469,28 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pgdir = vcpu_e500->vcpu.arch.pgdir; - ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages); - if (pte_present(*ptep)) - wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - else { - if (printk_ratelimit()) - pr_err("%s: pte not present: gfn %lx, pfn %lx\n", - __func__, (long)gfn, pfn); - ret = -EINVAL; - goto out; + /* + * We are just looking at the wimg bits, so we don't + * care much about 
the trans splitting bit. + * We are holding kvm->mmu_lock so a notifier invalidate + * can't run hence pfn won't change. + */ + local_irq_save(flags); + ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL); + if (ptep) { + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) & + MAS2_WIMGE_MASK; + local_irq_restore(flags); + } else { + local_irq_restore(flags); + pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n", + __func__, (long)gfn, pfn); + ret = -EINVAL; + goto out; + } } kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 91bbc845ac66..ac3ddf115f3d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -529,6 +529,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_RMA: r = 0; break; + case KVM_CAP_PPC_HWRNG: + r = kvmppc_hwrng_present(); + break; #endif case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 2c2022d16059..fda236f908eb 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1066,7 +1066,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); + ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); rc = 1; @@ -1394,6 +1394,7 @@ tm_abort: tm_abort(TM_CAUSE_TLBI); } #endif + return; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index fa9d5c238d22..0ce968b00b7c 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -109,7 +109,7 @@ int pgd_huge(pgd_t pgd) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { /* Only called for hugetlbfs pages, hence can ignore THP */ - return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); + return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -581,6 +581,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); } static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -681,28 +682,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } +/* + * We are holding mmap_sem, so a parallel huge page collapse cannot run. + * To prevent hugepage split, disable irq. + */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { pte_t *ptep; struct page *page; unsigned shift; - unsigned long mask; + unsigned long mask, flags; /* * Transparent hugepages are handled by generic code. We can skip them * here. */ + local_irq_save(flags); ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); /* Verify it is a huge page else bail. 
*/ - if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) { + local_irq_restore(flags); return ERR_PTR(-EINVAL); - + } mask = (1UL << shift) - 1; page = pte_page(*ptep); if (page) page += (address & mask) / PAGE_SIZE; + local_irq_restore(flags); return page; } @@ -949,9 +957,12 @@ void flush_dcache_icache_hugepage(struct page *page) * * So long as we atomically load page table pointers we are safe against teardown, * we can follow the address down to the page and take a ref on it. + * This function needs to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->soft_enabled = 1. */ -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -1003,12 +1014,11 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift * A hugepage collapse is captured by pmd_none, because * it marks the pmd none and does a hpte invalidate. * - * A hugepage split is captured by pmd_trans_splitting * because we mark the pmd trans splitting and do a * hpte invalidate * + * We don't worry about pmd_trans_splitting here; the + * caller, if it needs to handle the splitting case, + * should check for that. */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return NULL; if (pmd_huge(pmd) || pmd_large(pmd)) { @@ -1030,7 +1040,7 @@ out: *shift = pdshift; return ret_pte; } -EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); +EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index ead55351b254..ff09cde20cd2 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -111,41 +111,45 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) * interrupt context, so if the access faults, we read the page tables * to find which page (if any) is mapped and access it directly.
*/ -static int read_user_stack_slow(void __user *ptr, void *ret, int nb) +static int read_user_stack_slow(void __user *ptr, void *buf, int nb) { + int ret = -EFAULT; pgd_t *pgdir; pte_t *ptep, pte; unsigned shift; unsigned long addr = (unsigned long) ptr; unsigned long offset; - unsigned long pfn; + unsigned long pfn, flags; void *kaddr; pgdir = current->mm->pgd; if (!pgdir) return -EFAULT; + local_irq_save(flags); ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift); + if (!ptep) + goto err_out; if (!shift) shift = PAGE_SHIFT; /* align address to page boundary */ offset = addr & ((1UL << shift) - 1); - addr -= offset; - if (ptep == NULL) - return -EFAULT; - pte = *ptep; + pte = READ_ONCE(*ptep); if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) - return -EFAULT; + goto err_out; pfn = pte_pfn(pte); if (!page_is_ram(pfn)) - return -EFAULT; + goto err_out; /* no highmem to worry about here */ kaddr = pfn_to_kaddr(pfn); - memcpy(ret, kaddr + offset, nb); - return 0; + memcpy(buf, kaddr + offset, nb); + ret = 0; +err_out: + local_irq_restore(flags); + return ret; } static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret) diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 1a3429e1ccb5..1ba6307be4db 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -111,7 +111,7 @@ out: static int spufs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size != inode->i_size)) @@ -163,14 +163,14 @@ static void spufs_prune_dir(struct dentry *dir) { struct dentry *dentry, *tmp; - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) { spin_lock(&dentry->d_lock); - if (!(d_unhashed(dentry)) && dentry->d_inode) { + if (!(d_unhashed(dentry)) && d_really_is_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - simple_unlink(dir->d_inode, dentry); + simple_unlink(d_inode(dir), dentry); /* XXX: what was dcache_lock protecting here? 
Other * filesystems (IB, configfs) release dcache_lock * before unlink */ @@ -180,7 +180,7 @@ static void spufs_prune_dir(struct dentry *dir) } } shrink_dcache_parent(dir); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); } /* Caller must hold parent->i_mutex */ @@ -192,7 +192,7 @@ static int spufs_rmdir(struct inode *parent, struct dentry *dir) d_drop(dir); res = simple_rmdir(parent, dir); /* We have to give up the mm_struct */ - spu_forget(SPUFS_I(dir->d_inode)->i_ctx); + spu_forget(SPUFS_I(d_inode(dir))->i_ctx); return res; } @@ -222,8 +222,8 @@ static int spufs_dir_close(struct inode *inode, struct file *file) int ret; dir = file->f_path.dentry; - parent = dir->d_parent->d_inode; - ctx = SPUFS_I(dir->d_inode)->i_ctx; + parent = d_inode(dir->d_parent); + ctx = SPUFS_I(d_inode(dir))->i_ctx; mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT); ret = spufs_rmdir(parent, dir); @@ -460,7 +460,7 @@ spufs_create_context(struct inode *inode, struct dentry *dentry, goto out_aff_unlock; if (affinity) { - spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx, + spufs_set_affinity(flags, SPUFS_I(d_inode(dentry))->i_ctx, neighbor); if (neighbor) put_spu_context(neighbor); @@ -504,7 +504,7 @@ spufs_mkgang(struct inode *dir, struct dentry *dentry, umode_t mode) d_instantiate(dentry, inode); inc_nlink(dir); - inc_nlink(dentry->d_inode); + inc_nlink(d_inode(dentry)); return ret; out_iput: @@ -561,7 +561,7 @@ static struct file_system_type spufs_type; long spufs_create(struct path *path, struct dentry *dentry, unsigned int flags, umode_t mode, struct file *filp) { - struct inode *dir = path->dentry->d_inode; + struct inode *dir = d_inode(path->dentry); int ret; /* check if we are on spufs */ diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 80db43944afe..6eb808ff637e 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -24,12 +24,22 @@ struct powernv_rng { void __iomem *regs; + void __iomem *regs_real; unsigned long mask; }; static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); +int powernv_hwrng_present(void) +{ + struct powernv_rng *rng; + + rng = get_cpu_var(powernv_rng); + put_cpu_var(rng); + return rng != NULL; +} + static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) { unsigned long parity; @@ -46,6 +56,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) return val; } +int powernv_get_random_real_mode(unsigned long *v) +{ + struct powernv_rng *rng; + + rng = raw_cpu_read(powernv_rng); + + *v = rng_whiten(rng, in_rm64(rng->regs_real)); + + return 1; +} + int powernv_get_random_long(unsigned long *v) { struct powernv_rng *rng; @@ -80,12 +101,20 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng, static __init int rng_create(struct device_node *dn) { struct powernv_rng *rng; + struct resource res; unsigned long val; rng = kzalloc(sizeof(*rng), GFP_KERNEL); if (!rng) return -ENOMEM; + if (of_address_to_resource(dn, 0, &res)) { + kfree(rng); + return -ENXIO; + } + + rng->regs_real = (void __iomem *)res.start; + rng->regs = of_iomap(dn, 0); if (!rng->regs) { kfree(rng); diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 3f5c799b7fb5..d3f896a35b98 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -48,7 +48,7 @@ static struct dentry *hypfs_last_dentry; static void hypfs_update_update(struct super_block *sb) { struct hypfs_sb_info *sb_info = sb->s_fs_info; - struct inode *inode = 
sb_info->update_file->d_inode; + struct inode *inode = d_inode(sb_info->update_file); sb_info->last_update = get_seconds(); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -64,7 +64,7 @@ static void hypfs_add_dentry(struct dentry *dentry) static inline int hypfs_positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } static void hypfs_remove(struct dentry *dentry) @@ -72,16 +72,16 @@ static void hypfs_remove(struct dentry *dentry) struct dentry *parent; parent = dentry->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); if (hypfs_positive(dentry)) { if (d_is_dir(dentry)) - simple_rmdir(parent->d_inode, dentry); + simple_rmdir(d_inode(parent), dentry); else - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); } d_delete(dentry); dput(dentry); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); } static void hypfs_delete_tree(struct dentry *root) @@ -336,7 +336,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, struct dentry *dentry; struct inode *inode; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) { dentry = ERR_PTR(-ENOMEM); @@ -357,14 +357,14 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, } else if (S_ISDIR(mode)) { inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - inc_nlink(parent->d_inode); + inc_nlink(d_inode(parent)); } else BUG(); inode->i_private = data; d_instantiate(dentry, inode); dget(dentry); fail: - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return dentry; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index afa2bd750ffc..8cd8e7b288c5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -110,7 +110,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { /* upper facilities limit for kvm */ unsigned long kvm_s390_fac_list_mask[] = { 0xffe6fffbfcfdfc40UL, - 0x205c800000000000UL, + 0x005c800000000000UL, }; unsigned long kvm_s390_fac_list_mask_size(void) diff --git a/arch/sh/boards/board-sh7757lcr.c b/arch/sh/boards/board-sh7757lcr.c index 669df51a82e3..324599bfad14 100644 --- a/arch/sh/boards/board-sh7757lcr.c +++ b/arch/sh/boards/board-sh7757lcr.c @@ -17,6 +17,7 @@ #include <linux/spi/spi.h> #include <linux/spi/flash.h> #include <linux/io.h> +#include <linux/mfd/tmio.h> #include <linux/mmc/host.h> #include <linux/mmc/sh_mmcif.h> #include <linux/mmc/sh_mobile_sdhi.h> @@ -243,10 +244,10 @@ static struct platform_device sh_mmcif_device = { }; /* SDHI0 */ -static struct sh_mobile_sdhi_info sdhi_info = { - .dma_slave_tx = SHDMA_SLAVE_SDHI_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI_RX, - .tmio_caps = MMC_CAP_SD_HIGHSPEED, +static struct tmio_mmc_data sdhi_info = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI_RX, + .capabilities = MMC_CAP_SD_HIGHSPEED, }; static struct resource sdhi_resources[] = { diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c index d4b01d4cc102..cbd2a9f02a91 100644 --- a/arch/sh/boards/mach-ap325rxa/setup.c +++ b/arch/sh/boards/mach-ap325rxa/setup.c @@ -18,6 +18,7 @@ #include <linux/mmc/sh_mobile_sdhi.h> #include <linux/mtd/physmap.h> #include <linux/mtd/sh_flctl.h> +#include 
<linux/mfd/tmio.h> #include <linux/delay.h> #include <linux/i2c.h> #include <linux/regulator/fixed.h> @@ -447,8 +448,8 @@ static struct resource sdhi0_cn3_resources[] = { }, }; -static struct sh_mobile_sdhi_info sdhi0_cn3_data = { - .tmio_caps = MMC_CAP_SDIO_IRQ, +static struct tmio_mmc_data sdhi0_cn3_data = { + .capabilities = MMC_CAP_SDIO_IRQ, }; static struct platform_device sdhi0_cn3_device = { @@ -474,8 +475,8 @@ static struct resource sdhi1_cn7_resources[] = { }, }; -static struct sh_mobile_sdhi_info sdhi1_cn7_data = { - .tmio_caps = MMC_CAP_SDIO_IRQ, +static struct tmio_mmc_data sdhi1_cn7_data = { + .capabilities = MMC_CAP_SDIO_IRQ, }; static struct platform_device sdhi1_cn7_device = { diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 0d3049244cd3..d531791f06ff 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -601,12 +601,12 @@ static struct platform_device sdhi0_power = { }, }; -static struct sh_mobile_sdhi_info sdhi0_info = { - .dma_slave_tx = SHDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI0_RX, - .tmio_caps = MMC_CAP_SDIO_IRQ | MMC_CAP_POWER_OFF_CARD | +static struct tmio_mmc_data sdhi0_info = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI0_RX, + .capabilities = MMC_CAP_SDIO_IRQ | MMC_CAP_POWER_OFF_CARD | MMC_CAP_NEEDS_POLL, - .tmio_flags = TMIO_MMC_USE_GPIO_CD, + .flags = TMIO_MMC_USE_GPIO_CD, .cd_gpio = GPIO_PTY7, }; @@ -635,12 +635,12 @@ static struct platform_device sdhi0_device = { #if !defined(CONFIG_MMC_SH_MMCIF) && !defined(CONFIG_MMC_SH_MMCIF_MODULE) /* SDHI1 */ -static struct sh_mobile_sdhi_info sdhi1_info = { - .dma_slave_tx = SHDMA_SLAVE_SDHI1_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI1_RX, - .tmio_caps = MMC_CAP_SDIO_IRQ | MMC_CAP_POWER_OFF_CARD | +static struct tmio_mmc_data sdhi1_info = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI1_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI1_RX, + .capabilities = MMC_CAP_SDIO_IRQ | MMC_CAP_POWER_OFF_CARD | MMC_CAP_NEEDS_POLL, - .tmio_flags = TMIO_MMC_USE_GPIO_CD, + .flags = TMIO_MMC_USE_GPIO_CD, .cd_gpio = GPIO_PTW7, }; diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index 1df4398f8375..7d997cec09c5 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -373,11 +373,11 @@ static struct resource kfr2r09_sh_sdhi0_resources[] = { }, }; -static struct sh_mobile_sdhi_info sh7724_sdhi0_data = { - .dma_slave_tx = SHDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI0_RX, - .tmio_flags = TMIO_MMC_WRPROTECT_DISABLE, - .tmio_caps = MMC_CAP_SDIO_IRQ, +static struct tmio_mmc_data sh7724_sdhi0_data = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI0_RX, + .flags = TMIO_MMC_WRPROTECT_DISABLE, + .capabilities = MMC_CAP_SDIO_IRQ, }; static struct platform_device kfr2r09_sh_sdhi0_device = { diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index 8b73194ed2ce..29b7c0dcfc51 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -15,6 +15,7 @@ #include <linux/mmc/host.h> #include <linux/mmc/sh_mobile_sdhi.h> #include <linux/mtd/physmap.h> +#include <linux/mfd/tmio.h> #include <linux/mtd/nand.h> #include <linux/i2c.h> #include <linux/regulator/fixed.h> @@ -408,10 +409,10 @@ static struct resource sdhi_cn9_resources[] = { }, }; -static struct sh_mobile_sdhi_info sh7724_sdhi_data = { - .dma_slave_tx = 
SHDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI0_RX, - .tmio_caps = MMC_CAP_SDIO_IRQ, +static struct tmio_mmc_data sh7724_sdhi_data = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI0_RX, + .capabilities = MMC_CAP_SDIO_IRQ, }; static struct platform_device sdhi_cn9_device = { diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c index 1162bc6945a3..4f6635a075f2 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -16,6 +16,7 @@ #include <linux/platform_device.h> #include <linux/mmc/host.h> #include <linux/mmc/sh_mobile_sdhi.h> +#include <linux/mfd/tmio.h> #include <linux/mtd/physmap.h> #include <linux/delay.h> #include <linux/regulator/fixed.h> @@ -468,10 +469,10 @@ static struct resource sdhi0_cn7_resources[] = { }, }; -static struct sh_mobile_sdhi_info sh7724_sdhi0_data = { - .dma_slave_tx = SHDMA_SLAVE_SDHI0_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI0_RX, - .tmio_caps = MMC_CAP_SDIO_IRQ, +static struct tmio_mmc_data sh7724_sdhi0_data = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI0_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI0_RX, + .capabilities = MMC_CAP_SDIO_IRQ, }; static struct platform_device sdhi0_cn7_device = { @@ -497,10 +498,10 @@ static struct resource sdhi1_cn8_resources[] = { }, }; -static struct sh_mobile_sdhi_info sh7724_sdhi1_data = { - .dma_slave_tx = SHDMA_SLAVE_SDHI1_TX, - .dma_slave_rx = SHDMA_SLAVE_SDHI1_RX, - .tmio_caps = MMC_CAP_SDIO_IRQ, +static struct tmio_mmc_data sh7724_sdhi1_data = { + .chan_priv_tx = (void *)SHDMA_SLAVE_SDHI1_TX, + .chan_priv_rx = (void *)SHDMA_SLAVE_SDHI1_RX, + .capabilities = MMC_CAP_SDIO_IRQ, }; static struct platform_device sdhi1_cn8_device = { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6049d587599e..226d5696e1d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index a4771dcd1fcf..1f20b35d8573 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -79,7 +79,7 @@ NUM_BLKS = %rdx c = %rcx d = %r8 e = %rdx -y3 = %rdi +y3 = %rsi TBL = %rbp diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a821b1cd4fa7..72bf2680f819 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -427,6 +427,13 @@ sysretl_from_sys_call: * cs and ss are loaded from MSRs. * (Note: 32bit->32bit SYSRET is different: since r11 * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. 
*/ USERGS_SYSRET32 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ee9b94d9921..3d6606fb97d0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -265,6 +265,7 @@ #define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index e2d4a4afa8c3..3bbc07a57a31 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -20,13 +20,10 @@ extern unsigned long switcher_addr; /* Found in switcher.S */ extern unsigned long default_idt_entries[]; -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; +/* Declarations for definitions in arch/x86/lguest/head_32.S */ +extern char lguest_noirq_iret[]; extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; extern void lguest_iret(void); extern void lguest_init(void); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 803b684676ff..dbe76a14c3c9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -757,7 +757,7 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) } /* wrapper to silence section mismatch warning */ -int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu) +int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { return _acpi_map_lsapic(handle, physid, pcpu); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd470ebf924e..e4cf63301ff4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); + + /* AMD CPUs don't reset SS attributes on SYSRET */ + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7b238494b31..02c2eff7478d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -295,6 +295,15 @@ system_call_fastpath: * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we should + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. (Actually + * detecting the failure in 64-bit userspace is tricky but can be + * done.) 
*/ USERGS_SYSRET64 diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4baaa972f52a..ddfdbf74f174 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { + /* + * AMD CPUs have a misfeature: SYSRET sets the SS selector but + * does not update the cached descriptor. As a result, if we + * do SYSRET while SS is NULL, we'll end up in user mode with + * SS apparently equal to __USER_DS but actually unusable. + * + * The straightforward workaround would be to fix it up just + * before SYSRET, but that would slow down the system call + * fast paths. Instead, we ensure that SS is never NULL in + * system call context. We do this by replacing NULL SS + * selectors at every context switch. SYSCALL sets up a valid + * SS, so the only way to get NULL is to re-enter the kernel + * from CPL 3 through an interrupt. Since that can't happen + * in the same task as a running syscall, we are guaranteed to + * context switch between every interrupt vector entry and a + * subsequent SYSRET. + * + * We read SS first because SS reads are much faster than + * writes. Out of caution, we force SS to __KERNEL_DS even if + * it previously had a different non-NULL value. + */ + unsigned short ss_sel; + savesegment(ss, ss_sel); + if (ss_sel != __KERNEL_DS) + loadsegment(ss, __KERNEL_DS); + } + return prev_p; } diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 6eb5c20ee373..d090ecf08809 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -666,7 +666,7 @@ static int probe_sysfs_permissions(struct pci_dev *dev) if (r) return r; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); path_put(&path); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d67206a7b99a..629af0f1c5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, unsigned long bitmap = 1; struct kvm_lapic **dst; int i; - bool ret = false; - bool x2apic_ipi = src && apic_x2apic_mode(src); + bool ret, x2apic_ipi; *r = -1; @@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, if (irq->shorthand) return false; + x2apic_ipi = src && apic_x2apic_mode(src); if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) return false; + ret = true; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) + if (!map) { + ret = false; goto out; - - ret = true; + } if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 146f295ee322..d43867c33bc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn = spte_to_pfn(*sptep); /* - * Only EPT supported for now; otherwise, one would need to - * find out efficiently whether the guest page tables are - * also using huge pages. 
+ * We cannot do huge page mapping for indirect shadow pages, + * which are found on the last rmap (level = 1) when not using + * tdp; such shadow pages are synced with the page table in + * the guest, and the guest page table is using 4K page size + * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && @@ -4504,19 +4506,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, bool flush = false; unsigned long *rmapp; unsigned long last_index, index; - gfn_t gfn_start, gfn_end; spin_lock(&kvm->mmu_lock); - gfn_start = memslot->base_gfn; - gfn_end = memslot->base_gfn + memslot->npages - 1; - - if (gfn_start >= gfn_end) - goto out; - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(gfn_end, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); + last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, + memslot->base_gfn, PT_PAGE_TABLE_LEVEL); for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) @@ -4534,7 +4529,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); -out: spin_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f5e8dce8046c..f7b61687bd79 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + /* + * Pass through host's Machine Check Enable value to hw_cr4, which + * is in force while we are in guest mode. Do not let guests control + * this bit, even if host CR4.MCE == 0. + */ + unsigned long hw_cr4 = + (cr4_read_shadow() & X86_CR4_MCE) | + (cr4 & ~X86_CR4_MCE) | + (to_vmx(vcpu)->rmode.vm86_active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); if (cr4 & X86_CR4_VMXE) { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1a81267f3f6..ed31c31b2485 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5799,7 +5799,6 @@ int kvm_arch_init(void *opaque) kvm_set_mmio_spte_mask(); kvm_x86_ops = ops; - kvm_init_msr_list(); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -7253,7 +7252,14 @@ void kvm_arch_hardware_disable(void) int kvm_arch_hardware_setup(void) { - return kvm_x86_ops->hardware_setup(); + int r; + + r = kvm_x86_ops->hardware_setup(); + if (r != 0) + return r; + + kvm_init_msr_list(); + return 0; } void kvm_arch_hardware_unsetup(void) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 717908b16037..8f9a133cc099 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -87,8 +87,7 @@ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_start = (u32)lguest_noirq_start, - .noirq_end = (u32)lguest_noirq_end, + .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ .syscall_vec = SYSCALL_VECTOR, @@ -262,7 +261,7 @@ PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); /*:*/ -/* These are in i386_head.S */ +/* These are in head_32.S */ extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); @@ -1368,7 +1367,7 @@ static void lguest_restart(char *reason) * fit comfortably. 
* * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. + * and these are in head_32.S. */ /*G:060 We construct a table from the assembler templates: */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S index 6ddfe4fc23c3..d5ae63f5ec5d 100644 --- a/arch/x86/lguest/head_32.S +++ b/arch/x86/lguest/head_32.S @@ -84,7 +84,7 @@ ENTRY(lg_irq_enable) * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we * jump to send_interrupts, otherwise we're done. */ - testl $0, lguest_data+LGUEST_DATA_irq_pending + cmpl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts /* * One cool thing about x86 is that you can do many things without using @@ -133,9 +133,8 @@ ENTRY(lg_restore_fl) ret /*:*/ -/* These demark the EIP range where host should never deliver interrupts. */ -.global lguest_noirq_start -.global lguest_noirq_end +/* This marks the EIP where the host should never deliver interrupts. */ +.global lguest_noirq_iret /*M:004 * When the Host reflects a trap or injects an interrupt into the Guest, it @@ -168,29 +167,26 @@ ENTRY(lg_restore_fl) * So we have to copy eflags from the stack to lguest_data.irq_enabled before * we do the "iret". * - * There are two problems with this: firstly, we need to use a register to do - * the copy and secondly, the whole thing needs to be atomic. The first - * problem is easy to solve: push %eax on the stack so we can use it, and then - * restore it at the end just before the real "iret". + * There are two problems with this: firstly, we can't clobber any registers + * and secondly, the whole thing needs to be atomic. The first problem + * is solved by using a "push memory"/"pop memory" instruction pair for copying. * * The second is harder: copying eflags to lguest_data.irq_enabled will turn * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to surround the - * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the + * return to userspace or wherever. Our solution to this is to tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. + * enabled. (It's not necessary to protect the pop instruction, since + * data gets updated only after it completes, so we only need to protect + * one instruction, iret.) */ ENTRY(lguest_iret) - pushl %eax - movl 12(%esp), %eax -lguest_noirq_start: + pushl 2*4(%esp) /* * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: * prefix makes sure we use the stack segment, which is still valid.
*/ - movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled - popl %eax + popl %ss:lguest_data+LGUEST_DATA_irq_enabled +lguest_noirq_iret: iret -lguest_noirq_end: diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 1f33b3d1fd68..0a42327a59d7 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -82,7 +82,7 @@ copy_user_handle_tail(char *to, char *from, unsigned len) clac(); /* If the destination is a kernel buffer, we always clear the end */ - if ((unsigned long)to >= TASK_SIZE_MAX) + if (!__addr_ok(to)) memset(to, 0, len); return len; } diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index d05327caf69d..5d355e0c2633 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, { void **srcs; int i; + int start = -1, stop = disks - 3; if (submit->scribble) srcs = submit->scribble; @@ -134,10 +135,21 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks, if (blocks[i] == NULL) { BUG_ON(i > disks - 3); /* P or Q can't be zero */ srcs[i] = (void*)raid6_empty_zero_page; - } else + } else { srcs[i] = page_address(blocks[i]) + offset; + if (i < disks - 2) { + stop = i; + if (start == -1) + start = i; + } + } } - raid6_call.gen_syndrome(disks, len, srcs); + if (submit->flags & ASYNC_TX_PQ_XOR_DST) { + BUG_ON(!raid6_call.xor_syndrome); + if (start >= 0) + raid6_call.xor_syndrome(disks, start, stop, len, srcs); + } else + raid6_call.gen_syndrome(disks, len, srcs); async_tx_sync_epilog(submit); } @@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks, if (device) unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO); - if (unmap && + /* XORing P/Q is only implemented in software */ + if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) && (src_cnt <= dma_maxpq(device, 0) || dma_maxpq(device, DMA_PREP_CONTINUE) > 0) && is_dma_pq_aligned(device, offset, 0, len)) { diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index e6c3ddd92665..ab2cbb51c6aa 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -5,7 +5,7 @@ menuconfig ACPI bool "ACPI (Advanced Configuration and Power Interface) Support" depends on !IA64_HP_SIM - depends on IA64 || X86 + depends on IA64 || X86 || (ARM64 && EXPERT) depends on PCI select PNP default y @@ -48,9 +48,16 @@ config ACPI_LEGACY_TABLES_LOOKUP config ARCH_MIGHT_HAVE_ACPI_PDC bool +config ACPI_GENERIC_GSI + bool + +config ACPI_SYSTEM_POWER_STATES_SUPPORT + bool + config ACPI_SLEEP bool depends on SUSPEND || HIBERNATION + depends on ACPI_SYSTEM_POWER_STATES_SUPPORT default y config ACPI_PROCFS_POWER @@ -163,6 +170,7 @@ config ACPI_PROCESSOR tristate "Processor" select THERMAL select CPU_IDLE + depends on X86 || IA64 default y help This driver installs ACPI as the idle handler for Linux and uses diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 623b117ad1a2..8a063e276530 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -23,7 +23,7 @@ acpi-y += nvs.o # Power management related files acpi-y += wakeup.o -acpi-y += sleep.o +acpi-$(CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT) += sleep.o acpi-y += device_pm.o acpi-$(CONFIG_ACPI_SLEEP) += proc.o @@ -56,6 +56,7 @@ ifdef CONFIG_ACPI_VIDEO acpi-y += video_detect.o endif acpi-y += acpi_lpat.o +acpi-$(CONFIG_ACPI_GENERIC_GSI) += gsi.o # These are (potentially) separate modules diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 
1020b1b53a17..58f335ca2e75 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -170,7 +170,7 @@ static int acpi_processor_hotadd_init(struct acpi_processor *pr) acpi_status status; int ret; - if (pr->phys_id == -1) + if (pr->phys_id == PHYS_CPUID_INVALID) return -ENODEV; status = acpi_evaluate_integer(pr->handle, "_STA", NULL, &sta); @@ -215,7 +215,8 @@ static int acpi_processor_get_info(struct acpi_device *device) union acpi_object object = { 0 }; struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; struct acpi_processor *pr = acpi_driver_data(device); - int phys_id, cpu_index, device_declaration = 0; + phys_cpuid_t phys_id; + int cpu_index, device_declaration = 0; acpi_status status = AE_OK; static int cpu0_initialized; unsigned long long value; @@ -263,7 +264,7 @@ static int acpi_processor_get_info(struct acpi_device *device) } phys_id = acpi_get_phys_id(pr->handle, device_declaration, pr->acpi_id); - if (phys_id < 0) + if (phys_id == PHYS_CPUID_INVALID) acpi_handle_debug(pr->handle, "failed to get CPU physical ID.\n"); pr->phys_id = phys_id; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 8b67bd0f6bb5..c412fdb28d34 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -448,6 +448,9 @@ static int __init acpi_bus_init_irq(void) case ACPI_IRQ_MODEL_IOSAPIC: message = "IOSAPIC"; break; + case ACPI_IRQ_MODEL_GIC: + message = "GIC"; + break; case ACPI_IRQ_MODEL_PLATFORM: message = "platform specific model"; break; diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 220d6406c9e9..5e8fed448850 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -861,7 +861,7 @@ void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit) } } mutex_unlock(&ec->mutex); - list_for_each_entry(handler, &free_list, node) + list_for_each_entry_safe(handler, tmp, &free_list, node) acpi_ec_put_query_handler(handler); } EXPORT_SYMBOL_GPL(acpi_ec_remove_query_handler); diff --git a/drivers/acpi/gsi.c b/drivers/acpi/gsi.c new file mode 100644 index 000000000000..38208f2d0e69 --- /dev/null +++ b/drivers/acpi/gsi.c @@ -0,0 +1,105 @@ +/* + * ACPI GSI IRQ layer + * + * Copyright (C) 2015 ARM Ltd. + * Author: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/acpi.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> + +enum acpi_irq_model_id acpi_irq_model; + +static unsigned int acpi_gsi_get_irq_type(int trigger, int polarity) +{ + switch (polarity) { + case ACPI_ACTIVE_LOW: + return trigger == ACPI_EDGE_SENSITIVE ? + IRQ_TYPE_EDGE_FALLING : + IRQ_TYPE_LEVEL_LOW; + case ACPI_ACTIVE_HIGH: + return trigger == ACPI_EDGE_SENSITIVE ? 
+ IRQ_TYPE_EDGE_RISING : + IRQ_TYPE_LEVEL_HIGH; + case ACPI_ACTIVE_BOTH: + if (trigger == ACPI_EDGE_SENSITIVE) + return IRQ_TYPE_EDGE_BOTH; + default: + return IRQ_TYPE_NONE; + } +} + +/** + * acpi_gsi_to_irq() - Retrieve the linux irq number for a given GSI + * @gsi: GSI IRQ number to map + * @irq: pointer where linux IRQ number is stored + * + * irq location updated with irq value [>0 on success, 0 on failure] + * + * Returns: linux IRQ number on success (>0) + * -EINVAL on failure + */ +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) +{ + /* + * Only default domain is supported at present, always find + * the mapping corresponding to default domain by passing NULL + * as irq_domain parameter + */ + *irq = irq_find_mapping(NULL, gsi); + /* + * *irq == 0 means no mapping, that should + * be reported as a failure + */ + return (*irq > 0) ? *irq : -EINVAL; +} +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); + +/** + * acpi_register_gsi() - Map a GSI to a linux IRQ number + * @dev: device for which IRQ has to be mapped + * @gsi: GSI IRQ number + * @trigger: trigger type of the GSI number to be mapped + * @polarity: polarity of the GSI to be mapped + * + * Returns: a valid linux IRQ number on success + * -EINVAL on failure + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ + unsigned int irq; + unsigned int irq_type = acpi_gsi_get_irq_type(trigger, polarity); + + /* + * There is no way at present to look-up the IRQ domain on ACPI, + * hence always create mapping referring to the default domain + * by passing NULL as irq_domain parameter + */ + irq = irq_create_mapping(NULL, gsi); + if (!irq) + return -EINVAL; + + /* Set irq type if specified and different than the current one */ + if (irq_type != IRQ_TYPE_NONE && + irq_type != irq_get_trigger_type(irq)) + irq_set_irq_type(irq, irq_type); + return irq; +} +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +/** + * acpi_unregister_gsi() - Free a GSI<->linux IRQ number mapping + * @gsi: GSI IRQ number + */ +void acpi_unregister_gsi(u32 gsi) +{ + int irq = irq_find_mapping(NULL, gsi); + + irq_dispose_mapping(irq); +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 56b321aa2b1c..ba4a61e964be 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -161,7 +161,11 @@ void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit); /*-------------------------------------------------------------------------- Suspend/Resume -------------------------------------------------------------------------- */ +#ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT extern int acpi_sleep_init(void); +#else +static inline int acpi_sleep_init(void) { return -ENXIO; } +#endif #ifdef CONFIG_ACPI_SLEEP int acpi_sleep_proc_init(void); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index f9eeae871593..39748bb3a543 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -336,11 +336,11 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size) return NULL; } -#ifndef CONFIG_IA64 -#define should_use_kmap(pfn) page_is_ram(pfn) -#else +#if defined(CONFIG_IA64) || defined(CONFIG_ARM64) /* ioremap will take care of cache attributes */ #define should_use_kmap(pfn) 0 +#else +#define should_use_kmap(pfn) page_is_ram(pfn) #endif static void __iomem *acpi_map(acpi_physical_address pg_off, unsigned long pg_sz) diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 7962651cdbd4..b1ec78b8a645 100644 --- a/drivers/acpi/processor_core.c +++ 
b/drivers/acpi/processor_core.c @@ -32,7 +32,7 @@ static struct acpi_table_madt *get_madt_table(void) } static int map_lapic_id(struct acpi_subtable_header *entry, - u32 acpi_id, int *apic_id) + u32 acpi_id, phys_cpuid_t *apic_id) { struct acpi_madt_local_apic *lapic = container_of(entry, struct acpi_madt_local_apic, header); @@ -48,7 +48,7 @@ static int map_lapic_id(struct acpi_subtable_header *entry, } static int map_x2apic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, int *apic_id) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) { struct acpi_madt_local_x2apic *apic = container_of(entry, struct acpi_madt_local_x2apic, header); @@ -65,7 +65,7 @@ static int map_x2apic_id(struct acpi_subtable_header *entry, } static int map_lsapic_id(struct acpi_subtable_header *entry, - int device_declaration, u32 acpi_id, int *apic_id) + int device_declaration, u32 acpi_id, phys_cpuid_t *apic_id) { struct acpi_madt_local_sapic *lsapic = container_of(entry, struct acpi_madt_local_sapic, header); @@ -83,10 +83,35 @@ static int map_lsapic_id(struct acpi_subtable_header *entry, return 0; } -static int map_madt_entry(int type, u32 acpi_id) +/* + * Retrieve the ARM CPU physical identifier (MPIDR) + */ +static int map_gicc_mpidr(struct acpi_subtable_header *entry, + int device_declaration, u32 acpi_id, phys_cpuid_t *mpidr) +{ + struct acpi_madt_generic_interrupt *gicc = + container_of(entry, struct acpi_madt_generic_interrupt, header); + + if (!(gicc->flags & ACPI_MADT_ENABLED)) + return -ENODEV; + + /* device_declaration means Device object in DSDT, in the + * GIC interrupt model, logical processors are required to + * have a Processor Device object in the DSDT, so we should + * check device_declaration here + */ + if (device_declaration && (gicc->uid == acpi_id)) { + *mpidr = gicc->arm_mpidr; + return 0; + } + + return -EINVAL; +} + +static phys_cpuid_t map_madt_entry(int type, u32 acpi_id) { unsigned long madt_end, entry; - int phys_id = -1; /* CPU hardware ID */ + phys_cpuid_t phys_id = PHYS_CPUID_INVALID; /* CPU hardware ID */ struct acpi_table_madt *madt; madt = get_madt_table(); @@ -111,18 +136,21 @@ static int map_madt_entry(int type, u32 acpi_id) } else if (header->type == ACPI_MADT_TYPE_LOCAL_SAPIC) { if (!map_lsapic_id(header, type, acpi_id, &phys_id)) break; + } else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) { + if (!map_gicc_mpidr(header, type, acpi_id, &phys_id)) + break; } entry += header->length; } return phys_id; } -static int map_mat_entry(acpi_handle handle, int type, u32 acpi_id) +static phys_cpuid_t map_mat_entry(acpi_handle handle, int type, u32 acpi_id) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; struct acpi_subtable_header *header; - int phys_id = -1; + phys_cpuid_t phys_id = PHYS_CPUID_INVALID; if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) goto exit; @@ -143,33 +171,35 @@ static int map_mat_entry(acpi_handle handle, int type, u32 acpi_id) map_lsapic_id(header, type, acpi_id, &phys_id); else if (header->type == ACPI_MADT_TYPE_LOCAL_X2APIC) map_x2apic_id(header, type, acpi_id, &phys_id); + else if (header->type == ACPI_MADT_TYPE_GENERIC_INTERRUPT) + map_gicc_mpidr(header, type, acpi_id, &phys_id); exit: kfree(buffer.pointer); return phys_id; } -int acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) +phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id) { - int phys_id; + phys_cpuid_t phys_id; phys_id = map_mat_entry(handle, type, 
acpi_id); - if (phys_id == -1) + if (phys_id == PHYS_CPUID_INVALID) phys_id = map_madt_entry(type, acpi_id); return phys_id; } -int acpi_map_cpuid(int phys_id, u32 acpi_id) +int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id) { #ifdef CONFIG_SMP int i; #endif - if (phys_id == -1) { + if (phys_id == PHYS_CPUID_INVALID) { /* * On UP processor, there is no _MAT or MADT table. - * So above phys_id is always set to -1. + * So above phys_id is always set to PHYS_CPUID_INVALID. * * BIOS may define multiple CPU handles even for UP processor. * For example, @@ -190,7 +220,7 @@ int acpi_map_cpuid(int phys_id, u32 acpi_id) if (nr_cpu_ids <= 1 && acpi_id == 0) return acpi_id; else - return phys_id; + return -1; } #ifdef CONFIG_SMP @@ -208,7 +238,7 @@ int acpi_map_cpuid(int phys_id, u32 acpi_id) int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) { - int phys_id; + phys_cpuid_t phys_id; phys_id = acpi_get_phys_id(handle, type, acpi_id); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 69bc0d888c01..03141aa4ea95 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -375,7 +375,11 @@ bool acpi_scan_is_offline(struct acpi_device *adev, bool uevent) struct acpi_device_physical_node *pn; bool offline = true; - mutex_lock(&adev->physical_node_lock); + /* + * acpi_container_offline() calls this for all of the container's + * children under the container's physical_node_lock lock. + */ + mutex_lock_nested(&adev->physical_node_lock, SINGLE_DEPTH_NESTING); list_for_each_entry(pn, &adev->physical_node_list, node) if (device_supports_offline(pn->dev) && !pn->dev->offline) { @@ -2388,9 +2392,6 @@ static void acpi_default_enumeration(struct acpi_device *device) struct list_head resource_list; bool is_spi_i2c_slave = false; - if (!device->pnp.type.platform_id || device->handler) - return; - /* * Do not enumerate SPI/I2C slaves as they will be enumerated by their * respective parents. @@ -2403,6 +2404,29 @@ static void acpi_default_enumeration(struct acpi_device *device) acpi_create_platform_device(device); } +static const struct acpi_device_id generic_device_ids[] = { + {"PRP0001", }, + {"", }, +}; + +static int acpi_generic_device_attach(struct acpi_device *adev, + const struct acpi_device_id *not_used) +{ + /* + * Since PRP0001 is the only ID handled here, the test below can be + * unconditional. + */ + if (adev->data.of_compatible) + acpi_default_enumeration(adev); + + return 1; +} + +static struct acpi_scan_handler generic_device_handler = { + .ids = generic_device_ids, + .attach = acpi_generic_device_attach, +}; + static int acpi_scan_attach_handler(struct acpi_device *device) { struct acpi_hardware_id *hwid; @@ -2428,8 +2452,6 @@ static int acpi_scan_attach_handler(struct acpi_device *device) break; } } - if (!ret) - acpi_default_enumeration(device); return ret; } @@ -2471,6 +2493,9 @@ static void acpi_bus_attach(struct acpi_device *device) ret = device_attach(&device->dev); if (ret < 0) return; + + if (!ret && device->pnp.type.platform_id) + acpi_default_enumeration(device); } device->flags.visited = true; @@ -2629,6 +2654,8 @@ int __init acpi_scan_init(void) acpi_pnp_init(); acpi_int340x_thermal_init(); + acpi_scan_add_handler(&generic_device_handler); + mutex_lock(&acpi_scan_lock); /* * Enumerate devices in the ACPI namespace.
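
For reference, the generic GSI layer introduced in drivers/acpi/gsi.c above is consumed through three entry points. A minimal sketch of a caller follows, not part of this series: the demo_* names are hypothetical, the GSI value would normally come from the device's ACPI resources, and only acpi_register_gsi(), acpi_gsi_to_irq() and acpi_unregister_gsi() are interfaces added by the patch.

    #include <linux/acpi.h>
    #include <linux/interrupt.h>

    /* Hypothetical interrupt handler, for illustration only. */
    static irqreturn_t demo_isr(int irq, void *data)
    {
            return IRQ_HANDLED;
    }

    /*
     * Map a GSI through the default irqdomain (see acpi_register_gsi()
     * in gsi.c) and claim the resulting linux IRQ.
     */
    static int demo_wire_gsi(struct device *dev, u32 gsi)
    {
            int irq;

            irq = acpi_register_gsi(dev, gsi, ACPI_LEVEL_SENSITIVE,
                                    ACPI_ACTIVE_HIGH);
            if (irq < 0)
                    return irq;     /* -EINVAL: no mapping could be created */

            return request_irq(irq, demo_isr, 0, "demo", NULL);
    }

    /* Tear down: release the handler, then dispose of the mapping. */
    static void demo_unwire_gsi(u32 gsi)
    {
            unsigned int irq;

            if (acpi_gsi_to_irq(gsi, &irq) > 0) {
                    free_irq(irq, NULL);
                    acpi_unregister_gsi(gsi);
            }
    }

Note the asymmetry the kernel-doc above documents: acpi_register_gsi() returns the linux IRQ number directly, while acpi_gsi_to_irq() reports it through its irq argument and returns -EINVAL when irq_find_mapping() yields no mapping.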
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 93b81523a2fe..2e19189da0ee 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -23,6 +23,8 @@ * */ +/* Uncomment next line to get verbose printout */ +/* #define DEBUG */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/init.h> @@ -61,9 +63,9 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) { struct acpi_madt_local_apic *p = (struct acpi_madt_local_apic *)header; - pr_info("LAPIC (acpi_id[0x%02x] lapic_id[0x%02x] %s)\n", - p->processor_id, p->id, - (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + pr_debug("LAPIC (acpi_id[0x%02x] lapic_id[0x%02x] %s)\n", + p->processor_id, p->id, + (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); } break; @@ -71,9 +73,9 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) { struct acpi_madt_local_x2apic *p = (struct acpi_madt_local_x2apic *)header; - pr_info("X2APIC (apic_id[0x%02x] uid[0x%02x] %s)\n", - p->local_apic_id, p->uid, - (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + pr_debug("X2APIC (apic_id[0x%02x] uid[0x%02x] %s)\n", + p->local_apic_id, p->uid, + (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); } break; @@ -81,8 +83,8 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) { struct acpi_madt_io_apic *p = (struct acpi_madt_io_apic *)header; - pr_info("IOAPIC (id[0x%02x] address[0x%08x] gsi_base[%d])\n", - p->id, p->address, p->global_irq_base); + pr_debug("IOAPIC (id[0x%02x] address[0x%08x] gsi_base[%d])\n", + p->id, p->address, p->global_irq_base); } break; @@ -155,9 +157,9 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) { struct acpi_madt_io_sapic *p = (struct acpi_madt_io_sapic *)header; - pr_info("IOSAPIC (id[0x%x] address[%p] gsi_base[%d])\n", - p->id, (void *)(unsigned long)p->address, - p->global_irq_base); + pr_debug("IOSAPIC (id[0x%x] address[%p] gsi_base[%d])\n", + p->id, (void *)(unsigned long)p->address, + p->global_irq_base); } break; @@ -165,9 +167,9 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) { struct acpi_madt_local_sapic *p = (struct acpi_madt_local_sapic *)header; - pr_info("LSAPIC (acpi_id[0x%02x] lsapic_id[0x%02x] lsapic_eid[0x%02x] %s)\n", - p->processor_id, p->id, p->eid, - (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); + pr_debug("LSAPIC (acpi_id[0x%02x] lsapic_id[0x%02x] lsapic_eid[0x%02x] %s)\n", + p->processor_id, p->id, p->eid, + (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); } break; @@ -183,6 +185,28 @@ void acpi_table_print_madt_entry(struct acpi_subtable_header *header) } break; + case ACPI_MADT_TYPE_GENERIC_INTERRUPT: + { + struct acpi_madt_generic_interrupt *p = + (struct acpi_madt_generic_interrupt *)header; + pr_debug("GICC (acpi_id[0x%04x] address[%llx] MPIDR[0x%llx] %s)\n", + p->uid, p->base_address, + p->arm_mpidr, + (p->flags & ACPI_MADT_ENABLED) ? 
"enabled" : "disabled"); + + } + break; + + case ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR: + { + struct acpi_madt_generic_distributor *p = + (struct acpi_madt_generic_distributor *)header; + pr_debug("GIC Distributor (gic_id[0x%04x] address[%llx] gsi_base[%d])\n", + p->gic_id, p->base_address, + p->global_irq_base); + } + break; + default: pr_warn("Found unsupported MADT entry (type = 0x%x)\n", header->type); diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 25798db14553..68f03141e432 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -157,10 +157,10 @@ static int dev_mkdir(const char *name, umode_t mode) if (IS_ERR(dentry)) return PTR_ERR(dentry); - err = vfs_mkdir(path.dentry->d_inode, dentry, mode); + err = vfs_mkdir(d_inode(path.dentry), dentry, mode); if (!err) /* mark as kernel-created inode */ - dentry->d_inode->i_private = &thread; + d_inode(dentry)->i_private = &thread; done_path_create(&path, dentry); return err; } @@ -207,7 +207,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, if (IS_ERR(dentry)) return PTR_ERR(dentry); - err = vfs_mknod(path.dentry->d_inode, dentry, mode, dev->devt); + err = vfs_mknod(d_inode(path.dentry), dentry, mode, dev->devt); if (!err) { struct iattr newattrs; @@ -215,12 +215,12 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); /* mark as kernel-created inode */ - dentry->d_inode->i_private = &thread; + d_inode(dentry)->i_private = &thread; } done_path_create(&path, dentry); return err; @@ -235,16 +235,16 @@ static int dev_rmdir(const char *name) dentry = kern_path_locked(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (dentry->d_inode) { - if (dentry->d_inode->i_private == &thread) - err = vfs_rmdir(parent.dentry->d_inode, dentry); + if (d_really_is_positive(dentry)) { + if (d_inode(dentry)->i_private == &thread) + err = vfs_rmdir(d_inode(parent.dentry), dentry); else err = -EPERM; } else { err = -ENOENT; } dput(dentry); - mutex_unlock(&parent.dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(parent.dentry)->i_mutex); path_put(&parent); return err; } @@ -306,11 +306,11 @@ static int handle_remove(const char *nodename, struct device *dev) if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { struct kstat stat; struct path p = {.mnt = parent.mnt, .dentry = dentry}; err = vfs_getattr(&p, &stat); - if (!err && dev_mynode(dev, dentry->d_inode, &stat)) { + if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { struct iattr newattrs; /* * before unlinking this node, reset permissions @@ -321,10 +321,10 @@ static int handle_remove(const char *nodename, struct device *dev) newattrs.ia_mode = stat.mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); - err = vfs_unlink(parent.dentry->d_inode, dentry, NULL); + mutex_unlock(&d_inode(dentry)->i_mutex); + err = vfs_unlink(d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } @@ -332,7 +332,7 @@ static int handle_remove(const char *nodename, struct device *dev) err = 
-ENOENT; } dput(dentry); - mutex_unlock(&parent.dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(parent.dentry)->i_mutex); path_put(&parent); if (deleted && strchr(nodename, '/')) diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 9a950022ff88..a6ee3d750c30 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -424,7 +424,7 @@ static int in_flight_summary_show(struct seq_file *m, void *pos) * So we have our own inline version of it above. :-( */ static inline int debugfs_positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } /* make sure at *open* time that the respective object won't go away. */ @@ -439,15 +439,15 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo * or has debugfs_remove() already been called? */ parent = file->f_path.dentry->d_parent; /* not sure if this can happen: */ - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) goto out; /* serialize with d_delete() */ - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); /* Make sure the object is still alive */ if (debugfs_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (!ret) { ret = single_open(file, show, data); if (ret) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b40af3203089..812523330a78 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3762,8 +3762,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) goto out_tag_set; } - /* We use the default size, but let's be explicit about it. */ - blk_queue_physical_block_size(q, SECTOR_SIZE); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ /* set io sizes to object size */ segment_size = rbd_obj_bytes(&rbd_dev->header); @@ -5301,8 +5301,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) if (mapping) { ret = rbd_dev_header_watch_sync(rbd_dev); - if (ret) + if (ret) { + if (ret == -ENOENT) + pr_info("image %s/%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->image_name); goto out_header_name; + } } ret = rbd_dev_header_info(rbd_dev); @@ -5319,8 +5324,14 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) ret = rbd_spec_fill_snap_id(rbd_dev); else ret = rbd_spec_fill_names(rbd_dev); - if (ret) + if (ret) { + if (ret == -ENOENT) + pr_info("snap %s/%s@%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->image_name, + rbd_dev->spec->snap_name); goto err_out_probe; + } if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { ret = rbd_dev_v2_parent_info(rbd_dev); @@ -5390,8 +5401,11 @@ static ssize_t do_rbd_add(struct bus_type *bus, /* pick the pool */ rc = rbd_add_get_pool_id(rbdc, spec->pool_name); - if (rc < 0) + if (rc < 0) { + if (rc == -ENOENT) + pr_info("pool %s does not exist\n", spec->pool_name); goto err_out_client; + } spec->pool_id = (u64)rc; /* The ceph file layout needs to fit pool id in 32 bits */ @@ -5673,7 +5687,7 @@ static int __init rbd_init(void) /* * The number of active work items is limited by the number of - * rbd devices, so leave @max_active at default. + * rbd devices * queue depth, so leave @max_active at default. 
*/ rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); if (!rbd_wq) { diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 266469691e58..0aa135ddbf80 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -22,6 +22,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/sched_clock.h> +#include <linux/acpi.h> #include <asm/arch_timer.h> #include <asm/virt.h> @@ -371,8 +372,12 @@ arch_timer_detect_rate(void __iomem *cntbase, struct device_node *np) if (arch_timer_rate) return; - /* Try to determine the frequency from the device tree or CNTFRQ */ - if (of_property_read_u32(np, "clock-frequency", &arch_timer_rate)) { + /* + * Try to determine the frequency from the device tree or CNTFRQ; + * if ACPI is enabled, get the frequency from CNTFRQ ONLY. + */ + if (!acpi_disabled || + of_property_read_u32(np, "clock-frequency", &arch_timer_rate)) { if (cntbase) arch_timer_rate = readl_relaxed(cntbase + CNTFRQ); else @@ -691,28 +696,8 @@ static void __init arch_timer_common_init(void) arch_timer_arch_init(); } -static void __init arch_timer_init(struct device_node *np) +static void __init arch_timer_init(void) { - int i; - - if (arch_timers_present & ARCH_CP15_TIMER) { - pr_warn("arch_timer: multiple nodes in dt, skipping\n"); - return; - } - - arch_timers_present |= ARCH_CP15_TIMER; - for (i = PHYS_SECURE_PPI; i < MAX_TIMER_PPI; i++) - arch_timer_ppi[i] = irq_of_parse_and_map(np, i); - arch_timer_detect_rate(NULL, np); - - /* - * If we cannot rely on firmware initializing the timer registers then - * we should use the physical timers instead. - */ - if (IS_ENABLED(CONFIG_ARM) && - of_property_read_bool(np, "arm,cpu-registers-not-fw-configured")) - arch_timer_use_virtual = false; - /* * If HYP mode is available, we know that the physical timer * has been configured to be accessible from PL1. Use it, so @@ -731,13 +716,39 @@ static void __init arch_timer_init(struct device_node *np) } } - arch_timer_c3stop = !of_property_read_bool(np, "always-on"); - arch_timer_register(); arch_timer_common_init(); } -CLOCKSOURCE_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_init); -CLOCKSOURCE_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_init); + +static void __init arch_timer_of_init(struct device_node *np) +{ + int i; + + if (arch_timers_present & ARCH_CP15_TIMER) { + pr_warn("arch_timer: multiple nodes in dt, skipping\n"); + return; + } + + arch_timers_present |= ARCH_CP15_TIMER; + for (i = PHYS_SECURE_PPI; i < MAX_TIMER_PPI; i++) + arch_timer_ppi[i] = irq_of_parse_and_map(np, i); + + arch_timer_detect_rate(NULL, np); + + arch_timer_c3stop = !of_property_read_bool(np, "always-on"); + + /* + * If we cannot rely on firmware initializing the timer registers then + * we should use the physical timers instead.
+ */ + if (IS_ENABLED(CONFIG_ARM) && + of_property_read_bool(np, "arm,cpu-registers-not-fw-configured")) + arch_timer_use_virtual = false; + + arch_timer_init(); +} +CLOCKSOURCE_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init); +CLOCKSOURCE_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init); static void __init arch_timer_mem_init(struct device_node *np) { @@ -804,3 +815,70 @@ static void __init arch_timer_mem_init(struct device_node *np) } CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", arch_timer_mem_init); + +#ifdef CONFIG_ACPI +static int __init map_generic_timer_interrupt(u32 interrupt, u32 flags) +{ + int trigger, polarity; + + if (!interrupt) + return 0; + + trigger = (flags & ACPI_GTDT_INTERRUPT_MODE) ? ACPI_EDGE_SENSITIVE + : ACPI_LEVEL_SENSITIVE; + + polarity = (flags & ACPI_GTDT_INTERRUPT_POLARITY) ? ACPI_ACTIVE_LOW + : ACPI_ACTIVE_HIGH; + + return acpi_register_gsi(NULL, interrupt, trigger, polarity); +} + +/* Initialize per-processor generic timer */ +static int __init arch_timer_acpi_init(struct acpi_table_header *table) +{ + struct acpi_table_gtdt *gtdt; + + if (arch_timers_present & ARCH_CP15_TIMER) { + pr_warn("arch_timer: already initialized, skipping\n"); + return -EINVAL; + } + + gtdt = container_of(table, struct acpi_table_gtdt, header); + + arch_timers_present |= ARCH_CP15_TIMER; + + arch_timer_ppi[PHYS_SECURE_PPI] = + map_generic_timer_interrupt(gtdt->secure_el1_interrupt, + gtdt->secure_el1_flags); + + arch_timer_ppi[PHYS_NONSECURE_PPI] = + map_generic_timer_interrupt(gtdt->non_secure_el1_interrupt, + gtdt->non_secure_el1_flags); + + arch_timer_ppi[VIRT_PPI] = + map_generic_timer_interrupt(gtdt->virtual_timer_interrupt, + gtdt->virtual_timer_flags); + + arch_timer_ppi[HYP_PPI] = + map_generic_timer_interrupt(gtdt->non_secure_el2_interrupt, + gtdt->non_secure_el2_flags); + + /* Get the frequency from CNTFRQ */ + arch_timer_detect_rate(NULL, NULL); + + /* Always-on capability */ + arch_timer_c3stop = !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON); + + arch_timer_init(); + return 0; +} + +/* Initialize all the generic timers presented in GTDT */ +void __init acpi_generic_timer_init(void) +{ + if (acpi_disabled) + return; + + acpi_table_parse(ACPI_SIG_GTDT, arch_timer_acpi_init); +} +#endif diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index c5b81beccc8e..6414661ac1c4 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -31,6 +31,7 @@ #include <asm/div64.h> #include <asm/msr.h> #include <asm/cpu_device_id.h> +#include <asm/cpufeature.h> #define BYT_RATIOS 0x66a #define BYT_VIDS 0x66b @@ -649,7 +650,7 @@ static struct cpu_defaults byt_params = { .pid_policy = { .sample_rate_ms = 10, .deadband = 0, - .setpoint = 97, + .setpoint = 60, .p_gain_pct = 14, .d_gain_pct = 0, .i_gain_pct = 4, @@ -1200,8 +1201,7 @@ static int __init intel_pstate_init(void) { int cpu, rc = 0; const struct x86_cpu_id *id; - struct cpu_defaults *cpu_info; - struct cpuinfo_x86 *c = &boot_cpu_data; + struct cpu_defaults *cpu_def; if (no_load) return -ENODEV; @@ -1217,10 +1217,10 @@ static int __init intel_pstate_init(void) if (intel_pstate_platform_pwr_mgmt_exists()) return -ENODEV; - cpu_info = (struct cpu_defaults *)id->driver_data; + cpu_def = (struct cpu_defaults *)id->driver_data; - copy_pid_params(&cpu_info->pid_policy); - copy_cpu_funcs(&cpu_info->funcs); + copy_pid_params(&cpu_def->pid_policy); + copy_cpu_funcs(&cpu_def->funcs); if (intel_pstate_msrs_not_valid()) return 
-ENODEV; @@ -1231,7 +1231,7 @@ static int __init intel_pstate_init(void) if (!all_cpu_data) return -ENOMEM; - if (cpu_has(c,X86_FEATURE_HWP) && !no_hwp) + if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) intel_pstate_hwp_enable(); if (!hwp_active && hwp_only) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 800bf41718e1..033c0c86f6ec 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -446,8 +446,9 @@ config CRYPTO_DEV_VMX source "drivers/crypto/vmx/Kconfig" config CRYPTO_DEV_IMGTEC_HASH - depends on MIPS || COMPILE_TEST tristate "Imagination Technologies hardware hash accelerator" + depends on MIPS || COMPILE_TEST + depends on HAS_DMA select CRYPTO_ALGAPI select CRYPTO_MD5 select CRYPTO_SHA1 diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 5be225c2ba98..c5a9138a6a8d 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -265,43 +265,40 @@ static inline int is_dma_buf_file(struct file *file) } /** - * dma_buf_export_named - Creates a new dma_buf, and associates an anon file + * dma_buf_export - Creates a new dma_buf, and associates an anon file * with this buffer, so it can be exported. * Also connect the allocator specific data and ops to the buffer. * Additionally, provide a name string for exporter; useful in debugging. * - * @priv: [in] Attach private data of allocator to this buffer - * @ops: [in] Attach allocator-defined dma buf ops to the new buffer. - * @size: [in] Size of the buffer - * @flags: [in] mode flags for the file. - * @exp_name: [in] name of the exporting module - useful for debugging. - * @resv: [in] reservation-object, NULL to allocate default one. + * @exp_info: [in] holds all the export related information provided + * by the exporter. see struct dma_buf_export_info + * for further details. * * Returns, on success, a newly created dma_buf object, which wraps the * supplied private data and operations for dma_buf_ops. On either missing * ops, or error in allocating struct dma_buf, will return negative error. 
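 *
 * A minimal usage sketch (illustrative only, not part of this patch;
 * "my_dmabuf_ops" and "my_buffer" are placeholder names):
 *
 *	struct dma_buf_export_info exp_info = {
 *		.exp_name = KBUILD_MODNAME,
 *		.ops = &my_dmabuf_ops,
 *		.size = size,
 *		.flags = O_RDWR,
 *		.priv = my_buffer,
 *	};
 *	struct dma_buf *dmabuf = dma_buf_export(&exp_info);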
* */ -struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops, - size_t size, int flags, const char *exp_name, - struct reservation_object *resv) +struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) { struct dma_buf *dmabuf; + struct reservation_object *resv = exp_info->resv; struct file *file; size_t alloc_size = sizeof(struct dma_buf); - if (!resv) + if (!exp_info->resv) alloc_size += sizeof(struct reservation_object); else /* prevent &dma_buf[1] == dma_buf->resv */ alloc_size += 1; - if (WARN_ON(!priv || !ops - || !ops->map_dma_buf - || !ops->unmap_dma_buf - || !ops->release - || !ops->kmap_atomic - || !ops->kmap - || !ops->mmap)) { + if (WARN_ON(!exp_info->priv + || !exp_info->ops + || !exp_info->ops->map_dma_buf + || !exp_info->ops->unmap_dma_buf + || !exp_info->ops->release + || !exp_info->ops->kmap_atomic + || !exp_info->ops->kmap + || !exp_info->ops->mmap)) { return ERR_PTR(-EINVAL); } @@ -309,10 +306,10 @@ struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops, if (dmabuf == NULL) return ERR_PTR(-ENOMEM); - dmabuf->priv = priv; - dmabuf->ops = ops; - dmabuf->size = size; - dmabuf->exp_name = exp_name; + dmabuf->priv = exp_info->priv; + dmabuf->ops = exp_info->ops; + dmabuf->size = exp_info->size; + dmabuf->exp_name = exp_info->exp_name; init_waitqueue_head(&dmabuf->poll); dmabuf->cb_excl.poll = dmabuf->cb_shared.poll = &dmabuf->poll; dmabuf->cb_excl.active = dmabuf->cb_shared.active = 0; @@ -323,7 +320,8 @@ struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops, } dmabuf->resv = resv; - file = anon_inode_getfile("dmabuf", &dma_buf_fops, dmabuf, flags); + file = anon_inode_getfile("dmabuf", &dma_buf_fops, dmabuf, + exp_info->flags); if (IS_ERR(file)) { kfree(dmabuf); return ERR_CAST(file); @@ -341,8 +339,7 @@ struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops, return dmabuf; } -EXPORT_SYMBOL_GPL(dma_buf_export_named); - +EXPORT_SYMBOL_GPL(dma_buf_export); /** * dma_buf_fd - returns a file descriptor for the given dma_buf diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 91eced044321..fd7ac13f2574 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -112,6 +112,17 @@ config FSL_DMA EloPlus is on mpc85xx and mpc86xx and Pxxx parts, and the Elo3 is on some Txxx and Bxxx parts. +config FSL_RAID + tristate "Freescale RAID engine Support" + depends on FSL_SOC && !ASYNC_TX_ENABLE_CHANNEL_SWITCH + select DMA_ENGINE + select DMA_ENGINE_RAID + ---help--- + Enable support for Freescale RAID Engine. RAID Engine is + available on some QorIQ SoCs (like P5020/P5040). It has + the capability to offload memcpy, xor and pq computation + for raid5/6. + source "drivers/dma/hsu/Kconfig" config MPC512X_DMA @@ -347,6 +358,16 @@ config DMA_JZ4740 select DMA_ENGINE select DMA_VIRTUAL_CHANNELS +config DMA_JZ4780 + tristate "JZ4780 DMA support" + depends on MACH_JZ4780 + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + help + This selects support for the DMA controller in Ingenic JZ4780 SoCs. + If you have a board based on such a SoC and wish to use DMA for + devices which can use the DMA controller, say Y or M here. + config K3_DMA tristate "Hisilicon K3 DMA support" depends on ARCH_HI3xxx @@ -414,6 +435,14 @@ config IMG_MDC_DMA help Enable support for the IMG multi-threaded DMA controller (MDC). 
+config XGENE_DMA + tristate "APM X-Gene DMA support" + select DMA_ENGINE + select DMA_ENGINE_RAID + select ASYNC_TX_ENABLE_CHANNEL_SWITCH + help + Enable support for the APM X-Gene SoC DMA engine. + config DMA_ENGINE bool diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 7e8301cb489d..69f77d5ba53b 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -41,9 +41,11 @@ obj-$(CONFIG_DMA_OMAP) += omap-dma.o obj-$(CONFIG_DMA_BCM2835) += bcm2835-dma.o obj-$(CONFIG_MMP_PDMA) += mmp_pdma.o obj-$(CONFIG_DMA_JZ4740) += dma-jz4740.o +obj-$(CONFIG_DMA_JZ4780) += dma-jz4780.o obj-$(CONFIG_TI_CPPI41) += cppi41.o obj-$(CONFIG_K3_DMA) += k3dma.o obj-$(CONFIG_MOXART_DMA) += moxart-dma.o +obj-$(CONFIG_FSL_RAID) += fsl_raid.o obj-$(CONFIG_FSL_EDMA) += fsl-edma.o obj-$(CONFIG_QCOM_BAM_DMA) += qcom_bam_dma.o obj-y += xilinx/ @@ -51,3 +53,4 @@ obj-$(CONFIG_INTEL_MIC_X100_DMA) += mic_x100_dma.o obj-$(CONFIG_NBPFAXI_DMA) += nbpfaxi.o obj-$(CONFIG_DMA_SUN6I) += sun6i-dma.o obj-$(CONFIG_IMG_MDC_DMA) += img-mdc-dma.o +obj-$(CONFIG_XGENE_DMA) += xgene-dma.o diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 83aa55d6fa5d..49d396ec06e5 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -15,10 +15,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is in this distribution in the file * called COPYING. * @@ -1195,11 +1191,6 @@ static void pl08x_free_txd_list(struct pl08x_driver_data *pl08x, /* * The DMA ENGINE API */ -static int pl08x_alloc_chan_resources(struct dma_chan *chan) -{ - return 0; -} - static void pl08x_free_chan_resources(struct dma_chan *chan) { /* Ensure all queued descriptors are freed */ @@ -2066,7 +2057,6 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) /* Initialize memcpy engine */ dma_cap_set(DMA_MEMCPY, pl08x->memcpy.cap_mask); pl08x->memcpy.dev = &adev->dev; - pl08x->memcpy.device_alloc_chan_resources = pl08x_alloc_chan_resources; pl08x->memcpy.device_free_chan_resources = pl08x_free_chan_resources; pl08x->memcpy.device_prep_dma_memcpy = pl08x_prep_dma_memcpy; pl08x->memcpy.device_prep_dma_interrupt = pl08x_prep_dma_interrupt; @@ -2085,7 +2075,6 @@ static int pl08x_probe(struct amba_device *adev, const struct amba_id *id) dma_cap_set(DMA_SLAVE, pl08x->slave.cap_mask); dma_cap_set(DMA_CYCLIC, pl08x->slave.cap_mask); pl08x->slave.dev = &adev->dev; - pl08x->slave.device_alloc_chan_resources = pl08x_alloc_chan_resources; pl08x->slave.device_free_chan_resources = pl08x_free_chan_resources; pl08x->slave.device_prep_dma_interrupt = pl08x_prep_dma_interrupt; pl08x->slave.device_tx_status = pl08x_dma_tx_status; diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 0b4fc6fb48ce..57b2141ddddc 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -65,6 +65,21 @@ static void atc_issue_pending(struct dma_chan *chan); /*----------------------------------------------------------------------*/ +static inline unsigned int atc_get_xfer_width(dma_addr_t src, dma_addr_t dst, + size_t len) +{ + unsigned int width; + + if (!((src | dst | len) & 3)) + width = 2; + else if (!((src | dst | len) & 1)) + width = 1; + else + width = 0; + + return width; +} + static struct at_desc *atc_first_active(struct 
at_dma_chan *atchan) { return list_first_entry(&atchan->active_list, @@ -659,16 +674,10 @@ atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, * We can be a lot more clever here, but this should take care * of the most common optimization. */ - if (!((src | dest | len) & 3)) { - ctrla = ATC_SRC_WIDTH_WORD | ATC_DST_WIDTH_WORD; - src_width = dst_width = 2; - } else if (!((src | dest | len) & 1)) { - ctrla = ATC_SRC_WIDTH_HALFWORD | ATC_DST_WIDTH_HALFWORD; - src_width = dst_width = 1; - } else { - ctrla = ATC_SRC_WIDTH_BYTE | ATC_DST_WIDTH_BYTE; - src_width = dst_width = 0; - } + src_width = dst_width = atc_get_xfer_width(src, dest, len); + + ctrla = ATC_SRC_WIDTH(src_width) | + ATC_DST_WIDTH(dst_width); for (offset = 0; offset < len; offset += xfer_count << src_width) { xfer_count = min_t(size_t, (len - offset) >> src_width, @@ -862,6 +871,144 @@ err: } /** + * atc_prep_dma_sg - prepare memory to memory scatter-gather operation + * @chan: the channel to prepare operation on + * @dst_sg: destination scatterlist + * @dst_nents: number of destination scatterlist entries + * @src_sg: source scatterlist + * @src_nents: number of source scatterlist entries + * @flags: tx descriptor status flags + */ +static struct dma_async_tx_descriptor * +atc_prep_dma_sg(struct dma_chan *chan, + struct scatterlist *dst_sg, unsigned int dst_nents, + struct scatterlist *src_sg, unsigned int src_nents, + unsigned long flags) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_desc *desc = NULL; + struct at_desc *first = NULL; + struct at_desc *prev = NULL; + unsigned int src_width; + unsigned int dst_width; + size_t xfer_count; + u32 ctrla; + u32 ctrlb; + size_t dst_len = 0, src_len = 0; + dma_addr_t dst = 0, src = 0; + size_t len = 0, total_len = 0; + + if (unlikely(dst_nents == 0 || src_nents == 0)) + return NULL; + + if (unlikely(dst_sg == NULL || src_sg == NULL)) + return NULL; + + ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN + | ATC_SRC_ADDR_MODE_INCR + | ATC_DST_ADDR_MODE_INCR + | ATC_FC_MEM2MEM; + + /* + * loop until there is either no more source or no more destination + * scatterlist entry + */ + while (true) { + + /* prepare the next transfer */ + if (dst_len == 0) { + + /* no more destination scatterlist entries */ + if (!dst_sg || !dst_nents) + break; + + dst = sg_dma_address(dst_sg); + dst_len = sg_dma_len(dst_sg); + + dst_sg = sg_next(dst_sg); + dst_nents--; + } + + if (src_len == 0) { + + /* no more source scatterlist entries */ + if (!src_sg || !src_nents) + break; + + src = sg_dma_address(src_sg); + src_len = sg_dma_len(src_sg); + + src_sg = sg_next(src_sg); + src_nents--; + } + + len = min_t(size_t, src_len, dst_len); + if (len == 0) + continue; + + /* take care of the alignment */ + src_width = dst_width = atc_get_xfer_width(src, dst, len); + + ctrla = ATC_SRC_WIDTH(src_width) | + ATC_DST_WIDTH(dst_width); + + /* + * The number of transfers to set up refers to the source width + * that depends on the alignment. + */ + xfer_count = len >> src_width; + if (xfer_count > ATC_BTSIZE_MAX) { + xfer_count = ATC_BTSIZE_MAX; + len = ATC_BTSIZE_MAX << src_width; + } + + /* create the transfer */ + desc = atc_desc_get(atchan); + if (!desc) + goto err_desc_get; + + desc->lli.saddr = src; + desc->lli.daddr = dst; + desc->lli.ctrla = ctrla | xfer_count; + desc->lli.ctrlb = ctrlb; + + desc->txd.cookie = 0; + desc->len = len; + + /* + * Although we only need the transfer width for the first and + * the last descriptor, it's easier to set it to all descriptors.
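+ *
+ * Illustrative example (not part of the original comment): src = 0x1004,
+ * dst = 0x2008 and len = 0x100 are all word-aligned, so
+ * atc_get_xfer_width() returns 2 (32-bit transfers); any odd address or
+ * length would force width 0 (byte transfers).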
+ */ + desc->tx_width = src_width; + + atc_desc_chain(&first, &prev, desc); + + /* update the lengths and addresses for the next loop cycle */ + dst_len -= len; + src_len -= len; + dst += len; + src += len; + + total_len += len; + } + + /* First descriptor of the chain embeds additional information */ + first->txd.cookie = -EBUSY; + first->total_len = total_len; + + /* set end-of-link to the last link descriptor of list */ + set_desc_eol(desc); + + first->txd.flags = flags; /* client is in control of this ack */ + + return &first->txd; + +err_desc_get: + atc_desc_put(atchan, first); + return NULL; +} + +/** * atc_dma_cyclic_check_values * Check for too big/unaligned periods and unaligned DMA buffer */ @@ -1461,8 +1608,10 @@ static int __init at_dma_probe(struct platform_device *pdev) /* setup platform data for each SoC */ dma_cap_set(DMA_MEMCPY, at91sam9rl_config.cap_mask); + dma_cap_set(DMA_SG, at91sam9rl_config.cap_mask); dma_cap_set(DMA_MEMCPY, at91sam9g45_config.cap_mask); dma_cap_set(DMA_SLAVE, at91sam9g45_config.cap_mask); + dma_cap_set(DMA_SG, at91sam9g45_config.cap_mask); /* get DMA parameters from controller type */ plat_dat = at_dma_get_driver_data(pdev); @@ -1582,11 +1731,15 @@ static int __init at_dma_probe(struct platform_device *pdev) atdma->dma_common.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST; } + if (dma_has_cap(DMA_SG, atdma->dma_common.cap_mask)) + atdma->dma_common.device_prep_dma_sg = atc_prep_dma_sg; + dma_writel(atdma, EN, AT_DMA_ENABLE); - dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s), %d channels\n", + dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s%s), %d channels\n", dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask) ? "cpy " : "", dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask) ? "slave " : "", + dma_has_cap(DMA_SG, atdma->dma_common.cap_mask) ? "sg-cpy " : "", plat_dat->nr_channels); dma_async_device_register(&atdma->dma_common); diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index d9891d3461f6..933e4b338459 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -1154,8 +1154,10 @@ static int at_xdmac_device_resume(struct dma_chan *chan) dev_dbg(chan2dev(chan), "%s\n", __func__); spin_lock_bh(&atchan->lock); - if (!at_xdmac_chan_is_paused(atchan)) + if (!at_xdmac_chan_is_paused(atchan)) { + spin_unlock_bh(&atchan->lock); return 0; + } at_xdmac_write(atxdmac, AT_XDMAC_GRWR, atchan->mask); clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status); diff --git a/drivers/dma/bestcomm/bestcomm.c b/drivers/dma/bestcomm/bestcomm.c index fa378d88f6c8..180fedb418cc 100644 --- a/drivers/dma/bestcomm/bestcomm.c +++ b/drivers/dma/bestcomm/bestcomm.c @@ -30,7 +30,7 @@ #define DRIVER_NAME "bestcomm-core" /* MPC5200 device tree match tables */ -static struct of_device_id mpc52xx_sram_ids[] = { +static const struct of_device_id mpc52xx_sram_ids[] = { { .compatible = "fsl,mpc5200-sram", }, { .compatible = "mpc5200-sram", }, {} @@ -481,7 +481,7 @@ static int mpc52xx_bcom_remove(struct platform_device *op) return 0; } -static struct of_device_id mpc52xx_bcom_of_match[] = { +static const struct of_device_id mpc52xx_bcom_of_match[] = { { .compatible = "fsl,mpc5200-bestcomm", }, { .compatible = "mpc5200-bestcomm", }, {}, diff --git a/drivers/dma/dma-jz4740.c b/drivers/dma/dma-jz4740.c index 84884418fd30..7638b24ce8d0 100644 --- a/drivers/dma/dma-jz4740.c +++ b/drivers/dma/dma-jz4740.c @@ -7,10 +7,6 @@ * Free Software Foundation; either version 2 of the License, or (at your * option) any later version.
* - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. - * */ #include <linux/dmaengine.h> @@ -343,7 +339,7 @@ static void jz4740_dma_chan_irq(struct jz4740_dmaengine_chan *chan) { spin_lock(&chan->vchan.lock); if (chan->desc) { - if (chan->desc && chan->desc->cyclic) { + if (chan->desc->cyclic) { vchan_cyclic_callback(&chan->desc->vdesc); } else { if (chan->next_sg == chan->desc->num_sgs) { @@ -496,11 +492,6 @@ static enum dma_status jz4740_dma_tx_status(struct dma_chan *c, return status; } -static int jz4740_dma_alloc_chan_resources(struct dma_chan *c) -{ - return 0; -} - static void jz4740_dma_free_chan_resources(struct dma_chan *c) { vchan_free_chan_resources(to_virt_chan(c)); @@ -543,7 +534,6 @@ static int jz4740_dma_probe(struct platform_device *pdev) dma_cap_set(DMA_SLAVE, dd->cap_mask); dma_cap_set(DMA_CYCLIC, dd->cap_mask); - dd->device_alloc_chan_resources = jz4740_dma_alloc_chan_resources; dd->device_free_chan_resources = jz4740_dma_free_chan_resources; dd->device_tx_status = jz4740_dma_tx_status; dd->device_issue_pending = jz4740_dma_issue_pending; diff --git a/drivers/dma/dma-jz4780.c b/drivers/dma/dma-jz4780.c new file mode 100644 index 000000000000..26d2f0e09ea3 --- /dev/null +++ b/drivers/dma/dma-jz4780.c @@ -0,0 +1,877 @@ +/* + * Ingenic JZ4780 DMA controller + * + * Copyright (c) 2015 Imagination Technologies + * Author: Alex Smith <alex@alex-smith.me.uk> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/clk.h> +#include <linux/dmapool.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_dma.h> +#include <linux/platform_device.h> +#include <linux/slab.h> + +#include "dmaengine.h" +#include "virt-dma.h" + +#define JZ_DMA_NR_CHANNELS 32 + +/* Global registers. */ +#define JZ_DMA_REG_DMAC 0x1000 +#define JZ_DMA_REG_DIRQP 0x1004 +#define JZ_DMA_REG_DDR 0x1008 +#define JZ_DMA_REG_DDRS 0x100c +#define JZ_DMA_REG_DMACP 0x101c +#define JZ_DMA_REG_DSIRQP 0x1020 +#define JZ_DMA_REG_DSIRQM 0x1024 +#define JZ_DMA_REG_DCIRQP 0x1028 +#define JZ_DMA_REG_DCIRQM 0x102c + +/* Per-channel registers. 
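+ * Each channel owns a 0x20-byte register window, so, as an
+ * illustrative example, channel 2's channel-status register lives at
+ * JZ_DMA_REG_DCS(2) = 2 * 0x20 + 0x10 = 0x50.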
*/ +#define JZ_DMA_REG_CHAN(n) (n * 0x20) +#define JZ_DMA_REG_DSA(n) (0x00 + JZ_DMA_REG_CHAN(n)) +#define JZ_DMA_REG_DTA(n) (0x04 + JZ_DMA_REG_CHAN(n)) +#define JZ_DMA_REG_DTC(n) (0x08 + JZ_DMA_REG_CHAN(n)) +#define JZ_DMA_REG_DRT(n) (0x0c + JZ_DMA_REG_CHAN(n)) +#define JZ_DMA_REG_DCS(n) (0x10 + JZ_DMA_REG_CHAN(n)) +#define JZ_DMA_REG_DCM(n) (0x14 + JZ_DMA_REG_CHAN(n)) +#define JZ_DMA_REG_DDA(n) (0x18 + JZ_DMA_REG_CHAN(n)) +#define JZ_DMA_REG_DSD(n) (0x1c + JZ_DMA_REG_CHAN(n)) + +#define JZ_DMA_DMAC_DMAE BIT(0) +#define JZ_DMA_DMAC_AR BIT(2) +#define JZ_DMA_DMAC_HLT BIT(3) +#define JZ_DMA_DMAC_FMSC BIT(31) + +#define JZ_DMA_DRT_AUTO 0x8 + +#define JZ_DMA_DCS_CTE BIT(0) +#define JZ_DMA_DCS_HLT BIT(2) +#define JZ_DMA_DCS_TT BIT(3) +#define JZ_DMA_DCS_AR BIT(4) +#define JZ_DMA_DCS_DES8 BIT(30) + +#define JZ_DMA_DCM_LINK BIT(0) +#define JZ_DMA_DCM_TIE BIT(1) +#define JZ_DMA_DCM_STDE BIT(2) +#define JZ_DMA_DCM_TSZ_SHIFT 8 +#define JZ_DMA_DCM_TSZ_MASK (0x7 << JZ_DMA_DCM_TSZ_SHIFT) +#define JZ_DMA_DCM_DP_SHIFT 12 +#define JZ_DMA_DCM_SP_SHIFT 14 +#define JZ_DMA_DCM_DAI BIT(22) +#define JZ_DMA_DCM_SAI BIT(23) + +#define JZ_DMA_SIZE_4_BYTE 0x0 +#define JZ_DMA_SIZE_1_BYTE 0x1 +#define JZ_DMA_SIZE_2_BYTE 0x2 +#define JZ_DMA_SIZE_16_BYTE 0x3 +#define JZ_DMA_SIZE_32_BYTE 0x4 +#define JZ_DMA_SIZE_64_BYTE 0x5 +#define JZ_DMA_SIZE_128_BYTE 0x6 + +#define JZ_DMA_WIDTH_32_BIT 0x0 +#define JZ_DMA_WIDTH_8_BIT 0x1 +#define JZ_DMA_WIDTH_16_BIT 0x2 + +#define JZ_DMA_BUSWIDTHS (BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | \ + BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | \ + BIT(DMA_SLAVE_BUSWIDTH_4_BYTES)) + +/** + * struct jz4780_dma_hwdesc - descriptor structure read by the DMA controller. + * @dcm: value for the DCM (channel command) register + * @dsa: source address + * @dta: target address + * @dtc: transfer count (number of blocks of the transfer size specified in DCM + * to transfer) in the low 24 bits, offset of the next descriptor from the + * descriptor base address in the upper 8 bits. + * @sd: target/source stride difference (in stride transfer mode). + * @drt: request type + */ +struct jz4780_dma_hwdesc { + uint32_t dcm; + uint32_t dsa; + uint32_t dta; + uint32_t dtc; + uint32_t sd; + uint32_t drt; + uint32_t reserved[2]; +}; + +/* Size of allocations for hardware descriptor blocks. 
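+ * Worked example (illustrative, assuming PAGE_SIZE == 4096): with the
+ * 32-byte jz4780_dma_hwdesc above, JZ_DMA_MAX_DESC comes out to
+ * 4096 / 32 = 128 descriptors per block.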
*/ +#define JZ_DMA_DESC_BLOCK_SIZE PAGE_SIZE +#define JZ_DMA_MAX_DESC \ + (JZ_DMA_DESC_BLOCK_SIZE / sizeof(struct jz4780_dma_hwdesc)) + +struct jz4780_dma_desc { + struct virt_dma_desc vdesc; + + struct jz4780_dma_hwdesc *desc; + dma_addr_t desc_phys; + unsigned int count; + enum dma_transaction_type type; + uint32_t status; +}; + +struct jz4780_dma_chan { + struct virt_dma_chan vchan; + unsigned int id; + struct dma_pool *desc_pool; + + uint32_t transfer_type; + uint32_t transfer_shift; + struct dma_slave_config config; + + struct jz4780_dma_desc *desc; + unsigned int curr_hwdesc; +}; + +struct jz4780_dma_dev { + struct dma_device dma_device; + void __iomem *base; + struct clk *clk; + int irq; + + uint32_t chan_reserved; + struct jz4780_dma_chan chan[JZ_DMA_NR_CHANNELS]; +}; + +struct jz4780_dma_data { + uint32_t transfer_type; + int channel; +}; + +static inline struct jz4780_dma_chan *to_jz4780_dma_chan(struct dma_chan *chan) +{ + return container_of(chan, struct jz4780_dma_chan, vchan.chan); +} + +static inline struct jz4780_dma_desc *to_jz4780_dma_desc( + struct virt_dma_desc *vdesc) +{ + return container_of(vdesc, struct jz4780_dma_desc, vdesc); +} + +static inline struct jz4780_dma_dev *jz4780_dma_chan_parent( + struct jz4780_dma_chan *jzchan) +{ + return container_of(jzchan->vchan.chan.device, struct jz4780_dma_dev, + dma_device); +} + +static inline uint32_t jz4780_dma_readl(struct jz4780_dma_dev *jzdma, + unsigned int reg) +{ + return readl(jzdma->base + reg); +} + +static inline void jz4780_dma_writel(struct jz4780_dma_dev *jzdma, + unsigned int reg, uint32_t val) +{ + writel(val, jzdma->base + reg); +} + +static struct jz4780_dma_desc *jz4780_dma_desc_alloc( + struct jz4780_dma_chan *jzchan, unsigned int count, + enum dma_transaction_type type) +{ + struct jz4780_dma_desc *desc; + + if (count > JZ_DMA_MAX_DESC) + return NULL; + + desc = kzalloc(sizeof(*desc), GFP_NOWAIT); + if (!desc) + return NULL; + + desc->desc = dma_pool_alloc(jzchan->desc_pool, GFP_NOWAIT, + &desc->desc_phys); + if (!desc->desc) { + kfree(desc); + return NULL; + } + + desc->count = count; + desc->type = type; + return desc; +} + +static void jz4780_dma_desc_free(struct virt_dma_desc *vdesc) +{ + struct jz4780_dma_desc *desc = to_jz4780_dma_desc(vdesc); + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(vdesc->tx.chan); + + dma_pool_free(jzchan->desc_pool, desc->desc, desc->desc_phys); + kfree(desc); +} + +static int jz4780_dma_transfer_size(unsigned long val, int *ord) +{ + *ord = ffs(val) - 1; + + switch (*ord) { + case 0: + return JZ_DMA_SIZE_1_BYTE; + case 1: + return JZ_DMA_SIZE_2_BYTE; + case 2: + return JZ_DMA_SIZE_4_BYTE; + case 4: + return JZ_DMA_SIZE_16_BYTE; + case 5: + return JZ_DMA_SIZE_32_BYTE; + case 6: + return JZ_DMA_SIZE_64_BYTE; + case 7: + return JZ_DMA_SIZE_128_BYTE; + default: + return -EINVAL; + } +} + +static int jz4780_dma_setup_hwdesc(struct jz4780_dma_chan *jzchan, + struct jz4780_dma_hwdesc *desc, dma_addr_t addr, size_t len, + enum dma_transfer_direction direction) +{ + struct dma_slave_config *config = &jzchan->config; + uint32_t width, maxburst; + int tsz, ord; + + if (direction == DMA_MEM_TO_DEV) { + desc->dcm = JZ_DMA_DCM_SAI; + desc->dsa = addr; + desc->dta = config->dst_addr; + desc->drt = jzchan->transfer_type; + + width = config->dst_addr_width; + maxburst = config->dst_maxburst; + } else { + desc->dcm = JZ_DMA_DCM_DAI; + desc->dsa = config->src_addr; + desc->dta = addr; + desc->drt = jzchan->transfer_type; + + width = config->src_addr_width; +
maxburst = config->src_maxburst; + } + + /* + * This calculates the maximum transfer size that can be used with the + * given address, length, width and maximum burst size. The address + * must be aligned to the transfer size, the total length must be + * divisible by the transfer size, and we must not use more than the + * maximum burst specified by the user. + */ + tsz = jz4780_dma_transfer_size(addr | len | (width * maxburst), &ord); + if (tsz < 0) + return tsz; + + jzchan->transfer_shift = ord; + + switch (width) { + case DMA_SLAVE_BUSWIDTH_1_BYTE: + case DMA_SLAVE_BUSWIDTH_2_BYTES: + break; + case DMA_SLAVE_BUSWIDTH_4_BYTES: + width = JZ_DMA_WIDTH_32_BIT; + break; + default: + return -EINVAL; + } + + desc->dcm |= tsz << JZ_DMA_DCM_TSZ_SHIFT; + desc->dcm |= width << JZ_DMA_DCM_SP_SHIFT; + desc->dcm |= width << JZ_DMA_DCM_DP_SHIFT; + + desc->dtc = len >> ord; + + return 0; +} + +static struct dma_async_tx_descriptor *jz4780_dma_prep_slave_sg( + struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, + enum dma_transfer_direction direction, unsigned long flags) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + struct jz4780_dma_desc *desc; + unsigned int i; + int err; + + desc = jz4780_dma_desc_alloc(jzchan, sg_len, DMA_SLAVE); + if (!desc) + return NULL; + + for (i = 0; i < sg_len; i++) { + err = jz4780_dma_setup_hwdesc(jzchan, &desc->desc[i], + sg_dma_address(&sgl[i]), + sg_dma_len(&sgl[i]), + direction); + if (err < 0) + return ERR_PTR(err); + + desc->desc[i].dcm |= JZ_DMA_DCM_TIE; + + if (i != (sg_len - 1)) { + /* Automatically proceed to the next descriptor. */ + desc->desc[i].dcm |= JZ_DMA_DCM_LINK; + + /* + * The upper 8 bits of the DTC field in the descriptor + * must be set to (offset from descriptor base of next + * descriptor >> 4). + */ + desc->desc[i].dtc |= + (((i + 1) * sizeof(*desc->desc)) >> 4) << 24; + } + } + + return vchan_tx_prep(&jzchan->vchan, &desc->vdesc, flags); +} + +static struct dma_async_tx_descriptor *jz4780_dma_prep_dma_cyclic( + struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, + size_t period_len, enum dma_transfer_direction direction, + unsigned long flags) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + struct jz4780_dma_desc *desc; + unsigned int periods, i; + int err; + + if (buf_len % period_len) + return NULL; + + periods = buf_len / period_len; + + desc = jz4780_dma_desc_alloc(jzchan, periods, DMA_CYCLIC); + if (!desc) + return NULL; + + for (i = 0; i < periods; i++) { + err = jz4780_dma_setup_hwdesc(jzchan, &desc->desc[i], buf_addr, + period_len, direction); + if (err < 0) + return ERR_PTR(err); + + buf_addr += period_len; + + /* + * Set the link bit to indicate that the controller should + * automatically proceed to the next descriptor. In + * jz4780_dma_begin(), this will be cleared if we need to issue + * an interrupt after each period. + */ + desc->desc[i].dcm |= JZ_DMA_DCM_TIE | JZ_DMA_DCM_LINK; + + /* + * The upper 8 bits of the DTC field in the descriptor must be + * set to (offset from descriptor base of next descriptor >> 4). + * If this is the last descriptor, link it back to the first, + * i.e. leave offset set to 0, otherwise point to the next one.
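+ *
+ * Worked example (illustrative): hwdescs are 32 bytes apart in the
+ * block, so descriptor i links to descriptor i + 1 by storing
+ * ((i + 1) * 32) >> 4 = 2 * (i + 1) in the top 8 bits of DTC, as
+ * computed below.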
+ */ + if (i != (periods - 1)) { + desc->desc[i].dtc |= + (((i + 1) * sizeof(*desc->desc)) >> 4) << 24; + } + } + + return vchan_tx_prep(&jzchan->vchan, &desc->vdesc, flags); +} + +static struct dma_async_tx_descriptor *jz4780_dma_prep_dma_memcpy( + struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, + size_t len, unsigned long flags) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + struct jz4780_dma_desc *desc; + int tsz; + int ord; + + desc = jz4780_dma_desc_alloc(jzchan, 1, DMA_MEMCPY); + if (!desc) + return NULL; + + tsz = jz4780_dma_transfer_size(dest | src | len, &ord); + if (tsz < 0) + return ERR_PTR(tsz); + + desc->desc[0].dsa = src; + desc->desc[0].dta = dest; + desc->desc[0].drt = JZ_DMA_DRT_AUTO; + desc->desc[0].dcm = JZ_DMA_DCM_TIE | JZ_DMA_DCM_SAI | JZ_DMA_DCM_DAI | + tsz << JZ_DMA_DCM_TSZ_SHIFT | + JZ_DMA_WIDTH_32_BIT << JZ_DMA_DCM_SP_SHIFT | + JZ_DMA_WIDTH_32_BIT << JZ_DMA_DCM_DP_SHIFT; + desc->desc[0].dtc = len >> ord; + + return vchan_tx_prep(&jzchan->vchan, &desc->vdesc, flags); +} + +static void jz4780_dma_begin(struct jz4780_dma_chan *jzchan) +{ + struct jz4780_dma_dev *jzdma = jz4780_dma_chan_parent(jzchan); + struct virt_dma_desc *vdesc; + unsigned int i; + dma_addr_t desc_phys; + + if (!jzchan->desc) { + vdesc = vchan_next_desc(&jzchan->vchan); + if (!vdesc) + return; + + list_del(&vdesc->node); + + jzchan->desc = to_jz4780_dma_desc(vdesc); + jzchan->curr_hwdesc = 0; + + if (jzchan->desc->type == DMA_CYCLIC && vdesc->tx.callback) { + /* + * The DMA controller doesn't support triggering an + * interrupt after processing each descriptor, only + * after processing an entire terminated list of + * descriptors. For a cyclic DMA setup the list of + * descriptors is not terminated so we can never get an + * interrupt. + * + * If the user requested a callback for a cyclic DMA + * setup then we work around this hardware limitation + * here by degrading to a set of unlinked descriptors + * which we will submit in sequence in response to the + * completion of processing the previous descriptor. + */ + for (i = 0; i < jzchan->desc->count; i++) + jzchan->desc->desc[i].dcm &= ~JZ_DMA_DCM_LINK; + } + } else { + /* + * There is an existing transfer, therefore this must be one + * for which we unlinked the descriptors above. Advance to the + * next one in the list. + */ + jzchan->curr_hwdesc = + (jzchan->curr_hwdesc + 1) % jzchan->desc->count; + } + + /* Use 8-word descriptors. */ + jz4780_dma_writel(jzdma, JZ_DMA_REG_DCS(jzchan->id), JZ_DMA_DCS_DES8); + + /* Write descriptor address and initiate descriptor fetch. */ + desc_phys = jzchan->desc->desc_phys + + (jzchan->curr_hwdesc * sizeof(*jzchan->desc->desc)); + jz4780_dma_writel(jzdma, JZ_DMA_REG_DDA(jzchan->id), desc_phys); + jz4780_dma_writel(jzdma, JZ_DMA_REG_DDRS, BIT(jzchan->id)); + + /* Enable the channel.
*/ + jz4780_dma_writel(jzdma, JZ_DMA_REG_DCS(jzchan->id), + JZ_DMA_DCS_DES8 | JZ_DMA_DCS_CTE); +} + +static void jz4780_dma_issue_pending(struct dma_chan *chan) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + unsigned long flags; + + spin_lock_irqsave(&jzchan->vchan.lock, flags); + + if (vchan_issue_pending(&jzchan->vchan) && !jzchan->desc) + jz4780_dma_begin(jzchan); + + spin_unlock_irqrestore(&jzchan->vchan.lock, flags); +} + +static int jz4780_dma_terminate_all(struct dma_chan *chan) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + struct jz4780_dma_dev *jzdma = jz4780_dma_chan_parent(jzchan); + unsigned long flags; + LIST_HEAD(head); + + spin_lock_irqsave(&jzchan->vchan.lock, flags); + + /* Clear the DMA status and stop the transfer. */ + jz4780_dma_writel(jzdma, JZ_DMA_REG_DCS(jzchan->id), 0); + if (jzchan->desc) { + jz4780_dma_desc_free(&jzchan->desc->vdesc); + jzchan->desc = NULL; + } + + vchan_get_all_descriptors(&jzchan->vchan, &head); + + spin_unlock_irqrestore(&jzchan->vchan.lock, flags); + + vchan_dma_desc_free_list(&jzchan->vchan, &head); + return 0; +} + +static int jz4780_dma_slave_config(struct dma_chan *chan, + const struct dma_slave_config *config) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + + if ((config->src_addr_width == DMA_SLAVE_BUSWIDTH_8_BYTES) + || (config->dst_addr_width == DMA_SLAVE_BUSWIDTH_8_BYTES)) + return -EINVAL; + + /* Copy the rest of the slave configuration; it is used later. */ + memcpy(&jzchan->config, config, sizeof(jzchan->config)); + + return 0; +} + +static size_t jz4780_dma_desc_residue(struct jz4780_dma_chan *jzchan, + struct jz4780_dma_desc *desc, unsigned int next_sg) +{ + struct jz4780_dma_dev *jzdma = jz4780_dma_chan_parent(jzchan); + unsigned int residue, count; + unsigned int i; + + residue = 0; + + for (i = next_sg; i < desc->count; i++) + residue += desc->desc[i].dtc << jzchan->transfer_shift; + + if (next_sg != 0) { + count = jz4780_dma_readl(jzdma, + JZ_DMA_REG_DTC(jzchan->id)); + residue += count << jzchan->transfer_shift; + } + + return residue; +} + +static enum dma_status jz4780_dma_tx_status(struct dma_chan *chan, + dma_cookie_t cookie, struct dma_tx_state *txstate) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + struct virt_dma_desc *vdesc; + enum dma_status status; + unsigned long flags; + + status = dma_cookie_status(chan, cookie, txstate); + if ((status == DMA_COMPLETE) || (txstate == NULL)) + return status; + + spin_lock_irqsave(&jzchan->vchan.lock, flags); + + vdesc = vchan_find_desc(&jzchan->vchan, cookie); + if (vdesc) { + /* On the issued list, so hasn't been processed yet */ + txstate->residue = jz4780_dma_desc_residue(jzchan, + to_jz4780_dma_desc(vdesc), 0); + } else if (jzchan->desc && cookie == jzchan->desc->vdesc.tx.cookie) { + txstate->residue = jz4780_dma_desc_residue(jzchan, jzchan->desc, + (jzchan->curr_hwdesc + 1) % jzchan->desc->count); + } else + txstate->residue = 0; + + if (vdesc && jzchan->desc && vdesc == &jzchan->desc->vdesc + && jzchan->desc->status & (JZ_DMA_DCS_AR | JZ_DMA_DCS_HLT)) + status = DMA_ERROR; + + spin_unlock_irqrestore(&jzchan->vchan.lock, flags); + return status; +} + +static void jz4780_dma_chan_irq(struct jz4780_dma_dev *jzdma, + struct jz4780_dma_chan *jzchan) +{ + uint32_t dcs; + + spin_lock(&jzchan->vchan.lock); + + dcs = jz4780_dma_readl(jzdma, JZ_DMA_REG_DCS(jzchan->id)); + jz4780_dma_writel(jzdma, JZ_DMA_REG_DCS(jzchan->id), 0); + + if (dcs & JZ_DMA_DCS_AR) { + dev_warn(&jzchan->vchan.chan.dev->device, + "address error (DCS=0x%x)\n", dcs); + } + + if (dcs & JZ_DMA_DCS_HLT) { + dev_warn(&jzchan->vchan.chan.dev->device, +
"channel halt (DCS=0x%x)\n", dcs); + } + + if (jzchan->desc) { + jzchan->desc->status = dcs; + + if ((dcs & (JZ_DMA_DCS_AR | JZ_DMA_DCS_HLT)) == 0) { + if (jzchan->desc->type == DMA_CYCLIC) { + vchan_cyclic_callback(&jzchan->desc->vdesc); + } else { + vchan_cookie_complete(&jzchan->desc->vdesc); + jzchan->desc = NULL; + } + + jz4780_dma_begin(jzchan); + } + } else { + dev_err(&jzchan->vchan.chan.dev->device, + "channel IRQ with no active transfer\n"); + } + + spin_unlock(&jzchan->vchan.lock); +} + +static irqreturn_t jz4780_dma_irq_handler(int irq, void *data) +{ + struct jz4780_dma_dev *jzdma = data; + uint32_t pending, dmac; + int i; + + pending = jz4780_dma_readl(jzdma, JZ_DMA_REG_DIRQP); + + for (i = 0; i < JZ_DMA_NR_CHANNELS; i++) { + if (!(pending & (1<<i))) + continue; + + jz4780_dma_chan_irq(jzdma, &jzdma->chan[i]); + } + + /* Clear halt and address error status of all channels. */ + dmac = jz4780_dma_readl(jzdma, JZ_DMA_REG_DMAC); + dmac &= ~(JZ_DMA_DMAC_HLT | JZ_DMA_DMAC_AR); + jz4780_dma_writel(jzdma, JZ_DMA_REG_DMAC, dmac); + + /* Clear interrupt pending status. */ + jz4780_dma_writel(jzdma, JZ_DMA_REG_DIRQP, 0); + + return IRQ_HANDLED; +} + +static int jz4780_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + + jzchan->desc_pool = dma_pool_create(dev_name(&chan->dev->device), + chan->device->dev, + JZ_DMA_DESC_BLOCK_SIZE, + PAGE_SIZE, 0); + if (!jzchan->desc_pool) { + dev_err(&chan->dev->device, + "failed to allocate descriptor pool\n"); + return -ENOMEM; + } + + return 0; +} + +static void jz4780_dma_free_chan_resources(struct dma_chan *chan) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + + vchan_free_chan_resources(&jzchan->vchan); + dma_pool_destroy(jzchan->desc_pool); + jzchan->desc_pool = NULL; +} + +static bool jz4780_dma_filter_fn(struct dma_chan *chan, void *param) +{ + struct jz4780_dma_chan *jzchan = to_jz4780_dma_chan(chan); + struct jz4780_dma_dev *jzdma = jz4780_dma_chan_parent(jzchan); + struct jz4780_dma_data *data = param; + + if (data->channel > -1) { + if (data->channel != jzchan->id) + return false; + } else if (jzdma->chan_reserved & BIT(jzchan->id)) { + return false; + } + + jzchan->transfer_type = data->transfer_type; + + return true; +} + +static struct dma_chan *jz4780_of_dma_xlate(struct of_phandle_args *dma_spec, + struct of_dma *ofdma) +{ + struct jz4780_dma_dev *jzdma = ofdma->of_dma_data; + dma_cap_mask_t mask = jzdma->dma_device.cap_mask; + struct jz4780_dma_data data; + + if (dma_spec->args_count != 2) + return NULL; + + data.transfer_type = dma_spec->args[0]; + data.channel = dma_spec->args[1]; + + if (data.channel > -1) { + if (data.channel >= JZ_DMA_NR_CHANNELS) { + dev_err(jzdma->dma_device.dev, + "device requested non-existent channel %u\n", + data.channel); + return NULL; + } + + /* Can only select a channel marked as reserved. 
*/ + if (!(jzdma->chan_reserved & BIT(data.channel))) { + dev_err(jzdma->dma_device.dev, + "device requested unreserved channel %u\n", + data.channel); + return NULL; + } + } + + return dma_request_channel(mask, jz4780_dma_filter_fn, &data); +} + +static int jz4780_dma_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct jz4780_dma_dev *jzdma; + struct jz4780_dma_chan *jzchan; + struct dma_device *dd; + struct resource *res; + int i, ret; + + jzdma = devm_kzalloc(dev, sizeof(*jzdma), GFP_KERNEL); + if (!jzdma) + return -ENOMEM; + + platform_set_drvdata(pdev, jzdma); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(dev, "failed to get I/O memory\n"); + return -EINVAL; + } + + jzdma->base = devm_ioremap_resource(dev, res); + if (IS_ERR(jzdma->base)) + return PTR_ERR(jzdma->base); + + jzdma->irq = platform_get_irq(pdev, 0); + if (jzdma->irq < 0) { + dev_err(dev, "failed to get IRQ: %d\n", jzdma->irq); + return jzdma->irq; + } + + ret = devm_request_irq(dev, jzdma->irq, jz4780_dma_irq_handler, 0, + dev_name(dev), jzdma); + if (ret) { + dev_err(dev, "failed to request IRQ %u!\n", jzdma->irq); + return ret; + } + + jzdma->clk = devm_clk_get(dev, NULL); + if (IS_ERR(jzdma->clk)) { + dev_err(dev, "failed to get clock\n"); + return PTR_ERR(jzdma->clk); + } + + clk_prepare_enable(jzdma->clk); + + /* Property is optional; if it doesn't exist, the value will remain 0. */ + of_property_read_u32_index(dev->of_node, "ingenic,reserved-channels", + 0, &jzdma->chan_reserved); + + dd = &jzdma->dma_device; + + dma_cap_set(DMA_MEMCPY, dd->cap_mask); + dma_cap_set(DMA_SLAVE, dd->cap_mask); + dma_cap_set(DMA_CYCLIC, dd->cap_mask); + + dd->dev = dev; + dd->copy_align = 2; /* 2^2 = 4 byte alignment */ + dd->device_alloc_chan_resources = jz4780_dma_alloc_chan_resources; + dd->device_free_chan_resources = jz4780_dma_free_chan_resources; + dd->device_prep_slave_sg = jz4780_dma_prep_slave_sg; + dd->device_prep_dma_cyclic = jz4780_dma_prep_dma_cyclic; + dd->device_prep_dma_memcpy = jz4780_dma_prep_dma_memcpy; + dd->device_config = jz4780_dma_slave_config; + dd->device_terminate_all = jz4780_dma_terminate_all; + dd->device_tx_status = jz4780_dma_tx_status; + dd->device_issue_pending = jz4780_dma_issue_pending; + dd->src_addr_widths = JZ_DMA_BUSWIDTHS; + dd->dst_addr_widths = JZ_DMA_BUSWIDTHS; + dd->directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + dd->residue_granularity = DMA_RESIDUE_GRANULARITY_BURST; + + /* + * Enable DMA controller, mark all channels as not programmable. + * Also set the FMSC bit - it increases MSC performance, so it makes + * little sense not to enable it. + */ + jz4780_dma_writel(jzdma, JZ_DMA_REG_DMAC, + JZ_DMA_DMAC_DMAE | JZ_DMA_DMAC_FMSC); + jz4780_dma_writel(jzdma, JZ_DMA_REG_DMACP, 0); + + INIT_LIST_HEAD(&dd->channels); + + for (i = 0; i < JZ_DMA_NR_CHANNELS; i++) { + jzchan = &jzdma->chan[i]; + jzchan->id = i; + + vchan_init(&jzchan->vchan, dd); + jzchan->vchan.desc_free = jz4780_dma_desc_free; + } + + ret = dma_async_device_register(dd); + if (ret) { + dev_err(dev, "failed to register device\n"); + goto err_disable_clk; + } + + /* Register with OF DMA helpers.
*/ + ret = of_dma_controller_register(dev->of_node, jz4780_of_dma_xlate, + jzdma); + if (ret) { + dev_err(dev, "failed to register OF DMA controller\n"); + goto err_unregister_dev; + } + + dev_info(dev, "JZ4780 DMA controller initialised\n"); + return 0; + +err_unregister_dev: + dma_async_device_unregister(dd); + +err_disable_clk: + clk_disable_unprepare(jzdma->clk); + return ret; +} + +static int jz4780_dma_remove(struct platform_device *pdev) +{ + struct jz4780_dma_dev *jzdma = platform_get_drvdata(pdev); + + of_dma_controller_free(pdev->dev.of_node); + devm_free_irq(&pdev->dev, jzdma->irq, jzdma); + dma_async_device_unregister(&jzdma->dma_device); + return 0; +} + +static const struct of_device_id jz4780_dma_dt_match[] = { + { .compatible = "ingenic,jz4780-dma", .data = NULL }, + {}, +}; +MODULE_DEVICE_TABLE(of, jz4780_dma_dt_match); + +static struct platform_driver jz4780_dma_driver = { + .probe = jz4780_dma_probe, + .remove = jz4780_dma_remove, + .driver = { + .name = "jz4780-dma", + .of_match_table = of_match_ptr(jz4780_dma_dt_match), + }, +}; + +static int __init jz4780_dma_init(void) +{ + return platform_driver_register(&jz4780_dma_driver); +} +subsys_initcall(jz4780_dma_init); + +static void __exit jz4780_dma_exit(void) +{ + platform_driver_unregister(&jz4780_dma_driver); +} +module_exit(jz4780_dma_exit); + +MODULE_AUTHOR("Alex Smith <alex@alex-smith.me.uk>"); +MODULE_DESCRIPTION("Ingenic JZ4780 DMA controller driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index ac336a961dea..0e035a8cf401 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. */ @@ -355,20 +351,6 @@ struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) } EXPORT_SYMBOL(dma_find_channel); -/* - * net_dma_find_channel - find a channel for net_dma - * net_dma has alignment requirements - */ -struct dma_chan *net_dma_find_channel(void) -{ - struct dma_chan *chan = dma_find_channel(DMA_MEMCPY); - if (chan && !is_dma_copy_aligned(chan->device, 1, 1, 1)) - return NULL; - - return chan; -} -EXPORT_SYMBOL(net_dma_find_channel); - /** * dma_issue_pending_all - flush all pending operations across all channels */ diff --git a/drivers/dma/dw/Kconfig b/drivers/dma/dw/Kconfig index dcfe964cc8dc..36e02f0f645e 100644 --- a/drivers/dma/dw/Kconfig +++ b/drivers/dma/dw/Kconfig @@ -3,7 +3,7 @@ # config DW_DMAC_CORE - tristate "Synopsys DesignWare AHB DMA support" + tristate select DMA_ENGINE config DW_DMAC diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index a8ad05291b27..1022c2e1a2b0 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -230,7 +230,8 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first) /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { dev_err(chan2dev(&dwc->chan), - "BUG: Attempted to start non-idle channel\n"); + "%s: BUG: Attempted to start non-idle channel\n", + __func__); dwc_dump_chan_regs(dwc); /* The tasklet will hopefully advance the queue... 
*/ @@ -814,11 +815,8 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, slave_sg_todev_fill_desc: desc = dwc_desc_get(dwc); - if (!desc) { - dev_err(chan2dev(chan), - "not enough descriptors available\n"); + if (!desc) goto err_desc_get; - } desc->lli.sar = mem; desc->lli.dar = reg; @@ -874,11 +872,8 @@ slave_sg_todev_fill_desc: slave_sg_fromdev_fill_desc: desc = dwc_desc_get(dwc); - if (!desc) { - dev_err(chan2dev(chan), - "not enough descriptors available\n"); + if (!desc) goto err_desc_get; - } desc->lli.sar = reg; desc->lli.dar = mem; @@ -922,6 +917,8 @@ slave_sg_fromdev_fill_desc: return &first->txd; err_desc_get: + dev_err(chan2dev(chan), + "not enough descriptors available. Direction %d\n", direction); dwc_desc_put(dwc, first); return NULL; } @@ -1261,7 +1258,8 @@ int dw_dma_cyclic_start(struct dma_chan *chan) /* Assert channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { dev_err(chan2dev(&dwc->chan), - "BUG: Attempted to start non-idle channel\n"); + "%s: BUG: Attempted to start non-idle channel\n", + __func__); dwc_dump_chan_regs(dwc); spin_unlock_irqrestore(&dwc->lock, flags); return -EBUSY; diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c index 53dbd3b3384c..bf09db7ca9ee 100644 --- a/drivers/dma/edma.c +++ b/drivers/dma/edma.c @@ -812,7 +812,7 @@ static int edma_alloc_chan_resources(struct dma_chan *chan) LIST_HEAD(descs); a_ch_num = edma_alloc_channel(echan->ch_num, edma_callback, - chan, EVENTQ_DEFAULT); + echan, EVENTQ_DEFAULT); if (a_ch_num < 0) { ret = -ENODEV; diff --git a/drivers/dma/fsl_raid.c b/drivers/dma/fsl_raid.c new file mode 100644 index 000000000000..4d9470f16552 --- /dev/null +++ b/drivers/dma/fsl_raid.c @@ -0,0 +1,904 @@ +/* + * drivers/dma/fsl_raid.c + * + * Freescale RAID Engine device driver + * + * Author: + * Harninder Rai <harninder.rai@freescale.com> + * Naveen Burmi <naveenburmi@freescale.com> + * + * Rewrite: + * Xuelin Shi <xuelin.shi@freescale.com> + * + * Copyright (c) 2010-2014 Freescale Semiconductor, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Theory of operation: + * + * General capabilities: + * The RAID Engine (RE) block is capable of offloading XOR, memcpy and P/Q + * calculations required in RAID5 and RAID6 operations. The RE driver + * registers with Linux's ASYNC layer as a DMA driver. The RE hardware + * maintains strict ordering of the requests through chained + * command queueing. + * + * Data flow: + * The software RAID layer of Linux (the MD layer) maintains RAID partitions, + * strips, stripes etc. It sends requests to the underlying ASYNC layer, + * which further passes them to the RE driver. The ASYNC layer decides which + * request goes to which job ring of the RE hardware. For every request + * processed by the RAID Engine, the driver gets an interrupt unless + * coalescing is set. The per job ring interrupt handler checks the status + * register for errors, clears the interrupt and leaves the post-interrupt + * processing to the IRQ thread. + */ +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/of_irq.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/dma-mapping.h> +#include <linux/dmapool.h> +#include <linux/dmaengine.h> +#include <linux/io.h> +#include <linux/spinlock.h> +#include <linux/slab.h> + +#include "dmaengine.h" +#include "fsl_raid.h" + +#define FSL_RE_MAX_XOR_SRCS 16 +#define FSL_RE_MAX_PQ_SRCS 16 +#define FSL_RE_MIN_DESCS 256 +#define FSL_RE_MAX_DESCS (4 * FSL_RE_MIN_DESCS) +#define FSL_RE_FRAME_FORMAT 0x1 +#define FSL_RE_MAX_DATA_LEN (1024*1024) + +#define to_fsl_re_dma_desc(tx) container_of(tx, struct fsl_re_desc, async_tx) + +/* Add descriptors into per chan software queue - submit_q */ +static dma_cookie_t fsl_re_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct fsl_re_desc *desc; + struct fsl_re_chan *re_chan; + dma_cookie_t cookie; + unsigned long flags; + + desc = to_fsl_re_dma_desc(tx); + re_chan = container_of(tx->chan, struct fsl_re_chan, chan); + + spin_lock_irqsave(&re_chan->desc_lock, flags); + cookie = dma_cookie_assign(tx); + list_add_tail(&desc->node, &re_chan->submit_q); + spin_unlock_irqrestore(&re_chan->desc_lock, flags); + + return cookie; +} + +/* Copy descriptor from per chan software queue into hardware job ring */ +static void fsl_re_issue_pending(struct dma_chan *chan) +{ + struct fsl_re_chan *re_chan; + int avail; + struct fsl_re_desc *desc, *_desc; + unsigned long flags; + + re_chan = container_of(chan, struct fsl_re_chan, chan); + + spin_lock_irqsave(&re_chan->desc_lock, flags); + avail = FSL_RE_SLOT_AVAIL( + in_be32(&re_chan->jrregs->inbring_slot_avail)); + + list_for_each_entry_safe(desc, _desc, &re_chan->submit_q, node) { + if (!avail) + break; + + list_move_tail(&desc->node, &re_chan->active_q); + + memcpy(&re_chan->inb_ring_virt_addr[re_chan->inb_count], + &desc->hwdesc, sizeof(struct fsl_re_hw_desc)); + + re_chan->inb_count = (re_chan->inb_count + 1) & + FSL_RE_RING_SIZE_MASK; + out_be32(&re_chan->jrregs->inbring_add_job, FSL_RE_ADD_JOB(1)); + avail--; + } + spin_unlock_irqrestore(&re_chan->desc_lock, flags); +} + +static
void fsl_re_desc_done(struct fsl_re_desc *desc) +{ + dma_async_tx_callback callback; + void *callback_param; + + dma_cookie_complete(&desc->async_tx); + + callback = desc->async_tx.callback; + callback_param = desc->async_tx.callback_param; + if (callback) + callback(callback_param); + + dma_descriptor_unmap(&desc->async_tx); +} + +static void fsl_re_cleanup_descs(struct fsl_re_chan *re_chan) +{ + struct fsl_re_desc *desc, *_desc; + unsigned long flags; + + spin_lock_irqsave(&re_chan->desc_lock, flags); + list_for_each_entry_safe(desc, _desc, &re_chan->ack_q, node) { + if (async_tx_test_ack(&desc->async_tx)) + list_move_tail(&desc->node, &re_chan->free_q); + } + spin_unlock_irqrestore(&re_chan->desc_lock, flags); + + fsl_re_issue_pending(&re_chan->chan); +} + +static void fsl_re_dequeue(unsigned long data) +{ + struct fsl_re_chan *re_chan; + struct fsl_re_desc *desc, *_desc; + struct fsl_re_hw_desc *hwdesc; + unsigned long flags; + unsigned int count, oub_count; + int found; + + re_chan = dev_get_drvdata((struct device *)data); + + fsl_re_cleanup_descs(re_chan); + + spin_lock_irqsave(&re_chan->desc_lock, flags); + count = FSL_RE_SLOT_FULL(in_be32(&re_chan->jrregs->oubring_slot_full)); + while (count--) { + found = 0; + hwdesc = &re_chan->oub_ring_virt_addr[re_chan->oub_count]; + list_for_each_entry_safe(desc, _desc, &re_chan->active_q, + node) { + /* compare the hw dma addr to find the completed */ + if (desc->hwdesc.lbea32 == hwdesc->lbea32 && + desc->hwdesc.addr_low == hwdesc->addr_low) { + found = 1; + break; + } + } + + if (found) { + fsl_re_desc_done(desc); + list_move_tail(&desc->node, &re_chan->ack_q); + } else { + dev_err(re_chan->dev, + "found hwdesc not in sw queue, discard it\n"); + } + + oub_count = (re_chan->oub_count + 1) & FSL_RE_RING_SIZE_MASK; + re_chan->oub_count = oub_count; + + out_be32(&re_chan->jrregs->oubring_job_rmvd, + FSL_RE_RMVD_JOB(1)); + } + spin_unlock_irqrestore(&re_chan->desc_lock, flags); +} + +/* Per Job Ring interrupt handler */ +static irqreturn_t fsl_re_isr(int irq, void *data) +{ + struct fsl_re_chan *re_chan; + u32 irqstate, status; + + re_chan = dev_get_drvdata((struct device *)data); + + irqstate = in_be32(&re_chan->jrregs->jr_interrupt_status); + if (!irqstate) + return IRQ_NONE; + + /* + * There's no way in upper layer (read MD layer) to recover from + * error conditions except restart everything. 
+/* Per job ring interrupt handler */
+static irqreturn_t fsl_re_isr(int irq, void *data)
+{
+	struct fsl_re_chan *re_chan;
+	u32 irqstate, status;
+
+	re_chan = dev_get_drvdata((struct device *)data);
+
+	irqstate = in_be32(&re_chan->jrregs->jr_interrupt_status);
+	if (!irqstate)
+		return IRQ_NONE;
+
+	/*
+	 * There's no way in the upper layer (read: the MD layer) to recover
+	 * from error conditions except restarting everything. In the long
+	 * term we need to do something more than just crashing.
+	 */
+	if (irqstate & FSL_RE_ERROR) {
+		status = in_be32(&re_chan->jrregs->jr_status);
+		dev_err(re_chan->dev, "chan error irqstate: %x, status: %x\n",
+			irqstate, status);
+	}
+
+	/* Clear interrupt */
+	out_be32(&re_chan->jrregs->jr_interrupt_status, FSL_RE_CLR_INTR);
+
+	tasklet_schedule(&re_chan->irqtask);
+
+	return IRQ_HANDLED;
+}
+
+static enum dma_status fsl_re_tx_status(struct dma_chan *chan,
+					dma_cookie_t cookie,
+					struct dma_tx_state *txstate)
+{
+	return dma_cookie_status(chan, cookie, txstate);
+}
+
+static void fill_cfd_frame(struct fsl_re_cmpnd_frame *cf, u8 index,
+			   size_t length, dma_addr_t addr, bool final)
+{
+	u32 efrl = length & FSL_RE_CF_LENGTH_MASK;
+
+	efrl |= final << FSL_RE_CF_FINAL_SHIFT;
+	cf[index].efrl32 = efrl;
+	cf[index].addr_high = upper_32_bits(addr);
+	cf[index].addr_low = lower_32_bits(addr);
+}
+
+static struct fsl_re_desc *fsl_re_init_desc(struct fsl_re_chan *re_chan,
+					    struct fsl_re_desc *desc,
+					    void *cf, dma_addr_t paddr)
+{
+	desc->re_chan = re_chan;
+	desc->async_tx.tx_submit = fsl_re_tx_submit;
+	dma_async_tx_descriptor_init(&desc->async_tx, &re_chan->chan);
+	INIT_LIST_HEAD(&desc->node);
+
+	desc->hwdesc.fmt32 = FSL_RE_FRAME_FORMAT << FSL_RE_HWDESC_FMT_SHIFT;
+	desc->hwdesc.lbea32 = upper_32_bits(paddr);
+	desc->hwdesc.addr_low = lower_32_bits(paddr);
+	desc->cf_addr = cf;
+	desc->cf_paddr = paddr;
+
+	desc->cdb_addr = (void *)(cf + FSL_RE_CF_DESC_SIZE);
+	desc->cdb_paddr = paddr + FSL_RE_CF_DESC_SIZE;
+
+	return desc;
+}
+
+static struct fsl_re_desc *fsl_re_chan_alloc_desc(struct fsl_re_chan *re_chan,
+						  unsigned long flags)
+{
+	struct fsl_re_desc *desc = NULL;
+	void *cf;
+	dma_addr_t paddr;
+	unsigned long lock_flag;
+
+	fsl_re_cleanup_descs(re_chan);
+
+	spin_lock_irqsave(&re_chan->desc_lock, lock_flag);
+	if (!list_empty(&re_chan->free_q)) {
+		/* take one desc from free_q */
+		desc = list_first_entry(&re_chan->free_q,
+					struct fsl_re_desc, node);
+		list_del(&desc->node);
+
+		desc->async_tx.flags = flags;
+	}
+	spin_unlock_irqrestore(&re_chan->desc_lock, lock_flag);
+
+	if (!desc) {
+		desc = kzalloc(sizeof(*desc), GFP_NOWAIT);
+		if (!desc)
+			return NULL;
+
+		cf = dma_pool_alloc(re_chan->re_dev->cf_desc_pool, GFP_NOWAIT,
+				    &paddr);
+		if (!cf) {
+			kfree(desc);
+			return NULL;
+		}
+
+		desc = fsl_re_init_desc(re_chan, desc, cf, paddr);
+		desc->async_tx.flags = flags;
+
+		spin_lock_irqsave(&re_chan->desc_lock, lock_flag);
+		re_chan->alloc_count++;
+		spin_unlock_irqrestore(&re_chan->desc_lock, lock_flag);
+	}
+
+	return desc;
+}
+
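+/*
+ * For reference: layout of one cf_desc_pool allocation as wired up by
+ * fsl_re_init_desc() above (offsets follow FSL_RE_CF_DESC_SIZE and
+ * FSL_RE_CF_CDB_SIZE in fsl_raid.h):
+ *
+ *	cf_addr/cf_paddr   -> +---------------------------+ offset 0
+ *	                      | compound frame table      |
+ *	                      | (at most 1 CDB + 2 dest   |
+ *	                      |  + 16 src frames)         |
+ *	cdb_addr/cdb_paddr -> +---------------------------+ offset 320
+ *	                      | command descriptor block  |
+ *	                      +---------------------------+ size 512
+ */
+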
+static struct dma_async_tx_descriptor *fsl_re_prep_dma_genq(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf, size_t len,
+		unsigned long flags)
+{
+	struct fsl_re_chan *re_chan;
+	struct fsl_re_desc *desc;
+	struct fsl_re_xor_cdb *xor;
+	struct fsl_re_cmpnd_frame *cf;
+	u32 cdb;
+	unsigned int i, j;
+	unsigned int save_src_cnt = src_cnt;
+	int cont_q = 0;
+
+	re_chan = container_of(chan, struct fsl_re_chan, chan);
+	if (len > FSL_RE_MAX_DATA_LEN) {
+		dev_err(re_chan->dev, "genq tx length %zu, max length %d\n",
+			len, FSL_RE_MAX_DATA_LEN);
+		return NULL;
+	}
+
+	desc = fsl_re_chan_alloc_desc(re_chan, flags);
+	if (!desc)
+		return NULL;
+
+	if (scf && (flags & DMA_PREP_CONTINUE)) {
+		cont_q = 1;
+		src_cnt += 1;
+	}
+
+	/* Filling xor CDB */
+	cdb = FSL_RE_XOR_OPCODE << FSL_RE_CDB_OPCODE_SHIFT;
+	cdb |= (src_cnt - 1) << FSL_RE_CDB_NRCS_SHIFT;
+	cdb |= FSL_RE_BLOCK_SIZE << FSL_RE_CDB_BLKSIZE_SHIFT;
+	cdb |= FSL_RE_INTR_ON_ERROR << FSL_RE_CDB_ERROR_SHIFT;
+	cdb |= FSL_RE_DATA_DEP << FSL_RE_CDB_DEPEND_SHIFT;
+	xor = desc->cdb_addr;
+	xor->cdb32 = cdb;
+
+	if (scf) {
+		/* compute q = src0*coef0^src1*coef1^..., * is GF(2^8) mult */
+		for (i = 0; i < save_src_cnt; i++)
+			xor->gfm[i] = scf[i];
+		if (cont_q)
+			xor->gfm[i++] = 1;
+	} else {
+		/* compute P, that is XOR of all srcs */
+		for (i = 0; i < src_cnt; i++)
+			xor->gfm[i] = 1;
+	}
+
+	/* Filling frame 0 of compound frame descriptor with CDB */
+	cf = desc->cf_addr;
+	fill_cfd_frame(cf, 0, sizeof(*xor), desc->cdb_paddr, 0);
+
+	/* Fill CFD's 1st frame with dest buffer */
+	fill_cfd_frame(cf, 1, len, dest, 0);
+
+	/* Fill CFD's rest of the frames with source buffers */
+	for (i = 2, j = 0; j < save_src_cnt; i++, j++)
+		fill_cfd_frame(cf, i, len, src[j], 0);
+
+	if (cont_q)
+		fill_cfd_frame(cf, i++, len, dest, 0);
+
+	/* Setting the final bit in the last source buffer frame in CFD */
+	cf[i - 1].efrl32 |= 1 << FSL_RE_CF_FINAL_SHIFT;
+
+	return &desc->async_tx;
+}
+
+/*
+ * Prep function for P parity calculation. In RAID Engine terminology,
+ * XOR calculation is called GenQ calculation and is done through the
+ * GenQ command.
+ */
+static struct dma_async_tx_descriptor *fsl_re_prep_dma_xor(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+		unsigned int src_cnt, size_t len, unsigned long flags)
+{
+	/* NULL lets genq take all coefs as 1 */
+	return fsl_re_prep_dma_genq(chan, dest, src, src_cnt, NULL, len, flags);
+}
+
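+/*
+ * Worked example for the GenQ coefficient array above (the coefficient
+ * values themselves are chosen by the caller, typically the raid6 code;
+ * these are just for illustration): with scf = {0x01, 0x02, 0x04} and
+ * three sources, the engine computes
+ *
+ *	Q = 0x01*D0 ^ 0x02*D1 ^ 0x04*D2
+ *
+ * where * is multiplication in GF(2^8) reduced by the polynomial
+ * programmed via FSL_RE_GFM_POLY (0x1d, i.e. x^8+x^4+x^3+x^2+1) and
+ * ^ is byte-wise XOR. With all coefficients set to 1 this degenerates
+ * to plain parity P, which is how fsl_re_prep_dma_xor() uses it.
+ */
+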
+/*
+ * Prep function for P/Q parity calculation. In RAID Engine terminology,
+ * P/Q calculation is called GenQQ and is done through the GenQQ command.
+ */
+static struct dma_async_tx_descriptor *fsl_re_prep_dma_pq(
+		struct dma_chan *chan, dma_addr_t *dest, dma_addr_t *src,
+		unsigned int src_cnt, const unsigned char *scf, size_t len,
+		unsigned long flags)
+{
+	struct fsl_re_chan *re_chan;
+	struct fsl_re_desc *desc;
+	struct fsl_re_pq_cdb *pq;
+	struct fsl_re_cmpnd_frame *cf;
+	u32 cdb;
+	u8 *p;
+	int gfmq_len, i, j;
+	unsigned int save_src_cnt = src_cnt;
+
+	re_chan = container_of(chan, struct fsl_re_chan, chan);
+	if (len > FSL_RE_MAX_DATA_LEN) {
+		dev_err(re_chan->dev, "pq tx length is %zu, max length is %d\n",
+			len, FSL_RE_MAX_DATA_LEN);
+		return NULL;
+	}
+
+	/*
+	 * The RE requires at least 2 sources. If given only one source, we
+	 * pass the same source twice. With only one source, generating P is
+	 * meaningless, so only generate Q.
+	 */
+	if (src_cnt == 1) {
+		struct dma_async_tx_descriptor *tx;
+		dma_addr_t dma_src[2];
+		unsigned char coef[2];
+
+		dma_src[0] = *src;
+		coef[0] = *scf;
+		dma_src[1] = *src;
+		coef[1] = 0;
+		tx = fsl_re_prep_dma_genq(chan, dest[1], dma_src, 2, coef, len,
+					  flags);
+		if (tx)
+			desc = to_fsl_re_dma_desc(tx);
+
+		return tx;
+	}
+
+	/*
+	 * During RAID6 array creation, Linux's MD layer gets P and Q
+	 * calculated separately in two steps. But our RAID Engine has
+	 * the capability to calculate both P and Q with a single command.
+	 * Hence, to merge well with the MD layer, we need to provide a hook
+	 * here and call the fsl_re_prep_dma_genq() function.
+	 */
+
+	if (flags & DMA_PREP_PQ_DISABLE_P)
+		return fsl_re_prep_dma_genq(chan, dest[1], src, src_cnt,
+					    scf, len, flags);
+
+	if (flags & DMA_PREP_CONTINUE)
+		src_cnt += 3;
+
+	desc = fsl_re_chan_alloc_desc(re_chan, flags);
+	if (!desc)
+		return NULL;
+
+	/* Filling GenQQ CDB */
+	cdb = FSL_RE_PQ_OPCODE << FSL_RE_CDB_OPCODE_SHIFT;
+	cdb |= (src_cnt - 1) << FSL_RE_CDB_NRCS_SHIFT;
+	cdb |= FSL_RE_BLOCK_SIZE << FSL_RE_CDB_BLKSIZE_SHIFT;
+	cdb |= FSL_RE_BUFFER_OUTPUT << FSL_RE_CDB_BUFFER_SHIFT;
+	cdb |= FSL_RE_DATA_DEP << FSL_RE_CDB_DEPEND_SHIFT;
+
+	pq = desc->cdb_addr;
+	pq->cdb32 = cdb;
+
+	p = pq->gfm_q1;
+	/* Init gfm_q1[] */
+	for (i = 0; i < src_cnt; i++)
+		p[i] = 1;
+
+	/* Align gfm[] to 32bit */
+	gfmq_len = ALIGN(src_cnt, 4);
+
+	/* Init gfm_q2[] */
+	p += gfmq_len;
+	for (i = 0; i < src_cnt; i++)
+		p[i] = scf[i];
+
+	/* Filling frame 0 of compound frame descriptor with CDB */
+	cf = desc->cf_addr;
+	fill_cfd_frame(cf, 0, sizeof(struct fsl_re_pq_cdb), desc->cdb_paddr, 0);
+
+	/* Fill CFD's 1st & 2nd frame with dest buffers */
+	for (i = 1, j = 0; i < 3; i++, j++)
+		fill_cfd_frame(cf, i, len, dest[j], 0);
+
+	/* Fill CFD's rest of the frames with source buffers */
+	for (i = 3, j = 0; j < save_src_cnt; i++, j++)
+		fill_cfd_frame(cf, i, len, src[j], 0);
+
+	/* PQ computation continuation */
+	if (flags & DMA_PREP_CONTINUE) {
+		if (src_cnt - save_src_cnt == 3) {
+			p[save_src_cnt] = 0;
+			p[save_src_cnt + 1] = 0;
+			p[save_src_cnt + 2] = 1;
+			fill_cfd_frame(cf, i++, len, dest[0], 0);
+			fill_cfd_frame(cf, i++, len, dest[1], 0);
+			fill_cfd_frame(cf, i++, len, dest[1], 0);
+		} else {
+			dev_err(re_chan->dev, "PQ tx continuation error!\n");
+			return NULL;
+		}
+	}
+
+	/* Setting the final bit in the last source buffer frame in CFD */
+	cf[i - 1].efrl32 |= 1 << FSL_RE_CF_FINAL_SHIFT;
+
+	return &desc->async_tx;
+}
+
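+/*
+ * Compound frame layout produced by fsl_re_prep_dma_pq() above (a summary
+ * of the code for quick reference, not additional logic):
+ *
+ *	frame 0              -> GenQQ CDB (struct fsl_re_pq_cdb)
+ *	frame 1              -> P destination buffer
+ *	frame 2              -> Q destination buffer
+ *	frames 3..src_cnt+2  -> source buffers, plus 3 extra dest frames
+ *	                        for DMA_PREP_CONTINUE; the last frame
+ *	                        carries the FINAL bit
+ */
+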
+/*
+ * Prep function for memcpy. In the RAID Engine, memcpy is done through the
+ * MOVE command. The logic of this function will need to be modified once
+ * multipage support is added in Linux's MD/ASYNC layer.
+ */
+static struct dma_async_tx_descriptor *fsl_re_prep_dma_memcpy(
+		struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
+		size_t len, unsigned long flags)
+{
+	struct fsl_re_chan *re_chan;
+	struct fsl_re_desc *desc;
+	size_t length;
+	struct fsl_re_cmpnd_frame *cf;
+	struct fsl_re_move_cdb *move;
+	u32 cdb;
+
+	re_chan = container_of(chan, struct fsl_re_chan, chan);
+
+	if (len > FSL_RE_MAX_DATA_LEN) {
+		dev_err(re_chan->dev, "cp tx length is %zu, max length is %d\n",
+			len, FSL_RE_MAX_DATA_LEN);
+		return NULL;
+	}
+
+	desc = fsl_re_chan_alloc_desc(re_chan, flags);
+	if (!desc)
+		return NULL;
+
+	/* Filling move CDB */
+	cdb = FSL_RE_MOVE_OPCODE << FSL_RE_CDB_OPCODE_SHIFT;
+	cdb |= FSL_RE_BLOCK_SIZE << FSL_RE_CDB_BLKSIZE_SHIFT;
+	cdb |= FSL_RE_INTR_ON_ERROR << FSL_RE_CDB_ERROR_SHIFT;
+	cdb |= FSL_RE_DATA_DEP << FSL_RE_CDB_DEPEND_SHIFT;
+
+	move = desc->cdb_addr;
+	move->cdb32 = cdb;
+
+	/* Filling frame 0 of CFD with move CDB */
+	cf = desc->cf_addr;
+	fill_cfd_frame(cf, 0, sizeof(*move), desc->cdb_paddr, 0);
+
+	length = min_t(size_t, len, FSL_RE_MAX_DATA_LEN);
+
+	/* Fill CFD's 1st frame with dest buffer */
+	fill_cfd_frame(cf, 1, length, dest, 0);
+
+	/* Fill CFD's 2nd frame with src buffer */
+	fill_cfd_frame(cf, 2, length, src, 1);
+
+	return &desc->async_tx;
+}
+
+static int fsl_re_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct fsl_re_chan *re_chan;
+	struct fsl_re_desc *desc;
+	void *cf;
+	dma_addr_t paddr;
+	int i;
+
+	re_chan = container_of(chan, struct fsl_re_chan, chan);
+	for (i = 0; i < FSL_RE_MIN_DESCS; i++) {
+		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+		if (!desc)
+			break;
+
+		cf = dma_pool_alloc(re_chan->re_dev->cf_desc_pool, GFP_KERNEL,
+				    &paddr);
+		if (!cf) {
+			kfree(desc);
+			break;
+		}
+
+		INIT_LIST_HEAD(&desc->node);
+		fsl_re_init_desc(re_chan, desc, cf, paddr);
+
+		list_add_tail(&desc->node, &re_chan->free_q);
+		re_chan->alloc_count++;
+	}
+	return re_chan->alloc_count;
+}
+
+static void fsl_re_free_chan_resources(struct dma_chan *chan)
+{
+	struct fsl_re_chan *re_chan;
+	struct fsl_re_desc *desc;
+
+	re_chan = container_of(chan, struct fsl_re_chan, chan);
+	while (re_chan->alloc_count--) {
+		desc = list_first_entry(&re_chan->free_q,
+					struct fsl_re_desc,
+					node);
+
+		list_del(&desc->node);
+		dma_pool_free(re_chan->re_dev->cf_desc_pool, desc->cf_addr,
+			      desc->cf_paddr);
+		kfree(desc);
+	}
+
+	if (!list_empty(&re_chan->free_q))
+		dev_err(re_chan->dev, "chan resources cannot be cleaned!\n");
+}
+
+static int fsl_re_chan_probe(struct platform_device *ofdev,
+			     struct device_node *np, u8 q, u32 off)
+{
+	struct device *dev, *chandev;
+	struct fsl_re_drv_private *re_priv;
+	struct fsl_re_chan *chan;
+	struct dma_device *dma_dev;
+	u32 ptr;
+	u32 status;
+	int ret = 0, rc;
+	struct platform_device *chan_ofdev;
+
+	dev = &ofdev->dev;
+	re_priv = dev_get_drvdata(dev);
+	dma_dev = &re_priv->dma_dev;
+
+	chan = devm_kzalloc(dev, sizeof(*chan), GFP_KERNEL);
+	if (!chan)
+		return -ENOMEM;
+
+	/* create platform device for chan node */
+	chan_ofdev = of_platform_device_create(np, NULL, dev);
+	if (!chan_ofdev) {
+		dev_err(dev, "Not able to create ofdev for jr %d\n", q);
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	/* read reg property from dts */
+	rc = of_property_read_u32(np, "reg", &ptr);
+	if (rc) {
+		dev_err(dev, "Reg property not found in jr %d\n", q);
+		ret = -ENODEV;
+		goto err_free;
+	}
+
+	chan->jrregs = (struct fsl_re_chan_cfg *)((u8
*)re_priv->re_regs + + off + ptr); + + /* read irq property from dts */ + chan->irq = irq_of_parse_and_map(np, 0); + if (chan->irq == NO_IRQ) { + dev_err(dev, "No IRQ defined for JR %d\n", q); + ret = -ENODEV; + goto err_free; + } + + snprintf(chan->name, sizeof(chan->name), "re_jr%02d", q); + + chandev = &chan_ofdev->dev; + tasklet_init(&chan->irqtask, fsl_re_dequeue, (unsigned long)chandev); + + ret = request_irq(chan->irq, fsl_re_isr, 0, chan->name, chandev); + if (ret) { + dev_err(dev, "Unable to register interrupt for JR %d\n", q); + ret = -EINVAL; + goto err_free; + } + + re_priv->re_jrs[q] = chan; + chan->chan.device = dma_dev; + chan->chan.private = chan; + chan->dev = chandev; + chan->re_dev = re_priv; + + spin_lock_init(&chan->desc_lock); + INIT_LIST_HEAD(&chan->ack_q); + INIT_LIST_HEAD(&chan->active_q); + INIT_LIST_HEAD(&chan->submit_q); + INIT_LIST_HEAD(&chan->free_q); + + chan->inb_ring_virt_addr = dma_pool_alloc(chan->re_dev->hw_desc_pool, + GFP_KERNEL, &chan->inb_phys_addr); + if (!chan->inb_ring_virt_addr) { + dev_err(dev, "No dma memory for inb_ring_virt_addr\n"); + ret = -ENOMEM; + goto err_free; + } + + chan->oub_ring_virt_addr = dma_pool_alloc(chan->re_dev->hw_desc_pool, + GFP_KERNEL, &chan->oub_phys_addr); + if (!chan->oub_ring_virt_addr) { + dev_err(dev, "No dma memory for oub_ring_virt_addr\n"); + ret = -ENOMEM; + goto err_free_1; + } + + /* Program the Inbound/Outbound ring base addresses and size */ + out_be32(&chan->jrregs->inbring_base_h, + chan->inb_phys_addr & FSL_RE_ADDR_BIT_MASK); + out_be32(&chan->jrregs->oubring_base_h, + chan->oub_phys_addr & FSL_RE_ADDR_BIT_MASK); + out_be32(&chan->jrregs->inbring_base_l, + chan->inb_phys_addr >> FSL_RE_ADDR_BIT_SHIFT); + out_be32(&chan->jrregs->oubring_base_l, + chan->oub_phys_addr >> FSL_RE_ADDR_BIT_SHIFT); + out_be32(&chan->jrregs->inbring_size, + FSL_RE_RING_SIZE << FSL_RE_RING_SIZE_SHIFT); + out_be32(&chan->jrregs->oubring_size, + FSL_RE_RING_SIZE << FSL_RE_RING_SIZE_SHIFT); + + /* Read LIODN value from u-boot */ + status = in_be32(&chan->jrregs->jr_config_1) & FSL_RE_REG_LIODN_MASK; + + /* Program the CFG reg */ + out_be32(&chan->jrregs->jr_config_1, + FSL_RE_CFG1_CBSI | FSL_RE_CFG1_CBS0 | status); + + dev_set_drvdata(chandev, chan); + + /* Enable RE/CHAN */ + out_be32(&chan->jrregs->jr_command, FSL_RE_ENABLE); + + return 0; + +err_free_1: + dma_pool_free(chan->re_dev->hw_desc_pool, chan->inb_ring_virt_addr, + chan->inb_phys_addr); +err_free: + return ret; +} + +/* Probe function for RAID Engine */ +static int fsl_re_probe(struct platform_device *ofdev) +{ + struct fsl_re_drv_private *re_priv; + struct device_node *np; + struct device_node *child; + u32 off; + u8 ridx = 0; + struct dma_device *dma_dev; + struct resource *res; + int rc; + struct device *dev = &ofdev->dev; + + re_priv = devm_kzalloc(dev, sizeof(*re_priv), GFP_KERNEL); + if (!re_priv) + return -ENOMEM; + + res = platform_get_resource(ofdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + /* IOMAP the entire RAID Engine region */ + re_priv->re_regs = devm_ioremap(dev, res->start, resource_size(res)); + if (!re_priv->re_regs) + return -EBUSY; + + /* Program the RE mode */ + out_be32(&re_priv->re_regs->global_config, FSL_RE_NON_DPAA_MODE); + + /* Program Galois Field polynomial */ + out_be32(&re_priv->re_regs->galois_field_config, FSL_RE_GFM_POLY); + + dev_info(dev, "version %x, mode %x, gfp %x\n", + in_be32(&re_priv->re_regs->re_version_id), + in_be32(&re_priv->re_regs->global_config), + in_be32(&re_priv->re_regs->galois_field_config)); + + dma_dev 
= &re_priv->dma_dev; + dma_dev->dev = dev; + INIT_LIST_HEAD(&dma_dev->channels); + dma_set_mask(dev, DMA_BIT_MASK(40)); + + dma_dev->device_alloc_chan_resources = fsl_re_alloc_chan_resources; + dma_dev->device_tx_status = fsl_re_tx_status; + dma_dev->device_issue_pending = fsl_re_issue_pending; + + dma_dev->max_xor = FSL_RE_MAX_XOR_SRCS; + dma_dev->device_prep_dma_xor = fsl_re_prep_dma_xor; + dma_cap_set(DMA_XOR, dma_dev->cap_mask); + + dma_dev->max_pq = FSL_RE_MAX_PQ_SRCS; + dma_dev->device_prep_dma_pq = fsl_re_prep_dma_pq; + dma_cap_set(DMA_PQ, dma_dev->cap_mask); + + dma_dev->device_prep_dma_memcpy = fsl_re_prep_dma_memcpy; + dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask); + + dma_dev->device_free_chan_resources = fsl_re_free_chan_resources; + + re_priv->total_chans = 0; + + re_priv->cf_desc_pool = dmam_pool_create("fsl_re_cf_desc_pool", dev, + FSL_RE_CF_CDB_SIZE, + FSL_RE_CF_CDB_ALIGN, 0); + + if (!re_priv->cf_desc_pool) { + dev_err(dev, "No memory for fsl re_cf desc pool\n"); + return -ENOMEM; + } + + re_priv->hw_desc_pool = dmam_pool_create("fsl_re_hw_desc_pool", dev, + sizeof(struct fsl_re_hw_desc) * FSL_RE_RING_SIZE, + FSL_RE_FRAME_ALIGN, 0); + if (!re_priv->hw_desc_pool) { + dev_err(dev, "No memory for fsl re_hw desc pool\n"); + return -ENOMEM; + } + + dev_set_drvdata(dev, re_priv); + + /* Parse Device tree to find out the total number of JQs present */ + for_each_compatible_node(np, NULL, "fsl,raideng-v1.0-job-queue") { + rc = of_property_read_u32(np, "reg", &off); + if (rc) { + dev_err(dev, "Reg property not found in JQ node\n"); + return -ENODEV; + } + /* Find out the Job Rings present under each JQ */ + for_each_child_of_node(np, child) { + rc = of_device_is_compatible(child, + "fsl,raideng-v1.0-job-ring"); + if (rc) { + fsl_re_chan_probe(ofdev, child, ridx++, off); + re_priv->total_chans++; + } + } + } + + dma_async_device_register(dma_dev); + + return 0; +} + +static void fsl_re_remove_chan(struct fsl_re_chan *chan) +{ + dma_pool_free(chan->re_dev->hw_desc_pool, chan->inb_ring_virt_addr, + chan->inb_phys_addr); + + dma_pool_free(chan->re_dev->hw_desc_pool, chan->oub_ring_virt_addr, + chan->oub_phys_addr); +} + +static int fsl_re_remove(struct platform_device *ofdev) +{ + struct fsl_re_drv_private *re_priv; + struct device *dev; + int i; + + dev = &ofdev->dev; + re_priv = dev_get_drvdata(dev); + + /* Cleanup chan related memory areas */ + for (i = 0; i < re_priv->total_chans; i++) + fsl_re_remove_chan(re_priv->re_jrs[i]); + + /* Unregister the driver */ + dma_async_device_unregister(&re_priv->dma_dev); + + return 0; +} + +static struct of_device_id fsl_re_ids[] = { + { .compatible = "fsl,raideng-v1.0", }, + {} +}; + +static struct platform_driver fsl_re_driver = { + .driver = { + .name = "fsl-raideng", + .owner = THIS_MODULE, + .of_match_table = fsl_re_ids, + }, + .probe = fsl_re_probe, + .remove = fsl_re_remove, +}; + +module_platform_driver(fsl_re_driver); + +MODULE_AUTHOR("Harninder Rai <harninder.rai@freescale.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Freescale RAID Engine Device Driver"); diff --git a/drivers/dma/fsl_raid.h b/drivers/dma/fsl_raid.h new file mode 100644 index 000000000000..69d743c04973 --- /dev/null +++ b/drivers/dma/fsl_raid.h @@ -0,0 +1,306 @@ +/* + * drivers/dma/fsl_raid.h + * + * Freescale RAID Engine device driver + * + * Author: + * Harninder Rai <harninder.rai@freescale.com> + * Naveen Burmi <naveenburmi@freescale.com> + * + * Rewrite: + * Xuelin Shi <xuelin.shi@freescale.com> + + * Copyright (c) 2010-2012 Freescale Semiconductor, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define FSL_RE_MAX_CHANS 4 +#define FSL_RE_DPAA_MODE BIT(30) +#define FSL_RE_NON_DPAA_MODE BIT(31) +#define FSL_RE_GFM_POLY 0x1d000000 +#define FSL_RE_ADD_JOB(x) ((x) << 16) +#define FSL_RE_RMVD_JOB(x) ((x) << 16) +#define FSL_RE_CFG1_CBSI 0x08000000 +#define FSL_RE_CFG1_CBS0 0x00080000 +#define FSL_RE_SLOT_FULL_SHIFT 8 +#define FSL_RE_SLOT_FULL(x) ((x) >> FSL_RE_SLOT_FULL_SHIFT) +#define FSL_RE_SLOT_AVAIL_SHIFT 8 +#define FSL_RE_SLOT_AVAIL(x) ((x) >> FSL_RE_SLOT_AVAIL_SHIFT) +#define FSL_RE_PQ_OPCODE 0x1B +#define FSL_RE_XOR_OPCODE 0x1A +#define FSL_RE_MOVE_OPCODE 0x8 +#define FSL_RE_FRAME_ALIGN 16 +#define FSL_RE_BLOCK_SIZE 0x3 /* 4096 bytes */ +#define FSL_RE_CACHEABLE_IO 0x0 +#define FSL_RE_BUFFER_OUTPUT 0x0 +#define FSL_RE_INTR_ON_ERROR 0x1 +#define FSL_RE_DATA_DEP 0x1 +#define FSL_RE_ENABLE_DPI 0x0 +#define FSL_RE_RING_SIZE 0x400 +#define FSL_RE_RING_SIZE_MASK (FSL_RE_RING_SIZE - 1) +#define FSL_RE_RING_SIZE_SHIFT 8 +#define FSL_RE_ADDR_BIT_SHIFT 4 +#define FSL_RE_ADDR_BIT_MASK (BIT(FSL_RE_ADDR_BIT_SHIFT) - 1) +#define FSL_RE_ERROR 0x40000000 +#define FSL_RE_INTR 0x80000000 +#define FSL_RE_CLR_INTR 0x80000000 +#define FSL_RE_PAUSE 0x80000000 +#define FSL_RE_ENABLE 0x80000000 +#define FSL_RE_REG_LIODN_MASK 0x00000FFF + +#define FSL_RE_CDB_OPCODE_MASK 0xF8000000 +#define FSL_RE_CDB_OPCODE_SHIFT 27 +#define FSL_RE_CDB_EXCLEN_MASK 0x03000000 +#define FSL_RE_CDB_EXCLEN_SHIFT 24 +#define FSL_RE_CDB_EXCLQ1_MASK 0x00F00000 +#define FSL_RE_CDB_EXCLQ1_SHIFT 20 +#define FSL_RE_CDB_EXCLQ2_MASK 0x000F0000 +#define FSL_RE_CDB_EXCLQ2_SHIFT 16 +#define FSL_RE_CDB_BLKSIZE_MASK 0x0000C000 +#define FSL_RE_CDB_BLKSIZE_SHIFT 14 +#define FSL_RE_CDB_CACHE_MASK 0x00003000 +#define FSL_RE_CDB_CACHE_SHIFT 12 +#define FSL_RE_CDB_BUFFER_MASK 0x00000800 +#define FSL_RE_CDB_BUFFER_SHIFT 11 +#define 
FSL_RE_CDB_ERROR_MASK 0x00000400 +#define FSL_RE_CDB_ERROR_SHIFT 10 +#define FSL_RE_CDB_NRCS_MASK 0x0000003C +#define FSL_RE_CDB_NRCS_SHIFT 6 +#define FSL_RE_CDB_DEPEND_MASK 0x00000008 +#define FSL_RE_CDB_DEPEND_SHIFT 3 +#define FSL_RE_CDB_DPI_MASK 0x00000004 +#define FSL_RE_CDB_DPI_SHIFT 2 + +/* + * the largest cf block is 19*sizeof(struct cmpnd_frame), which is 304 bytes. + * here 19 = 1(cdb)+2(dest)+16(src), align to 64bytes, that is 320 bytes. + * the largest cdb block: struct pq_cdb which is 180 bytes, adding to cf block + * 320+180=500, align to 64bytes, that is 512 bytes. + */ +#define FSL_RE_CF_DESC_SIZE 320 +#define FSL_RE_CF_CDB_SIZE 512 +#define FSL_RE_CF_CDB_ALIGN 64 + +struct fsl_re_ctrl { + /* General Configuration Registers */ + __be32 global_config; /* Global Configuration Register */ + u8 rsvd1[4]; + __be32 galois_field_config; /* Galois Field Configuration Register */ + u8 rsvd2[4]; + __be32 jq_wrr_config; /* WRR Configuration register */ + u8 rsvd3[4]; + __be32 crc_config; /* CRC Configuration register */ + u8 rsvd4[228]; + __be32 system_reset; /* System Reset Register */ + u8 rsvd5[252]; + __be32 global_status; /* Global Status Register */ + u8 rsvd6[832]; + __be32 re_liodn_base; /* LIODN Base Register */ + u8 rsvd7[1712]; + __be32 re_version_id; /* Version ID register of RE */ + __be32 re_version_id_2; /* Version ID 2 register of RE */ + u8 rsvd8[512]; + __be32 host_config; /* Host I/F Configuration Register */ +}; + +struct fsl_re_chan_cfg { + /* Registers for JR interface */ + __be32 jr_config_0; /* Job Queue Configuration 0 Register */ + __be32 jr_config_1; /* Job Queue Configuration 1 Register */ + __be32 jr_interrupt_status; /* Job Queue Interrupt Status Register */ + u8 rsvd1[4]; + __be32 jr_command; /* Job Queue Command Register */ + u8 rsvd2[4]; + __be32 jr_status; /* Job Queue Status Register */ + u8 rsvd3[228]; + + /* Input Ring */ + __be32 inbring_base_h; /* Inbound Ring Base Address Register - High */ + __be32 inbring_base_l; /* Inbound Ring Base Address Register - Low */ + __be32 inbring_size; /* Inbound Ring Size Register */ + u8 rsvd4[4]; + __be32 inbring_slot_avail; /* Inbound Ring Slot Available Register */ + u8 rsvd5[4]; + __be32 inbring_add_job; /* Inbound Ring Add Job Register */ + u8 rsvd6[4]; + __be32 inbring_cnsmr_indx; /* Inbound Ring Consumer Index Register */ + u8 rsvd7[220]; + + /* Output Ring */ + __be32 oubring_base_h; /* Outbound Ring Base Address Register - High */ + __be32 oubring_base_l; /* Outbound Ring Base Address Register - Low */ + __be32 oubring_size; /* Outbound Ring Size Register */ + u8 rsvd8[4]; + __be32 oubring_job_rmvd; /* Outbound Ring Job Removed Register */ + u8 rsvd9[4]; + __be32 oubring_slot_full; /* Outbound Ring Slot Full Register */ + u8 rsvd10[4]; + __be32 oubring_prdcr_indx; /* Outbound Ring Producer Index */ +}; + +/* + * Command Descriptor Block (CDB) for unicast move command. + * In RAID Engine terms, memcpy is done through move command + */ +struct fsl_re_move_cdb { + __be32 cdb32; +}; + +/* Data protection/integrity related fields */ +#define FSL_RE_DPI_APPS_MASK 0xC0000000 +#define FSL_RE_DPI_APPS_SHIFT 30 +#define FSL_RE_DPI_REF_MASK 0x30000000 +#define FSL_RE_DPI_REF_SHIFT 28 +#define FSL_RE_DPI_GUARD_MASK 0x0C000000 +#define FSL_RE_DPI_GUARD_SHIFT 26 +#define FSL_RE_DPI_ATTR_MASK 0x03000000 +#define FSL_RE_DPI_ATTR_SHIFT 24 +#define FSL_RE_DPI_META_MASK 0x0000FFFF + +struct fsl_re_dpi { + __be32 dpi32; + __be32 ref; +}; + +/* + * CDB for GenQ command. 
In RAID Engine terminology, XOR is + * done through this command + */ +struct fsl_re_xor_cdb { + __be32 cdb32; + u8 gfm[16]; + struct fsl_re_dpi dpi_dest_spec; + struct fsl_re_dpi dpi_src_spec[16]; +}; + +/* CDB for no-op command */ +struct fsl_re_noop_cdb { + __be32 cdb32; +}; + +/* + * CDB for GenQQ command. In RAID Engine terminology, P/Q is + * done through this command + */ +struct fsl_re_pq_cdb { + __be32 cdb32; + u8 gfm_q1[16]; + u8 gfm_q2[16]; + struct fsl_re_dpi dpi_dest_spec[2]; + struct fsl_re_dpi dpi_src_spec[16]; +}; + +/* Compound frame */ +#define FSL_RE_CF_ADDR_HIGH_MASK 0x000000FF +#define FSL_RE_CF_EXT_MASK 0x80000000 +#define FSL_RE_CF_EXT_SHIFT 31 +#define FSL_RE_CF_FINAL_MASK 0x40000000 +#define FSL_RE_CF_FINAL_SHIFT 30 +#define FSL_RE_CF_LENGTH_MASK 0x000FFFFF +#define FSL_RE_CF_BPID_MASK 0x00FF0000 +#define FSL_RE_CF_BPID_SHIFT 16 +#define FSL_RE_CF_OFFSET_MASK 0x00001FFF + +struct fsl_re_cmpnd_frame { + __be32 addr_high; + __be32 addr_low; + __be32 efrl32; + __be32 rbro32; +}; + +/* Frame descriptor */ +#define FSL_RE_HWDESC_LIODN_MASK 0x3F000000 +#define FSL_RE_HWDESC_LIODN_SHIFT 24 +#define FSL_RE_HWDESC_BPID_MASK 0x00FF0000 +#define FSL_RE_HWDESC_BPID_SHIFT 16 +#define FSL_RE_HWDESC_ELIODN_MASK 0x0000F000 +#define FSL_RE_HWDESC_ELIODN_SHIFT 12 +#define FSL_RE_HWDESC_FMT_SHIFT 29 +#define FSL_RE_HWDESC_FMT_MASK (0x3 << FSL_RE_HWDESC_FMT_SHIFT) + +struct fsl_re_hw_desc { + __be32 lbea32; + __be32 addr_low; + __be32 fmt32; + __be32 status; +}; + +/* Raid Engine device private data */ +struct fsl_re_drv_private { + u8 total_chans; + struct dma_device dma_dev; + struct fsl_re_ctrl *re_regs; + struct fsl_re_chan *re_jrs[FSL_RE_MAX_CHANS]; + struct dma_pool *cf_desc_pool; + struct dma_pool *hw_desc_pool; +}; + +/* Per job ring data structure */ +struct fsl_re_chan { + char name[16]; + spinlock_t desc_lock; /* queue lock */ + struct list_head ack_q; /* wait to acked queue */ + struct list_head active_q; /* already issued on hw, not completed */ + struct list_head submit_q; + struct list_head free_q; /* alloc available queue */ + struct device *dev; + struct fsl_re_drv_private *re_dev; + struct dma_chan chan; + struct fsl_re_chan_cfg *jrregs; + int irq; + struct tasklet_struct irqtask; + u32 alloc_count; + + /* hw descriptor ring for inbound queue*/ + dma_addr_t inb_phys_addr; + struct fsl_re_hw_desc *inb_ring_virt_addr; + u32 inb_count; + + /* hw descriptor ring for outbound queue */ + dma_addr_t oub_phys_addr; + struct fsl_re_hw_desc *oub_ring_virt_addr; + u32 oub_count; +}; + +/* Async transaction descriptor */ +struct fsl_re_desc { + struct dma_async_tx_descriptor async_tx; + struct list_head node; + struct fsl_re_hw_desc hwdesc; + struct fsl_re_chan *re_chan; + + /* hwdesc will point to cf_addr */ + void *cf_addr; + dma_addr_t cf_paddr; + + void *cdb_addr; + dma_addr_t cdb_paddr; + int status; +}; diff --git a/drivers/dma/img-mdc-dma.c b/drivers/dma/img-mdc-dma.c index ed045a9ad634..9ca56830cc63 100644 --- a/drivers/dma/img-mdc-dma.c +++ b/drivers/dma/img-mdc-dma.c @@ -689,11 +689,6 @@ static int mdc_slave_config(struct dma_chan *chan, return 0; } -static int mdc_alloc_chan_resources(struct dma_chan *chan) -{ - return 0; -} - static void mdc_free_chan_resources(struct dma_chan *chan) { struct mdc_chan *mchan = to_mdc_chan(chan); @@ -910,7 +905,6 @@ static int mdc_dma_probe(struct platform_device *pdev) mdma->dma_dev.device_prep_slave_sg = mdc_prep_slave_sg; mdma->dma_dev.device_prep_dma_cyclic = mdc_prep_dma_cyclic; mdma->dma_dev.device_prep_dma_memcpy = 
mdc_prep_dma_memcpy; - mdma->dma_dev.device_alloc_chan_resources = mdc_alloc_chan_resources; mdma->dma_dev.device_free_chan_resources = mdc_free_chan_resources; mdma->dma_dev.device_tx_status = mdc_tx_status; mdma->dma_dev.device_issue_pending = mdc_issue_pending; diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 66a0efb9651d..62bbd79338e0 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1260,6 +1260,7 @@ static void sdma_issue_pending(struct dma_chan *chan) #define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1 34 #define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V2 38 +#define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V3 41 static void sdma_add_scripts(struct sdma_engine *sdma, const struct sdma_script_start_addrs *addr) @@ -1306,6 +1307,9 @@ static void sdma_load_firmware(const struct firmware *fw, void *context) case 2: sdma->script_number = SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V2; break; + case 3: + sdma->script_number = SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V3; + break; default: dev_err(sdma->dev, "unknown firmware version\n"); goto err_firmware; diff --git a/drivers/dma/ioat/dca.c b/drivers/dma/ioat/dca.c index 3b55bb8d969a..ea1e107ae884 100644 --- a/drivers/dma/ioat/dca.c +++ b/drivers/dma/ioat/dca.c @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 940c1502a8b5..ee0aa9f4ccfa 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index d63f68b1aa35..30f5c7eede16 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. */ diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index 695483e6be32..69c7dfcad023 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * * The full GNU General Public License is included in this distribution in * the file called "COPYING". 
* diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h index 470292767e68..bf24ebe874b0 100644 --- a/drivers/dma/ioat/dma_v2.h +++ b/drivers/dma/ioat/dma_v2.h @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. */ diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 194ec20c9408..64790a45ef5d 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -15,10 +15,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h index 02177ecf09f8..a3e731edce57 100644 --- a/drivers/dma/ioat/hw.h +++ b/drivers/dma/ioat/hw.h @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. */ diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c index 5501eb072d69..76f0dc688a19 100644 --- a/drivers/dma/ioat/pci.c +++ b/drivers/dma/ioat/pci.c @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * * The full GNU General Public License is included in this distribution in * the file called "COPYING". * diff --git a/drivers/dma/ioat/registers.h b/drivers/dma/ioat/registers.h index 2f1cfa0f1f47..909352f74c89 100644 --- a/drivers/dma/ioat/registers.h +++ b/drivers/dma/ioat/registers.h @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. */ diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 263d9f6a207e..998826854fdd 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
- * */ /* diff --git a/drivers/dma/k3dma.c b/drivers/dma/k3dma.c index 6f7f43529ccb..647e362f01fd 100644 --- a/drivers/dma/k3dma.c +++ b/drivers/dma/k3dma.c @@ -313,11 +313,6 @@ static void k3_dma_tasklet(unsigned long arg) } } -static int k3_dma_alloc_chan_resources(struct dma_chan *chan) -{ - return 0; -} - static void k3_dma_free_chan_resources(struct dma_chan *chan) { struct k3_dma_chan *c = to_k3_chan(chan); @@ -654,7 +649,7 @@ static void k3_dma_free_desc(struct virt_dma_desc *vd) kfree(ds); } -static struct of_device_id k3_pdma_dt_ids[] = { +static const struct of_device_id k3_pdma_dt_ids[] = { { .compatible = "hisilicon,k3-dma-1.0", }, {} }; @@ -728,7 +723,6 @@ static int k3_dma_probe(struct platform_device *op) dma_cap_set(DMA_SLAVE, d->slave.cap_mask); dma_cap_set(DMA_MEMCPY, d->slave.cap_mask); d->slave.dev = &op->dev; - d->slave.device_alloc_chan_resources = k3_dma_alloc_chan_resources; d->slave.device_free_chan_resources = k3_dma_free_chan_resources; d->slave.device_tx_status = k3_dma_tx_status; d->slave.device_prep_dma_memcpy = k3_dma_prep_memcpy; diff --git a/drivers/dma/mmp_pdma.c b/drivers/dma/mmp_pdma.c index eb410044e1af..462a0229a743 100644 --- a/drivers/dma/mmp_pdma.c +++ b/drivers/dma/mmp_pdma.c @@ -973,7 +973,7 @@ static int mmp_pdma_chan_init(struct mmp_pdma_device *pdev, int idx, int irq) return 0; } -static struct of_device_id mmp_pdma_dt_ids[] = { +static const struct of_device_id mmp_pdma_dt_ids[] = { { .compatible = "marvell,pdma-1.0", }, {} }; diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c index b6f4e1fc9c78..449e785def17 100644 --- a/drivers/dma/mmp_tdma.c +++ b/drivers/dma/mmp_tdma.c @@ -613,7 +613,7 @@ struct dma_chan *mmp_tdma_xlate(struct of_phandle_args *dma_spec, return dma_request_channel(mask, mmp_tdma_filter_fn, ¶m); } -static struct of_device_id mmp_tdma_dt_ids[] = { +static const struct of_device_id mmp_tdma_dt_ids[] = { { .compatible = "marvell,adma-1.0", .data = (void *)MMP_AUD_TDMA}, { .compatible = "marvell,pxa910-squ", .data = (void *)PXA910_SQU}, {} diff --git a/drivers/dma/mpc512x_dma.c b/drivers/dma/mpc512x_dma.c index 57d2457545f3..e6281e7aa46e 100644 --- a/drivers/dma/mpc512x_dma.c +++ b/drivers/dma/mpc512x_dma.c @@ -21,10 +21,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. */ @@ -1072,7 +1068,7 @@ static int mpc_dma_remove(struct platform_device *op) return 0; } -static struct of_device_id mpc_dma_match[] = { +static const struct of_device_id mpc_dma_match[] = { { .compatible = "fsl,mpc5121-dma", }, { .compatible = "fsl,mpc8308-dma", }, {}, diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index b03e8137b918..1c56001df676 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -10,10 +10,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #include <linux/init.h> @@ -1249,7 +1245,7 @@ static int mv_xor_remove(struct platform_device *pdev) } #ifdef CONFIG_OF -static struct of_device_id mv_xor_dt_ids[] = { +static const struct of_device_id mv_xor_dt_ids[] = { { .compatible = "marvell,orion-xor", }, {}, }; diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h index 78edc7e44569..91958dba39a2 100644 --- a/drivers/dma/mv_xor.h +++ b/drivers/dma/mv_xor.h @@ -9,10 +9,6 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ #ifndef MV_XOR_H diff --git a/drivers/dma/pch_dma.c b/drivers/dma/pch_dma.c index 35c143cb88da..b859792dde95 100644 --- a/drivers/dma/pch_dma.c +++ b/drivers/dma/pch_dma.c @@ -949,6 +949,7 @@ err_free_res: err_disable_pdev: pci_disable_device(pdev); err_free_mem: + kfree(pd); return err; } diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 0e1f56772855..a7d9d3029b14 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -556,7 +556,7 @@ static inline u32 _emit_ADDH(unsigned dry_run, u8 buf[], buf[0] = CMD_DMAADDH; buf[0] |= (da << 1); - *((u16 *)&buf[1]) = val; + *((__le16 *)&buf[1]) = cpu_to_le16(val); PL330_DBGCMD_DUMP(SZ_DMAADDH, "\tDMAADDH %s %u\n", da == 1 ? "DA" : "SA", val); @@ -710,7 +710,7 @@ static inline u32 _emit_MOV(unsigned dry_run, u8 buf[], buf[0] = CMD_DMAMOV; buf[1] = dst; - *((u32 *)&buf[2]) = val; + *((__le32 *)&buf[2]) = cpu_to_le32(val); PL330_DBGCMD_DUMP(SZ_DMAMOV, "\tDMAMOV %s 0x%x\n", dst == SAR ? "SAR" : (dst == DAR ? "DAR" : "CCR"), val); @@ -888,7 +888,7 @@ static inline u32 _emit_GO(unsigned dry_run, u8 buf[], buf[1] = chan & 0x7; - *((u32 *)&buf[2]) = addr; + *((__le32 *)&buf[2]) = cpu_to_le32(addr); return SZ_DMAGO; } @@ -928,7 +928,7 @@ static inline void _execute_DBGINSN(struct pl330_thread *thrd, } writel(val, regs + DBGINST0); - val = *((u32 *)&insn[2]); + val = le32_to_cpu(*((__le32 *)&insn[2])); writel(val, regs + DBGINST1); /* If timed out due to halted state-machine */ @@ -2162,7 +2162,7 @@ static int pl330_terminate_all(struct dma_chan *chan) * DMA transfer again. This pause feature was implemented to * allow safely read residue before channel termination. 
*/ -int pl330_pause(struct dma_chan *chan) +static int pl330_pause(struct dma_chan *chan) { struct dma_pl330_chan *pch = to_pchan(chan); struct pl330_dmac *pl330 = pch->dmac; @@ -2203,8 +2203,8 @@ static void pl330_free_chan_resources(struct dma_chan *chan) pm_runtime_put_autosuspend(pch->dmac->ddma.dev); } -int pl330_get_current_xferred_count(struct dma_pl330_chan *pch, - struct dma_pl330_desc *desc) +static int pl330_get_current_xferred_count(struct dma_pl330_chan *pch, + struct dma_pl330_desc *desc) { struct pl330_thread *thrd = pch->thread; struct pl330_dmac *pl330 = pch->dmac; @@ -2259,7 +2259,17 @@ pl330_tx_status(struct dma_chan *chan, dma_cookie_t cookie, transferred = 0; residual += desc->bytes_requested - transferred; if (desc->txd.cookie == cookie) { - ret = desc->status; + switch (desc->status) { + case DONE: + ret = DMA_COMPLETE; + break; + case PREP: + case BUSY: + ret = DMA_IN_PROGRESS; + break; + default: + WARN_ON(1); + } break; } if (desc->last) diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c index fa764a39cd36..9217f893b0d1 100644 --- a/drivers/dma/ppc4xx/adma.c +++ b/drivers/dma/ppc4xx/adma.c @@ -16,10 +16,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. */ diff --git a/drivers/dma/qcom_bam_dma.c b/drivers/dma/qcom_bam_dma.c index 9c914d625906..5a250cdc8376 100644 --- a/drivers/dma/qcom_bam_dma.c +++ b/drivers/dma/qcom_bam_dma.c @@ -171,6 +171,35 @@ static const struct reg_offset_data bam_v1_4_reg_info[] = { [BAM_P_FIFO_SIZES] = { 0x1820, 0x00, 0x1000, 0x00 }, }; +static const struct reg_offset_data bam_v1_7_reg_info[] = { + [BAM_CTRL] = { 0x00000, 0x00, 0x00, 0x00 }, + [BAM_REVISION] = { 0x01000, 0x00, 0x00, 0x00 }, + [BAM_NUM_PIPES] = { 0x01008, 0x00, 0x00, 0x00 }, + [BAM_DESC_CNT_TRSHLD] = { 0x00008, 0x00, 0x00, 0x00 }, + [BAM_IRQ_SRCS] = { 0x03010, 0x00, 0x00, 0x00 }, + [BAM_IRQ_SRCS_MSK] = { 0x03014, 0x00, 0x00, 0x00 }, + [BAM_IRQ_SRCS_UNMASKED] = { 0x03018, 0x00, 0x00, 0x00 }, + [BAM_IRQ_STTS] = { 0x00014, 0x00, 0x00, 0x00 }, + [BAM_IRQ_CLR] = { 0x00018, 0x00, 0x00, 0x00 }, + [BAM_IRQ_EN] = { 0x0001C, 0x00, 0x00, 0x00 }, + [BAM_CNFG_BITS] = { 0x0007C, 0x00, 0x00, 0x00 }, + [BAM_IRQ_SRCS_EE] = { 0x03000, 0x00, 0x00, 0x1000 }, + [BAM_IRQ_SRCS_MSK_EE] = { 0x03004, 0x00, 0x00, 0x1000 }, + [BAM_P_CTRL] = { 0x13000, 0x1000, 0x00, 0x00 }, + [BAM_P_RST] = { 0x13004, 0x1000, 0x00, 0x00 }, + [BAM_P_HALT] = { 0x13008, 0x1000, 0x00, 0x00 }, + [BAM_P_IRQ_STTS] = { 0x13010, 0x1000, 0x00, 0x00 }, + [BAM_P_IRQ_CLR] = { 0x13014, 0x1000, 0x00, 0x00 }, + [BAM_P_IRQ_EN] = { 0x13018, 0x1000, 0x00, 0x00 }, + [BAM_P_EVNT_DEST_ADDR] = { 0x1382C, 0x00, 0x1000, 0x00 }, + [BAM_P_EVNT_REG] = { 0x13818, 0x00, 0x1000, 0x00 }, + [BAM_P_SW_OFSTS] = { 0x13800, 0x00, 0x1000, 0x00 }, + [BAM_P_DATA_FIFO_ADDR] = { 0x13824, 0x00, 0x1000, 0x00 }, + [BAM_P_DESC_FIFO_ADDR] = { 0x1381C, 0x00, 0x1000, 0x00 }, + [BAM_P_EVNT_GEN_TRSHLD] = { 0x13828, 0x00, 0x1000, 0x00 }, + [BAM_P_FIFO_SIZES] = { 0x13820, 0x00, 0x1000, 0x00 }, +}; + /* BAM CTRL */ #define BAM_SW_RST BIT(0) #define BAM_EN BIT(1) @@ -1051,6 +1080,7 @@ static void bam_channel_init(struct bam_device *bdev, struct bam_chan *bchan, static const struct of_device_id 
bam_of_match[] = { { .compatible = "qcom,bam-v1.3.0", .data = &bam_v1_3_reg_info }, { .compatible = "qcom,bam-v1.4.0", .data = &bam_v1_4_reg_info }, + { .compatible = "qcom,bam-v1.7.0", .data = &bam_v1_7_reg_info }, {} }; @@ -1113,7 +1143,7 @@ static int bam_dma_probe(struct platform_device *pdev) if (!bdev->channels) { ret = -ENOMEM; - goto err_disable_clk; + goto err_tasklet_kill; } /* allocate and initialize channels */ @@ -1125,7 +1155,7 @@ static int bam_dma_probe(struct platform_device *pdev) ret = devm_request_irq(bdev->dev, bdev->irq, bam_dma_irq, IRQF_TRIGGER_HIGH, "bam_dma", bdev); if (ret) - goto err_disable_clk; + goto err_bam_channel_exit; /* set max dma segment size */ bdev->common.dev = bdev->dev; @@ -1133,7 +1163,7 @@ static int bam_dma_probe(struct platform_device *pdev) ret = dma_set_max_seg_size(bdev->common.dev, BAM_MAX_DATA_SIZE); if (ret) { dev_err(bdev->dev, "cannot set maximum segment size\n"); - goto err_disable_clk; + goto err_bam_channel_exit; } platform_set_drvdata(pdev, bdev); @@ -1161,7 +1191,7 @@ static int bam_dma_probe(struct platform_device *pdev) ret = dma_async_device_register(&bdev->common); if (ret) { dev_err(bdev->dev, "failed to register dma async device\n"); - goto err_disable_clk; + goto err_bam_channel_exit; } ret = of_dma_controller_register(pdev->dev.of_node, bam_dma_xlate, @@ -1173,8 +1203,14 @@ static int bam_dma_probe(struct platform_device *pdev) err_unregister_dma: dma_async_device_unregister(&bdev->common); +err_bam_channel_exit: + for (i = 0; i < bdev->num_channels; i++) + tasklet_kill(&bdev->channels[i].vc.task); +err_tasklet_kill: + tasklet_kill(&bdev->task); err_disable_clk: clk_disable_unprepare(bdev->bamclk); + return ret; } diff --git a/drivers/dma/s3c24xx-dma.c b/drivers/dma/s3c24xx-dma.c index 2f91da3db836..01dcaf21b988 100644 --- a/drivers/dma/s3c24xx-dma.c +++ b/drivers/dma/s3c24xx-dma.c @@ -749,11 +749,6 @@ unlock: return ret; } -static int s3c24xx_dma_alloc_chan_resources(struct dma_chan *chan) -{ - return 0; -} - static void s3c24xx_dma_free_chan_resources(struct dma_chan *chan) { /* Ensure all queued descriptors are freed */ @@ -1238,7 +1233,7 @@ static int s3c24xx_dma_probe(struct platform_device *pdev) if (!s3cdma->phy_chans) return -ENOMEM; - /* aquire irqs and clocks for all physical channels */ + /* acquire irqs and clocks for all physical channels */ for (i = 0; i < pdata->num_phy_channels; i++) { struct s3c24xx_dma_phy *phy = &s3cdma->phy_chans[i]; char clk_name[6]; @@ -1266,7 +1261,7 @@ static int s3c24xx_dma_probe(struct platform_device *pdev) sprintf(clk_name, "dma.%d", i); phy->clk = devm_clk_get(&pdev->dev, clk_name); if (IS_ERR(phy->clk) && sdata->has_clocks) { - dev_err(&pdev->dev, "unable to aquire clock for channel %d, error %lu", + dev_err(&pdev->dev, "unable to acquire clock for channel %d, error %lu\n", i, PTR_ERR(phy->clk)); continue; } @@ -1290,8 +1285,6 @@ static int s3c24xx_dma_probe(struct platform_device *pdev) dma_cap_set(DMA_MEMCPY, s3cdma->memcpy.cap_mask); dma_cap_set(DMA_PRIVATE, s3cdma->memcpy.cap_mask); s3cdma->memcpy.dev = &pdev->dev; - s3cdma->memcpy.device_alloc_chan_resources = - s3c24xx_dma_alloc_chan_resources; s3cdma->memcpy.device_free_chan_resources = s3c24xx_dma_free_chan_resources; s3cdma->memcpy.device_prep_dma_memcpy = s3c24xx_dma_prep_memcpy; @@ -1305,8 +1298,6 @@ static int s3c24xx_dma_probe(struct platform_device *pdev) dma_cap_set(DMA_CYCLIC, s3cdma->slave.cap_mask); dma_cap_set(DMA_PRIVATE, s3cdma->slave.cap_mask); s3cdma->slave.dev = &pdev->dev; - 
s3cdma->slave.device_alloc_chan_resources = - s3c24xx_dma_alloc_chan_resources; s3cdma->slave.device_free_chan_resources = s3c24xx_dma_free_chan_resources; s3cdma->slave.device_tx_status = s3c24xx_dma_tx_status; diff --git a/drivers/dma/sa11x0-dma.c b/drivers/dma/sa11x0-dma.c index 5adf5407a8cb..43db255050d2 100644 --- a/drivers/dma/sa11x0-dma.c +++ b/drivers/dma/sa11x0-dma.c @@ -389,11 +389,6 @@ static void sa11x0_dma_tasklet(unsigned long arg) } -static int sa11x0_dma_alloc_chan_resources(struct dma_chan *chan) -{ - return 0; -} - static void sa11x0_dma_free_chan_resources(struct dma_chan *chan) { struct sa11x0_dma_chan *c = to_sa11x0_dma_chan(chan); @@ -835,7 +830,6 @@ static int sa11x0_dma_init_dmadev(struct dma_device *dmadev, INIT_LIST_HEAD(&dmadev->channels); dmadev->dev = dev; - dmadev->device_alloc_chan_resources = sa11x0_dma_alloc_chan_resources; dmadev->device_free_chan_resources = sa11x0_dma_free_chan_resources; dmadev->device_config = sa11x0_dma_device_config; dmadev->device_pause = sa11x0_dma_device_pause; @@ -948,6 +942,12 @@ static int sa11x0_dma_probe(struct platform_device *pdev) dma_cap_set(DMA_CYCLIC, d->slave.cap_mask); d->slave.device_prep_slave_sg = sa11x0_dma_prep_slave_sg; d->slave.device_prep_dma_cyclic = sa11x0_dma_prep_dma_cyclic; + d->slave.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + d->slave.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST; + d->slave.src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | + BIT(DMA_SLAVE_BUSWIDTH_2_BYTES); + d->slave.dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | + BIT(DMA_SLAVE_BUSWIDTH_2_BYTES); ret = sa11x0_dma_init_dmadev(&d->slave, &pdev->dev); if (ret) { dev_warn(d->slave.dev, "failed to register slave async device: %d\n", diff --git a/drivers/dma/sh/Kconfig b/drivers/dma/sh/Kconfig index 8190ad225a1b..0f371524a4d9 100644 --- a/drivers/dma/sh/Kconfig +++ b/drivers/dma/sh/Kconfig @@ -51,12 +51,6 @@ config RCAR_HPB_DMAE help Enable support for the Renesas R-Car series DMA controllers. -config RCAR_AUDMAC_PP - tristate "Renesas R-Car Audio DMAC Peripheral Peripheral support" - depends on SH_DMAE_BASE - help - Enable support for the Renesas R-Car Audio DMAC Peripheral Peripheral controllers. - config RCAR_DMAC tristate "Renesas R-Car Gen2 DMA Controller" depends on ARCH_SHMOBILE || COMPILE_TEST @@ -64,3 +58,12 @@ config RCAR_DMAC help This driver supports the general purpose DMA controller found in the Renesas R-Car second generation SoCs. + +config RENESAS_USB_DMAC + tristate "Renesas USB-DMA Controller" + depends on ARCH_SHMOBILE || COMPILE_TEST + select RENESAS_DMA + select DMA_VIRTUAL_CHANNELS + help + This driver supports the USB-DMA controller found in the Renesas + SoCs. diff --git a/drivers/dma/sh/Makefile b/drivers/dma/sh/Makefile index 2852f9db61a4..b8a598066ce2 100644 --- a/drivers/dma/sh/Makefile +++ b/drivers/dma/sh/Makefile @@ -15,5 +15,5 @@ obj-$(CONFIG_SH_DMAE) += shdma.o obj-$(CONFIG_SUDMAC) += sudmac.o obj-$(CONFIG_RCAR_HPB_DMAE) += rcar-hpbdma.o -obj-$(CONFIG_RCAR_AUDMAC_PP) += rcar-audmapp.o obj-$(CONFIG_RCAR_DMAC) += rcar-dmac.o +obj-$(CONFIG_RENESAS_USB_DMAC) += usb-dmac.o diff --git a/drivers/dma/sh/rcar-audmapp.c b/drivers/dma/sh/rcar-audmapp.c deleted file mode 100644 index d95bbdd721f4..000000000000 --- a/drivers/dma/sh/rcar-audmapp.c +++ /dev/null @@ -1,376 +0,0 @@ -/* - * This is for Renesas R-Car Audio-DMAC-peri-peri. 
- * - * Copyright (C) 2014 Renesas Electronics Corporation - * Copyright (C) 2014 Kuninori Morimoto <kuninori.morimoto.gx@renesas.com> - * - * based on the drivers/dma/sh/shdma.c - * - * Copyright (C) 2011-2012 Guennadi Liakhovetski <g.liakhovetski@gmx.de> - * Copyright (C) 2009 Nobuhiro Iwamatsu <iwamatsu.nobuhiro@renesas.com> - * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved. - * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved. - * - * This is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/dmaengine.h> -#include <linux/of_dma.h> -#include <linux/platform_data/dma-rcar-audmapp.h> -#include <linux/platform_device.h> -#include <linux/shdma-base.h> - -/* - * DMA register - */ -#define PDMASAR 0x00 -#define PDMADAR 0x04 -#define PDMACHCR 0x0c - -/* PDMACHCR */ -#define PDMACHCR_DE (1 << 0) - -#define AUDMAPP_MAX_CHANNELS 29 - -/* Default MEMCPY transfer size = 2^2 = 4 bytes */ -#define LOG2_DEFAULT_XFER_SIZE 2 -#define AUDMAPP_SLAVE_NUMBER 256 -#define AUDMAPP_LEN_MAX (16 * 1024 * 1024) - -struct audmapp_chan { - struct shdma_chan shdma_chan; - void __iomem *base; - dma_addr_t slave_addr; - u32 chcr; -}; - -struct audmapp_device { - struct shdma_dev shdma_dev; - struct audmapp_pdata *pdata; - struct device *dev; - void __iomem *chan_reg; -}; - -struct audmapp_desc { - struct shdma_desc shdma_desc; - dma_addr_t src; - dma_addr_t dst; -}; - -#define to_shdma_chan(c) container_of(c, struct shdma_chan, dma_chan) - -#define to_chan(chan) container_of(chan, struct audmapp_chan, shdma_chan) -#define to_desc(sdesc) container_of(sdesc, struct audmapp_desc, shdma_desc) -#define to_dev(chan) container_of(chan->shdma_chan.dma_chan.device, \ - struct audmapp_device, shdma_dev.dma_dev) - -static void audmapp_write(struct audmapp_chan *auchan, u32 data, u32 reg) -{ - struct audmapp_device *audev = to_dev(auchan); - struct device *dev = audev->dev; - - dev_dbg(dev, "w %p : %08x\n", auchan->base + reg, data); - - iowrite32(data, auchan->base + reg); -} - -static u32 audmapp_read(struct audmapp_chan *auchan, u32 reg) -{ - return ioread32(auchan->base + reg); -} - -static void audmapp_halt(struct shdma_chan *schan) -{ - struct audmapp_chan *auchan = to_chan(schan); - int i; - - audmapp_write(auchan, 0, PDMACHCR); - - for (i = 0; i < 1024; i++) { - if (0 == audmapp_read(auchan, PDMACHCR)) - return; - udelay(1); - } -} - -static void audmapp_start_xfer(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - struct audmapp_chan *auchan = to_chan(schan); - struct audmapp_device *audev = to_dev(auchan); - struct audmapp_desc *desc = to_desc(sdesc); - struct device *dev = audev->dev; - u32 chcr = auchan->chcr | PDMACHCR_DE; - - dev_dbg(dev, "src/dst/chcr = %pad/%pad/%08x\n", - &desc->src, &desc->dst, chcr); - - audmapp_write(auchan, desc->src, PDMASAR); - audmapp_write(auchan, desc->dst, PDMADAR); - audmapp_write(auchan, chcr, PDMACHCR); -} - -static int audmapp_get_config(struct audmapp_chan *auchan, int slave_id, - u32 *chcr, dma_addr_t *dst) -{ - struct audmapp_device *audev = to_dev(auchan); - struct audmapp_pdata *pdata = audev->pdata; - struct audmapp_slave_config *cfg; - int i; - - *chcr = 0; - *dst = 0; - - if (!pdata) { /* DT */ - *chcr = ((u32)slave_id) 
<< 16; - auchan->shdma_chan.slave_id = (slave_id) >> 8; - return 0; - } - - /* non-DT */ - - if (slave_id >= AUDMAPP_SLAVE_NUMBER) - return -ENXIO; - - for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++) - if (cfg->slave_id == slave_id) { - *chcr = cfg->chcr; - *dst = cfg->dst; - return 0; - } - - return -ENXIO; -} - -static int audmapp_set_slave(struct shdma_chan *schan, int slave_id, - dma_addr_t slave_addr, bool try) -{ - struct audmapp_chan *auchan = to_chan(schan); - u32 chcr; - dma_addr_t dst; - int ret; - - ret = audmapp_get_config(auchan, slave_id, &chcr, &dst); - if (ret < 0) - return ret; - - if (try) - return 0; - - auchan->chcr = chcr; - auchan->slave_addr = slave_addr ? : dst; - - return 0; -} - -static int audmapp_desc_setup(struct shdma_chan *schan, - struct shdma_desc *sdesc, - dma_addr_t src, dma_addr_t dst, size_t *len) -{ - struct audmapp_desc *desc = to_desc(sdesc); - - if (*len > (size_t)AUDMAPP_LEN_MAX) - *len = (size_t)AUDMAPP_LEN_MAX; - - desc->src = src; - desc->dst = dst; - - return 0; -} - -static void audmapp_setup_xfer(struct shdma_chan *schan, - int slave_id) -{ -} - -static dma_addr_t audmapp_slave_addr(struct shdma_chan *schan) -{ - struct audmapp_chan *auchan = to_chan(schan); - - return auchan->slave_addr; -} - -static bool audmapp_channel_busy(struct shdma_chan *schan) -{ - struct audmapp_chan *auchan = to_chan(schan); - u32 chcr = audmapp_read(auchan, PDMACHCR); - - return chcr & ~PDMACHCR_DE; -} - -static bool audmapp_desc_completed(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - return true; -} - -static struct shdma_desc *audmapp_embedded_desc(void *buf, int i) -{ - return &((struct audmapp_desc *)buf)[i].shdma_desc; -} - -static const struct shdma_ops audmapp_shdma_ops = { - .halt_channel = audmapp_halt, - .desc_setup = audmapp_desc_setup, - .set_slave = audmapp_set_slave, - .start_xfer = audmapp_start_xfer, - .embedded_desc = audmapp_embedded_desc, - .setup_xfer = audmapp_setup_xfer, - .slave_addr = audmapp_slave_addr, - .channel_busy = audmapp_channel_busy, - .desc_completed = audmapp_desc_completed, -}; - -static int audmapp_chan_probe(struct platform_device *pdev, - struct audmapp_device *audev, int id) -{ - struct shdma_dev *sdev = &audev->shdma_dev; - struct audmapp_chan *auchan; - struct shdma_chan *schan; - struct device *dev = audev->dev; - - auchan = devm_kzalloc(dev, sizeof(*auchan), GFP_KERNEL); - if (!auchan) - return -ENOMEM; - - schan = &auchan->shdma_chan; - schan->max_xfer_len = AUDMAPP_LEN_MAX; - - shdma_chan_probe(sdev, schan, id); - - auchan->base = audev->chan_reg + 0x20 + (0x10 * id); - dev_dbg(dev, "%02d : %p / %p", id, auchan->base, audev->chan_reg); - - return 0; -} - -static void audmapp_chan_remove(struct audmapp_device *audev) -{ - struct shdma_chan *schan; - int i; - - shdma_for_each_chan(schan, &audev->shdma_dev, i) { - BUG_ON(!schan); - shdma_chan_remove(schan); - } -} - -static struct dma_chan *audmapp_of_xlate(struct of_phandle_args *dma_spec, - struct of_dma *ofdma) -{ - dma_cap_mask_t mask; - struct dma_chan *chan; - u32 chcr = dma_spec->args[0]; - - if (dma_spec->args_count != 1) - return NULL; - - dma_cap_zero(mask); - dma_cap_set(DMA_SLAVE, mask); - - chan = dma_request_channel(mask, shdma_chan_filter, NULL); - if (chan) - to_shdma_chan(chan)->hw_req = chcr; - - return chan; -} - -static int audmapp_probe(struct platform_device *pdev) -{ - struct audmapp_pdata *pdata = pdev->dev.platform_data; - struct device_node *np = pdev->dev.of_node; - struct audmapp_device *audev; - struct shdma_dev 
*sdev; - struct dma_device *dma_dev; - struct resource *res; - int err, i; - - if (np) - of_dma_controller_register(np, audmapp_of_xlate, pdev); - else if (!pdata) - return -ENODEV; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - - audev = devm_kzalloc(&pdev->dev, sizeof(*audev), GFP_KERNEL); - if (!audev) - return -ENOMEM; - - audev->dev = &pdev->dev; - audev->pdata = pdata; - audev->chan_reg = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(audev->chan_reg)) - return PTR_ERR(audev->chan_reg); - - sdev = &audev->shdma_dev; - sdev->ops = &audmapp_shdma_ops; - sdev->desc_size = sizeof(struct audmapp_desc); - - dma_dev = &sdev->dma_dev; - dma_dev->copy_align = LOG2_DEFAULT_XFER_SIZE; - dma_cap_set(DMA_SLAVE, dma_dev->cap_mask); - - err = shdma_init(&pdev->dev, sdev, AUDMAPP_MAX_CHANNELS); - if (err < 0) - return err; - - platform_set_drvdata(pdev, audev); - - /* Create DMA Channel */ - for (i = 0; i < AUDMAPP_MAX_CHANNELS; i++) { - err = audmapp_chan_probe(pdev, audev, i); - if (err) - goto chan_probe_err; - } - - err = dma_async_device_register(dma_dev); - if (err < 0) - goto chan_probe_err; - - return err; - -chan_probe_err: - audmapp_chan_remove(audev); - shdma_cleanup(sdev); - - return err; -} - -static int audmapp_remove(struct platform_device *pdev) -{ - struct audmapp_device *audev = platform_get_drvdata(pdev); - struct dma_device *dma_dev = &audev->shdma_dev.dma_dev; - - dma_async_device_unregister(dma_dev); - - audmapp_chan_remove(audev); - shdma_cleanup(&audev->shdma_dev); - - return 0; -} - -static const struct of_device_id audmapp_of_match[] = { - { .compatible = "renesas,rcar-audmapp", }, - {}, -}; - -static struct platform_driver audmapp_driver = { - .probe = audmapp_probe, - .remove = audmapp_remove, - .driver = { - .name = "rcar-audmapp-engine", - .of_match_table = audmapp_of_match, - }, -}; -module_platform_driver(audmapp_driver); - -MODULE_AUTHOR("Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>"); -MODULE_DESCRIPTION("Renesas R-Car Audio DMAC peri-peri driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c index 8ee383d339a5..10fcabad80f3 100644 --- a/drivers/dma/sh/shdma-base.c +++ b/drivers/dma/sh/shdma-base.c @@ -171,8 +171,7 @@ static struct shdma_desc *shdma_get_desc(struct shdma_chan *schan) return NULL; } -static int shdma_setup_slave(struct shdma_chan *schan, int slave_id, - dma_addr_t slave_addr) +static int shdma_setup_slave(struct shdma_chan *schan, dma_addr_t slave_addr) { struct shdma_dev *sdev = to_shdma_dev(schan->dma_chan.device); const struct shdma_ops *ops = sdev->ops; @@ -183,25 +182,23 @@ static int shdma_setup_slave(struct shdma_chan *schan, int slave_id, ret = ops->set_slave(schan, match, slave_addr, true); if (ret < 0) return ret; - - slave_id = schan->slave_id; } else { - match = slave_id; + match = schan->real_slave_id; } - if (slave_id < 0 || slave_id >= slave_num) + if (schan->real_slave_id < 0 || schan->real_slave_id >= slave_num) return -EINVAL; - if (test_and_set_bit(slave_id, shdma_slave_used)) + if (test_and_set_bit(schan->real_slave_id, shdma_slave_used)) return -EBUSY; ret = ops->set_slave(schan, match, slave_addr, false); if (ret < 0) { - clear_bit(slave_id, shdma_slave_used); + clear_bit(schan->real_slave_id, shdma_slave_used); return ret; } - schan->slave_id = slave_id; + schan->slave_id = schan->real_slave_id; return 0; } @@ -221,10 +218,12 @@ static int shdma_alloc_chan_resources(struct dma_chan *chan) */ if (slave) { /* Legacy mode: .private is set in filter */ - 
ret = shdma_setup_slave(schan, slave->slave_id, 0); + schan->real_slave_id = slave->slave_id; + ret = shdma_setup_slave(schan, 0); if (ret < 0) goto esetslave; } else { + /* Normal mode: real_slave_id was set by filter */ schan->slave_id = -EINVAL; } @@ -258,11 +257,14 @@ esetslave: /* * This is the standard shdma filter function to be used as a replacement to the - * "old" method, using the .private pointer. If for some reason you allocate a - * channel without slave data, use something like ERR_PTR(-EINVAL) as a filter + * "old" method, using the .private pointer. + * You always have to pass a valid slave id as the argument, old drivers that + * pass ERR_PTR(-EINVAL) as a filter parameter and set it up in dma_slave_config + * need to be updated so we can remove the slave_id field from dma_slave_config. * parameter. If this filter is used, the slave driver, after calling * dma_request_channel(), will also have to call dmaengine_slave_config() with - * .slave_id, .direction, and either .src_addr or .dst_addr set. + * .direction, and either .src_addr or .dst_addr set. + * * NOTE: this filter doesn't support multiple DMAC drivers with the DMA_SLAVE * capability! If this becomes a requirement, hardware glue drivers, using this * services would have to provide their own filters, which first would check @@ -276,7 +278,7 @@ bool shdma_chan_filter(struct dma_chan *chan, void *arg) { struct shdma_chan *schan; struct shdma_dev *sdev; - int match = (long)arg; + int slave_id = (long)arg; int ret; /* Only support channels handled by this driver. */ @@ -284,19 +286,39 @@ bool shdma_chan_filter(struct dma_chan *chan, void *arg) shdma_alloc_chan_resources) return false; - if (match < 0) + schan = to_shdma_chan(chan); + sdev = to_shdma_dev(chan->device); + + /* + * For DT, the schan->slave_id field is generated by the + * set_slave function from the slave ID that is passed in + * from xlate. For the non-DT case, the slave ID is + * directly passed into the filter function by the driver + */ + if (schan->dev->of_node) { + ret = sdev->ops->set_slave(schan, slave_id, 0, true); + if (ret < 0) + return false; + + schan->real_slave_id = schan->slave_id; + return true; + } + + if (slave_id < 0) { /* No slave requested - arbitrary channel */ + dev_warn(sdev->dma_dev.dev, "invalid slave ID passed to dma_request_slave\n"); return true; + } - schan = to_shdma_chan(chan); - if (!schan->dev->of_node && match >= slave_num) + if (slave_id >= slave_num) return false; - sdev = to_shdma_dev(schan->dma_chan.device); - ret = sdev->ops->set_slave(schan, match, 0, true); + ret = sdev->ops->set_slave(schan, slave_id, 0, true); if (ret < 0) return false; + schan->real_slave_id = slave_id; + return true; } EXPORT_SYMBOL(shdma_chan_filter); @@ -452,6 +474,8 @@ static void shdma_free_chan_resources(struct dma_chan *chan) chan->private = NULL; } + schan->real_slave_id = 0; + spin_lock_irq(&schan->chan_lock); list_splice_init(&schan->ld_free, &list); @@ -764,11 +788,20 @@ static int shdma_config(struct dma_chan *chan, */ if (!config) return -EINVAL; + + /* + * overriding the slave_id through dma_slave_config is deprecated, + * but possibly some out-of-tree drivers still do it. + */ + if (WARN_ON_ONCE(config->slave_id && + config->slave_id != schan->real_slave_id)) + schan->real_slave_id = config->slave_id; + /* * We could lock this, but you shouldn't be configuring the * channel, while using it... */ - return shdma_setup_slave(schan, config->slave_id, + return shdma_setup_slave(schan, config->direction == DMA_DEV_TO_MEM ? 
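+ /* for DMA_DEV_TO_MEM the device is the data source, so the slave
+ * address comes from src_addr; otherwise it comes from dst_addr */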
config->src_addr : config->dst_addr); } diff --git a/drivers/dma/sh/shdmac.c b/drivers/dma/sh/shdmac.c index 9f1d4c7dbab8..11707df1a689 100644 --- a/drivers/dma/sh/shdmac.c +++ b/drivers/dma/sh/shdmac.c @@ -443,7 +443,7 @@ static bool sh_dmae_reset(struct sh_dmae_device *shdev) return ret; } -#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARM) +#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE) static irqreturn_t sh_dmae_err(int irq, void *data) { struct sh_dmae_device *shdev = data; @@ -689,7 +689,7 @@ static int sh_dmae_probe(struct platform_device *pdev) const struct sh_dmae_pdata *pdata; unsigned long chan_flag[SH_DMAE_MAX_CHANNELS] = {}; int chan_irq[SH_DMAE_MAX_CHANNELS]; -#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARM) +#if defined(CONFIG_CPU_SH4) || defined(CONFIG_ARCH_SHMOBILE) unsigned long irqflags = 0; int errirq; #endif diff --git a/drivers/dma/sh/usb-dmac.c b/drivers/dma/sh/usb-dmac.c new file mode 100644 index 000000000000..f705798ce3eb --- /dev/null +++ b/drivers/dma/sh/usb-dmac.c @@ -0,0 +1,910 @@ +/* + * Renesas USB DMA Controller Driver + * + * Copyright (C) 2015 Renesas Electronics Corporation + * + * based on rcar-dmac.c + * Copyright (C) 2014 Renesas Electronics Inc. + * Author: Laurent Pinchart <laurent.pinchart@ideasonboard.com> + * + * This is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + */ + +#include <linux/delay.h> +#include <linux/dma-mapping.h> +#include <linux/dmaengine.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_dma.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +#include "../dmaengine.h" +#include "../virt-dma.h" + +/* + * struct usb_dmac_sg - Descriptor for a hardware transfer + * @mem_addr: memory address + * @size: transfer size in bytes + */ +struct usb_dmac_sg { + dma_addr_t mem_addr; + u32 size; +}; + +/* + * struct usb_dmac_desc - USB DMA Transfer Descriptor + * @vd: base virtual channel DMA transaction descriptor + * @direction: direction of the DMA transfer + * @sg_allocated_len: length of allocated sg + * @sg_len: length of sg + * @sg_index: index of sg + * @residue: residue after the DMAC completed a transfer + * @node: node for desc_got and desc_freed + * @done_cookie: cookie after the DMAC completed a transfer + * @sg: information for the transfer + */ +struct usb_dmac_desc { + struct virt_dma_desc vd; + enum dma_transfer_direction direction; + unsigned int sg_allocated_len; + unsigned int sg_len; + unsigned int sg_index; + u32 residue; + struct list_head node; + dma_cookie_t done_cookie; + struct usb_dmac_sg sg[0]; +}; + +#define to_usb_dmac_desc(vd) container_of(vd, struct usb_dmac_desc, vd) + +/* + * struct usb_dmac_chan - USB DMA Controller Channel + * @vc: base virtual DMA channel object + * @iomem: channel I/O memory base + * @index: index of this channel in the controller + * @irq: irq number of this channel + * @desc: the current descriptor + * @descs_allocated: number of descriptors allocated + * @desc_got: got descriptors + * @desc_freed: freed descriptors after the DMAC completed a transfer + */ +struct usb_dmac_chan { + struct virt_dma_chan vc; + void __iomem *iomem; + unsigned int index; + int irq; + struct usb_dmac_desc *desc; + int descs_allocated; + struct list_head desc_got; + struct list_head 
desc_freed; +}; + +#define to_usb_dmac_chan(c) container_of(c, struct usb_dmac_chan, vc.chan) + +/* + * struct usb_dmac - USB DMA Controller + * @engine: base DMA engine object + * @dev: the hardware device + * @iomem: remapped I/O memory base + * @n_channels: number of available channels + * @channels: array of DMAC channels + */ +struct usb_dmac { + struct dma_device engine; + struct device *dev; + void __iomem *iomem; + + unsigned int n_channels; + struct usb_dmac_chan *channels; +}; + +#define to_usb_dmac(d) container_of(d, struct usb_dmac, engine) + +/* ----------------------------------------------------------------------------- + * Registers + */ + +#define USB_DMAC_CHAN_OFFSET(i) (0x20 + 0x20 * (i)) + +#define USB_DMASWR 0x0008 +#define USB_DMASWR_SWR (1 << 0) +#define USB_DMAOR 0x0060 +#define USB_DMAOR_AE (1 << 2) +#define USB_DMAOR_DME (1 << 0) + +#define USB_DMASAR 0x0000 +#define USB_DMADAR 0x0004 +#define USB_DMATCR 0x0008 +#define USB_DMATCR_MASK 0x00ffffff +#define USB_DMACHCR 0x0014 +#define USB_DMACHCR_FTE (1 << 24) +#define USB_DMACHCR_NULLE (1 << 16) +#define USB_DMACHCR_NULL (1 << 12) +#define USB_DMACHCR_TS_8B ((0 << 7) | (0 << 6)) +#define USB_DMACHCR_TS_16B ((0 << 7) | (1 << 6)) +#define USB_DMACHCR_TS_32B ((1 << 7) | (0 << 6)) +#define USB_DMACHCR_IE (1 << 5) +#define USB_DMACHCR_SP (1 << 2) +#define USB_DMACHCR_TE (1 << 1) +#define USB_DMACHCR_DE (1 << 0) +#define USB_DMATEND 0x0018 + +/* Hardcode the xfer_shift to 5 (32bytes) */ +#define USB_DMAC_XFER_SHIFT 5 +#define USB_DMAC_XFER_SIZE (1 << USB_DMAC_XFER_SHIFT) +#define USB_DMAC_CHCR_TS USB_DMACHCR_TS_32B +#define USB_DMAC_SLAVE_BUSWIDTH DMA_SLAVE_BUSWIDTH_32_BYTES + +/* for descriptors */ +#define USB_DMAC_INITIAL_NR_DESC 16 +#define USB_DMAC_INITIAL_NR_SG 8 + +/* ----------------------------------------------------------------------------- + * Device access + */ + +static void usb_dmac_write(struct usb_dmac *dmac, u32 reg, u32 data) +{ + writel(data, dmac->iomem + reg); +} + +static u32 usb_dmac_read(struct usb_dmac *dmac, u32 reg) +{ + return readl(dmac->iomem + reg); +} + +static u32 usb_dmac_chan_read(struct usb_dmac_chan *chan, u32 reg) +{ + return readl(chan->iomem + reg); +} + +static void usb_dmac_chan_write(struct usb_dmac_chan *chan, u32 reg, u32 data) +{ + writel(data, chan->iomem + reg); +} + +/* ----------------------------------------------------------------------------- + * Initialization and configuration + */ + +static bool usb_dmac_chan_is_busy(struct usb_dmac_chan *chan) +{ + u32 chcr = usb_dmac_chan_read(chan, USB_DMACHCR); + + return (chcr & (USB_DMACHCR_DE | USB_DMACHCR_TE)) == USB_DMACHCR_DE; +} + +static u32 usb_dmac_calc_tend(u32 size) +{ + /* + * Please refer to the Figure "Example of Final Transaction Valid + * Data Transfer Enable (EDTEN) Setting" in the data sheet. + */ + return 0xffffffff << (32 - (size % USB_DMAC_XFER_SIZE ? 
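+ /* e.g. a 20-byte tail gives 0xffffffff << (32 - 20) = 0xfffff000,
+ * flagging the 20 valid trailing bytes; a full 32-byte unit
+ * (size % 32 == 0) keeps all 32 bits set */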
: 
+ USB_DMAC_XFER_SIZE));
+}
+
+/* This function is called with vc.lock held */
+static void usb_dmac_chan_start_sg(struct usb_dmac_chan *chan,
+ unsigned int index)
+{
+ struct usb_dmac_desc *desc = chan->desc;
+ struct usb_dmac_sg *sg = desc->sg + index;
+ dma_addr_t src_addr = 0, dst_addr = 0;
+
+ WARN_ON_ONCE(usb_dmac_chan_is_busy(chan));
+
+ if (desc->direction == DMA_DEV_TO_MEM)
+ dst_addr = sg->mem_addr;
+ else
+ src_addr = sg->mem_addr;
+
+ dev_dbg(chan->vc.chan.device->dev,
+ "chan%u: queue sg %p: %u@%pad -> %pad\n",
+ chan->index, sg, sg->size, &src_addr, &dst_addr);
+
+ usb_dmac_chan_write(chan, USB_DMASAR, src_addr & 0xffffffff);
+ usb_dmac_chan_write(chan, USB_DMADAR, dst_addr & 0xffffffff);
+ usb_dmac_chan_write(chan, USB_DMATCR,
+ DIV_ROUND_UP(sg->size, USB_DMAC_XFER_SIZE));
+ usb_dmac_chan_write(chan, USB_DMATEND, usb_dmac_calc_tend(sg->size));
+
+ usb_dmac_chan_write(chan, USB_DMACHCR, USB_DMAC_CHCR_TS |
+ USB_DMACHCR_NULLE | USB_DMACHCR_IE | USB_DMACHCR_DE);
+}
+
+/* This function is called with vc.lock held */
+static void usb_dmac_chan_start_desc(struct usb_dmac_chan *chan)
+{
+ struct virt_dma_desc *vd;
+
+ vd = vchan_next_desc(&chan->vc);
+ if (!vd) {
+ chan->desc = NULL;
+ return;
+ }
+
+ /*
+ * Remove this request from vc->desc_issued. Otherwise, this driver
+ * will get the previous value from vchan_next_desc() after a transfer
+ * was completed.
+ */
+ list_del(&vd->node);
+
+ chan->desc = to_usb_dmac_desc(vd);
+ chan->desc->sg_index = 0;
+ usb_dmac_chan_start_sg(chan, 0);
+}
+
+static int usb_dmac_init(struct usb_dmac *dmac)
+{
+ u16 dmaor;
+
+ /* Clear all channels and enable the DMAC globally. */
+ usb_dmac_write(dmac, USB_DMAOR, USB_DMAOR_DME);
+
+ dmaor = usb_dmac_read(dmac, USB_DMAOR);
+ if ((dmaor & (USB_DMAOR_AE | USB_DMAOR_DME)) != USB_DMAOR_DME) {
+ dev_warn(dmac->dev, "DMAOR initialization failed.\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * Descriptors allocation and free
+ */
+static int usb_dmac_desc_alloc(struct usb_dmac_chan *chan, unsigned int sg_len,
+ gfp_t gfp)
+{
+ struct usb_dmac_desc *desc;
+ unsigned long flags;
+
+ desc = kzalloc(sizeof(*desc) + sg_len * sizeof(desc->sg[0]), gfp);
+ if (!desc)
+ return -ENOMEM;
+
+ desc->sg_allocated_len = sg_len;
+ INIT_LIST_HEAD(&desc->node);
+
+ spin_lock_irqsave(&chan->vc.lock, flags);
+ list_add_tail(&desc->node, &chan->desc_freed);
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ return 0;
+}
+
+static void usb_dmac_desc_free(struct usb_dmac_chan *chan)
+{
+ struct usb_dmac_desc *desc, *_desc;
+ LIST_HEAD(list);
+
+ list_splice_init(&chan->desc_freed, &list);
+ list_splice_init(&chan->desc_got, &list);
+
+ list_for_each_entry_safe(desc, _desc, &list, node) {
+ list_del(&desc->node);
+ kfree(desc);
+ }
+ chan->descs_allocated = 0;
+}
+
+static struct usb_dmac_desc *usb_dmac_desc_get(struct usb_dmac_chan *chan,
+ unsigned int sg_len, gfp_t gfp)
+{
+ struct usb_dmac_desc *desc = NULL;
+ unsigned long flags;
+
+ /* Get a freed descriptor */
+ spin_lock_irqsave(&chan->vc.lock, flags);
+ list_for_each_entry(desc, &chan->desc_freed, node) {
+ if (sg_len <= desc->sg_allocated_len) {
+ list_move_tail(&desc->node, &chan->desc_got);
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+ return desc;
+ }
+ }
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ /* Allocate a new descriptor */
+ if (!usb_dmac_desc_alloc(chan, sg_len, gfp)) {
+ /* If allocated the desc, it was added to tail of the list */
+ 
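/*
+ * The freed list acts as a size-indexed cache: a descriptor with enough
+ * preallocated sg entries is reused, and only a miss falls through to a
+ * fresh allocation, which usb_dmac_desc_alloc() appends to the tail of
+ * desc_freed (what list_last_entry() below picks up).
+ */
+ 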
spin_lock_irqsave(&chan->vc.lock, flags); + desc = list_last_entry(&chan->desc_freed, struct usb_dmac_desc, + node); + list_move_tail(&desc->node, &chan->desc_got); + spin_unlock_irqrestore(&chan->vc.lock, flags); + return desc; + } + + return NULL; +} + +static void usb_dmac_desc_put(struct usb_dmac_chan *chan, + struct usb_dmac_desc *desc) +{ + unsigned long flags; + + spin_lock_irqsave(&chan->vc.lock, flags); + list_move_tail(&desc->node, &chan->desc_freed); + spin_unlock_irqrestore(&chan->vc.lock, flags); +} + +/* ----------------------------------------------------------------------------- + * Stop and reset + */ + +static void usb_dmac_soft_reset(struct usb_dmac_chan *uchan) +{ + struct dma_chan *chan = &uchan->vc.chan; + struct usb_dmac *dmac = to_usb_dmac(chan->device); + int i; + + /* Don't issue soft reset if any one of channels is busy */ + for (i = 0; i < dmac->n_channels; ++i) { + if (usb_dmac_chan_is_busy(uchan)) + return; + } + + usb_dmac_write(dmac, USB_DMAOR, 0); + usb_dmac_write(dmac, USB_DMASWR, USB_DMASWR_SWR); + udelay(100); + usb_dmac_write(dmac, USB_DMASWR, 0); + usb_dmac_write(dmac, USB_DMAOR, 1); +} + +static void usb_dmac_chan_halt(struct usb_dmac_chan *chan) +{ + u32 chcr = usb_dmac_chan_read(chan, USB_DMACHCR); + + chcr &= ~(USB_DMACHCR_IE | USB_DMACHCR_TE | USB_DMACHCR_DE); + usb_dmac_chan_write(chan, USB_DMACHCR, chcr); + + usb_dmac_soft_reset(chan); +} + +static void usb_dmac_stop(struct usb_dmac *dmac) +{ + usb_dmac_write(dmac, USB_DMAOR, 0); +} + +/* ----------------------------------------------------------------------------- + * DMA engine operations + */ + +static int usb_dmac_alloc_chan_resources(struct dma_chan *chan) +{ + struct usb_dmac_chan *uchan = to_usb_dmac_chan(chan); + int ret; + + while (uchan->descs_allocated < USB_DMAC_INITIAL_NR_DESC) { + ret = usb_dmac_desc_alloc(uchan, USB_DMAC_INITIAL_NR_SG, + GFP_KERNEL); + if (ret < 0) { + usb_dmac_desc_free(uchan); + return ret; + } + uchan->descs_allocated++; + } + + return pm_runtime_get_sync(chan->device->dev); +} + +static void usb_dmac_free_chan_resources(struct dma_chan *chan) +{ + struct usb_dmac_chan *uchan = to_usb_dmac_chan(chan); + unsigned long flags; + + /* Protect against ISR */ + spin_lock_irqsave(&uchan->vc.lock, flags); + usb_dmac_chan_halt(uchan); + spin_unlock_irqrestore(&uchan->vc.lock, flags); + + usb_dmac_desc_free(uchan); + vchan_free_chan_resources(&uchan->vc); + + pm_runtime_put(chan->device->dev); +} + +static struct dma_async_tx_descriptor * +usb_dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_transfer_direction dir, + unsigned long dma_flags, void *context) +{ + struct usb_dmac_chan *uchan = to_usb_dmac_chan(chan); + struct usb_dmac_desc *desc; + struct scatterlist *sg; + int i; + + if (!sg_len) { + dev_warn(chan->device->dev, + "%s: bad parameter: len=%d\n", __func__, sg_len); + return NULL; + } + + desc = usb_dmac_desc_get(uchan, sg_len, GFP_NOWAIT); + if (!desc) + return NULL; + + desc->direction = dir; + desc->sg_len = sg_len; + for_each_sg(sgl, sg, sg_len, i) { + desc->sg[i].mem_addr = sg_dma_address(sg); + desc->sg[i].size = sg_dma_len(sg); + } + + return vchan_tx_prep(&uchan->vc, &desc->vd, dma_flags); +} + +static int usb_dmac_chan_terminate_all(struct dma_chan *chan) +{ + struct usb_dmac_chan *uchan = to_usb_dmac_chan(chan); + struct usb_dmac_desc *desc; + unsigned long flags; + LIST_HEAD(head); + LIST_HEAD(list); + + spin_lock_irqsave(&uchan->vc.lock, flags); + usb_dmac_chan_halt(uchan); + 
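/* reclaim everything: collect the issued/active descriptors from the
+ * virt-dma lists and move all 'got' descriptors back to the freed list
+ * so later preps can reuse them */
+ 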
vchan_get_all_descriptors(&uchan->vc, &head); + if (uchan->desc) + uchan->desc = NULL; + list_splice_init(&uchan->desc_got, &list); + list_for_each_entry(desc, &list, node) + list_move_tail(&desc->node, &uchan->desc_freed); + spin_unlock_irqrestore(&uchan->vc.lock, flags); + vchan_dma_desc_free_list(&uchan->vc, &head); + + return 0; +} + +static unsigned int usb_dmac_get_current_residue(struct usb_dmac_chan *chan, + struct usb_dmac_desc *desc, + int sg_index) +{ + struct usb_dmac_sg *sg = desc->sg + sg_index; + u32 mem_addr = sg->mem_addr & 0xffffffff; + unsigned int residue = sg->size; + + /* + * We cannot use USB_DMATCR to calculate residue because USB_DMATCR + * has unsuited value to calculate. + */ + if (desc->direction == DMA_DEV_TO_MEM) + residue -= usb_dmac_chan_read(chan, USB_DMADAR) - mem_addr; + else + residue -= usb_dmac_chan_read(chan, USB_DMASAR) - mem_addr; + + return residue; +} + +static u32 usb_dmac_chan_get_residue_if_complete(struct usb_dmac_chan *chan, + dma_cookie_t cookie) +{ + struct usb_dmac_desc *desc; + u32 residue = 0; + + list_for_each_entry_reverse(desc, &chan->desc_freed, node) { + if (desc->done_cookie == cookie) { + residue = desc->residue; + break; + } + } + + return residue; +} + +static u32 usb_dmac_chan_get_residue(struct usb_dmac_chan *chan, + dma_cookie_t cookie) +{ + u32 residue = 0; + struct virt_dma_desc *vd; + struct usb_dmac_desc *desc = chan->desc; + int i; + + if (!desc) { + vd = vchan_find_desc(&chan->vc, cookie); + if (!vd) + return 0; + desc = to_usb_dmac_desc(vd); + } + + /* Compute the size of all usb_dmac_sg still to be transferred */ + for (i = desc->sg_index + 1; i < desc->sg_len; i++) + residue += desc->sg[i].size; + + /* Add the residue for the current sg */ + residue += usb_dmac_get_current_residue(chan, desc, desc->sg_index); + + return residue; +} + +static enum dma_status usb_dmac_tx_status(struct dma_chan *chan, + dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct usb_dmac_chan *uchan = to_usb_dmac_chan(chan); + enum dma_status status; + unsigned int residue = 0; + unsigned long flags; + + status = dma_cookie_status(chan, cookie, txstate); + /* a client driver will get residue after DMA_COMPLETE */ + if (!txstate) + return status; + + spin_lock_irqsave(&uchan->vc.lock, flags); + if (status == DMA_COMPLETE) + residue = usb_dmac_chan_get_residue_if_complete(uchan, cookie); + else + residue = usb_dmac_chan_get_residue(uchan, cookie); + spin_unlock_irqrestore(&uchan->vc.lock, flags); + + dma_set_residue(txstate, residue); + + return status; +} + +static void usb_dmac_issue_pending(struct dma_chan *chan) +{ + struct usb_dmac_chan *uchan = to_usb_dmac_chan(chan); + unsigned long flags; + + spin_lock_irqsave(&uchan->vc.lock, flags); + if (vchan_issue_pending(&uchan->vc) && !uchan->desc) + usb_dmac_chan_start_desc(uchan); + spin_unlock_irqrestore(&uchan->vc.lock, flags); +} + +static void usb_dmac_virt_desc_free(struct virt_dma_desc *vd) +{ + struct usb_dmac_desc *desc = to_usb_dmac_desc(vd); + struct usb_dmac_chan *chan = to_usb_dmac_chan(vd->tx.chan); + + usb_dmac_desc_put(chan, desc); +} + +/* ----------------------------------------------------------------------------- + * IRQ handling + */ + +static void usb_dmac_isr_transfer_end(struct usb_dmac_chan *chan) +{ + struct usb_dmac_desc *desc = chan->desc; + + BUG_ON(!desc); + + if (++desc->sg_index < desc->sg_len) { + usb_dmac_chan_start_sg(chan, desc->sg_index); + } else { + desc->residue = usb_dmac_get_current_residue(chan, desc, + desc->sg_index - 1); + 
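/* record the completion cookie: after vchan_cookie_complete() the
+ * client sees DMA_COMPLETE, and usb_dmac_tx_status() then looks the
+ * residue up via usb_dmac_chan_get_residue_if_complete() */
+ 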
desc->done_cookie = desc->vd.tx.cookie; + vchan_cookie_complete(&desc->vd); + + /* Restart the next transfer if this driver has a next desc */ + usb_dmac_chan_start_desc(chan); + } +} + +static irqreturn_t usb_dmac_isr_channel(int irq, void *dev) +{ + struct usb_dmac_chan *chan = dev; + irqreturn_t ret = IRQ_NONE; + u32 mask = USB_DMACHCR_TE; + u32 check_bits = USB_DMACHCR_TE | USB_DMACHCR_SP; + u32 chcr; + + spin_lock(&chan->vc.lock); + + chcr = usb_dmac_chan_read(chan, USB_DMACHCR); + if (chcr & check_bits) + mask |= USB_DMACHCR_DE | check_bits; + if (chcr & USB_DMACHCR_NULL) { + /* An interruption of TE will happen after we set FTE */ + mask |= USB_DMACHCR_NULL; + chcr |= USB_DMACHCR_FTE; + ret |= IRQ_HANDLED; + } + usb_dmac_chan_write(chan, USB_DMACHCR, chcr & ~mask); + + if (chcr & check_bits) { + usb_dmac_isr_transfer_end(chan); + ret |= IRQ_HANDLED; + } + + spin_unlock(&chan->vc.lock); + + return ret; +} + +/* ----------------------------------------------------------------------------- + * OF xlate and channel filter + */ + +static bool usb_dmac_chan_filter(struct dma_chan *chan, void *arg) +{ + struct usb_dmac_chan *uchan = to_usb_dmac_chan(chan); + struct of_phandle_args *dma_spec = arg; + + if (dma_spec->np != chan->device->dev->of_node) + return false; + + /* USB-DMAC should be used with fixed usb controller's FIFO */ + if (uchan->index != dma_spec->args[0]) + return false; + + return true; +} + +static struct dma_chan *usb_dmac_of_xlate(struct of_phandle_args *dma_spec, + struct of_dma *ofdma) +{ + struct usb_dmac_chan *uchan; + struct dma_chan *chan; + dma_cap_mask_t mask; + + if (dma_spec->args_count != 1) + return NULL; + + /* Only slave DMA channels can be allocated via DT */ + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + + chan = dma_request_channel(mask, usb_dmac_chan_filter, dma_spec); + if (!chan) + return NULL; + + uchan = to_usb_dmac_chan(chan); + + return chan; +} + +/* ----------------------------------------------------------------------------- + * Power management + */ + +static int usb_dmac_runtime_suspend(struct device *dev) +{ + struct usb_dmac *dmac = dev_get_drvdata(dev); + int i; + + for (i = 0; i < dmac->n_channels; ++i) + usb_dmac_chan_halt(&dmac->channels[i]); + + return 0; +} + +static int usb_dmac_runtime_resume(struct device *dev) +{ + struct usb_dmac *dmac = dev_get_drvdata(dev); + + return usb_dmac_init(dmac); +} + +static const struct dev_pm_ops usb_dmac_pm = { + SET_RUNTIME_PM_OPS(usb_dmac_runtime_suspend, usb_dmac_runtime_resume, + NULL) +}; + +/* ----------------------------------------------------------------------------- + * Probe and remove + */ + +static int usb_dmac_chan_probe(struct usb_dmac *dmac, + struct usb_dmac_chan *uchan, + unsigned int index) +{ + struct platform_device *pdev = to_platform_device(dmac->dev); + char pdev_irqname[5]; + char *irqname; + int ret; + + uchan->index = index; + uchan->iomem = dmac->iomem + USB_DMAC_CHAN_OFFSET(index); + + /* Request the channel interrupt. 
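+ * Channel IRQs are named platform resources "ch0" ... "chN";
+ * pdev_irqname[5] fits up to "ch99", which matches the n_channels < 100
+ * limit enforced in usb_dmac_parse_of().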
*/ + sprintf(pdev_irqname, "ch%u", index); + uchan->irq = platform_get_irq_byname(pdev, pdev_irqname); + if (uchan->irq < 0) { + dev_err(dmac->dev, "no IRQ specified for channel %u\n", index); + return -ENODEV; + } + + irqname = devm_kasprintf(dmac->dev, GFP_KERNEL, "%s:%u", + dev_name(dmac->dev), index); + if (!irqname) + return -ENOMEM; + + ret = devm_request_irq(dmac->dev, uchan->irq, usb_dmac_isr_channel, + IRQF_SHARED, irqname, uchan); + if (ret) { + dev_err(dmac->dev, "failed to request IRQ %u (%d)\n", + uchan->irq, ret); + return ret; + } + + uchan->vc.desc_free = usb_dmac_virt_desc_free; + vchan_init(&uchan->vc, &dmac->engine); + INIT_LIST_HEAD(&uchan->desc_freed); + INIT_LIST_HEAD(&uchan->desc_got); + + return 0; +} + +static int usb_dmac_parse_of(struct device *dev, struct usb_dmac *dmac) +{ + struct device_node *np = dev->of_node; + int ret; + + ret = of_property_read_u32(np, "dma-channels", &dmac->n_channels); + if (ret < 0) { + dev_err(dev, "unable to read dma-channels property\n"); + return ret; + } + + if (dmac->n_channels <= 0 || dmac->n_channels >= 100) { + dev_err(dev, "invalid number of channels %u\n", + dmac->n_channels); + return -EINVAL; + } + + return 0; +} + +static int usb_dmac_probe(struct platform_device *pdev) +{ + const enum dma_slave_buswidth widths = USB_DMAC_SLAVE_BUSWIDTH; + struct dma_device *engine; + struct usb_dmac *dmac; + struct resource *mem; + unsigned int i; + int ret; + + dmac = devm_kzalloc(&pdev->dev, sizeof(*dmac), GFP_KERNEL); + if (!dmac) + return -ENOMEM; + + dmac->dev = &pdev->dev; + platform_set_drvdata(pdev, dmac); + + ret = usb_dmac_parse_of(&pdev->dev, dmac); + if (ret < 0) + return ret; + + dmac->channels = devm_kcalloc(&pdev->dev, dmac->n_channels, + sizeof(*dmac->channels), GFP_KERNEL); + if (!dmac->channels) + return -ENOMEM; + + /* Request resources. */ + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + dmac->iomem = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(dmac->iomem)) + return PTR_ERR(dmac->iomem); + + /* Enable runtime PM and initialize the device. */ + pm_runtime_enable(&pdev->dev); + ret = pm_runtime_get_sync(&pdev->dev); + if (ret < 0) { + dev_err(&pdev->dev, "runtime PM get sync failed (%d)\n", ret); + return ret; + } + + ret = usb_dmac_init(dmac); + pm_runtime_put(&pdev->dev); + + if (ret) { + dev_err(&pdev->dev, "failed to reset device\n"); + goto error; + } + + /* Initialize the channels. */ + INIT_LIST_HEAD(&dmac->engine.channels); + + for (i = 0; i < dmac->n_channels; ++i) { + ret = usb_dmac_chan_probe(dmac, &dmac->channels[i], i); + if (ret < 0) + goto error; + } + + /* Register the DMAC as a DMA provider for DT. */ + ret = of_dma_controller_register(pdev->dev.of_node, usb_dmac_of_xlate, + NULL); + if (ret < 0) + goto error; + + /* + * Register the DMA engine device. + * + * Default transfer size of 32 bytes requires 32-byte alignment. 
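+ * (USB_DMAC_XFER_SHIFT is hardcoded to 5; the same 32-byte unit is
+ * where the DMA_SLAVE_BUSWIDTH_32_BYTES advertised below comes from.)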
+ */ + engine = &dmac->engine; + dma_cap_set(DMA_SLAVE, engine->cap_mask); + + engine->dev = &pdev->dev; + + engine->src_addr_widths = widths; + engine->dst_addr_widths = widths; + engine->directions = BIT(DMA_MEM_TO_DEV) | BIT(DMA_DEV_TO_MEM); + engine->residue_granularity = DMA_RESIDUE_GRANULARITY_BURST; + + engine->device_alloc_chan_resources = usb_dmac_alloc_chan_resources; + engine->device_free_chan_resources = usb_dmac_free_chan_resources; + engine->device_prep_slave_sg = usb_dmac_prep_slave_sg; + engine->device_terminate_all = usb_dmac_chan_terminate_all; + engine->device_tx_status = usb_dmac_tx_status; + engine->device_issue_pending = usb_dmac_issue_pending; + + ret = dma_async_device_register(engine); + if (ret < 0) + goto error; + + return 0; + +error: + of_dma_controller_free(pdev->dev.of_node); + pm_runtime_disable(&pdev->dev); + return ret; +} + +static void usb_dmac_chan_remove(struct usb_dmac *dmac, + struct usb_dmac_chan *uchan) +{ + usb_dmac_chan_halt(uchan); + devm_free_irq(dmac->dev, uchan->irq, uchan); +} + +static int usb_dmac_remove(struct platform_device *pdev) +{ + struct usb_dmac *dmac = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < dmac->n_channels; ++i) + usb_dmac_chan_remove(dmac, &dmac->channels[i]); + of_dma_controller_free(pdev->dev.of_node); + dma_async_device_unregister(&dmac->engine); + + pm_runtime_disable(&pdev->dev); + + return 0; +} + +static void usb_dmac_shutdown(struct platform_device *pdev) +{ + struct usb_dmac *dmac = platform_get_drvdata(pdev); + + usb_dmac_stop(dmac); +} + +static const struct of_device_id usb_dmac_of_ids[] = { + { .compatible = "renesas,usb-dmac", }, + { /* Sentinel */ } +}; +MODULE_DEVICE_TABLE(of, usb_dmac_of_ids); + +static struct platform_driver usb_dmac_driver = { + .driver = { + .pm = &usb_dmac_pm, + .name = "usb-dmac", + .of_match_table = usb_dmac_of_ids, + }, + .probe = usb_dmac_probe, + .remove = usb_dmac_remove, + .shutdown = usb_dmac_shutdown, +}; + +module_platform_driver(usb_dmac_driver); + +MODULE_DESCRIPTION("Renesas USB DMA Controller Driver"); +MODULE_AUTHOR("Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c index d0086e9f2082..a1afda43b8ef 100644 --- a/drivers/dma/sirf-dma.c +++ b/drivers/dma/sirf-dma.c @@ -896,7 +896,7 @@ static const struct dev_pm_ops sirfsoc_dma_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(sirfsoc_dma_pm_suspend, sirfsoc_dma_pm_resume) }; -static struct of_device_id sirfsoc_dma_match[] = { +static const struct of_device_id sirfsoc_dma_match[] = { { .compatible = "sirf,prima2-dmac", }, { .compatible = "sirf,marco-dmac", }, {}, diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index 1332b1d4d541..3c10f034d4b9 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -2514,7 +2514,8 @@ static struct dma_async_tx_descriptor *d40_prep_memcpy(struct dma_chan *chan, sg_dma_len(&dst_sg) = size; sg_dma_len(&src_sg) = size; - return d40_prep_sg(chan, &src_sg, &dst_sg, 1, DMA_NONE, dma_flags); + return d40_prep_sg(chan, &src_sg, &dst_sg, 1, + DMA_MEM_TO_MEM, dma_flags); } static struct dma_async_tx_descriptor * @@ -2526,7 +2527,8 @@ d40_prep_memcpy_sg(struct dma_chan *chan, if (dst_nents != src_nents) return NULL; - return d40_prep_sg(chan, src_sg, dst_sg, src_nents, DMA_NONE, dma_flags); + return d40_prep_sg(chan, src_sg, dst_sg, src_nents, + DMA_MEM_TO_MEM, dma_flags); } static struct dma_async_tx_descriptor * diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c index 
7ebcf9bec698..11e536586812 100644 --- a/drivers/dma/sun6i-dma.c +++ b/drivers/dma/sun6i-dma.c @@ -796,11 +796,6 @@ static void sun6i_dma_issue_pending(struct dma_chan *chan) spin_unlock_irqrestore(&vchan->vc.lock, flags); } -static int sun6i_dma_alloc_chan_resources(struct dma_chan *chan) -{ - return 0; -} - static void sun6i_dma_free_chan_resources(struct dma_chan *chan) { struct sun6i_dma_dev *sdev = to_sun6i_dma_dev(chan->device); @@ -896,7 +891,7 @@ static struct sun6i_dma_config sun8i_a23_dma_cfg = { .nr_max_vchans = 37, }; -static struct of_device_id sun6i_dma_match[] = { +static const struct of_device_id sun6i_dma_match[] = { { .compatible = "allwinner,sun6i-a31-dma", .data = &sun6i_a31_dma_cfg }, { .compatible = "allwinner,sun8i-a23-dma", .data = &sun8i_a23_dma_cfg }, { /* sentinel */ } @@ -957,7 +952,6 @@ static int sun6i_dma_probe(struct platform_device *pdev) dma_cap_set(DMA_SLAVE, sdc->slave.cap_mask); INIT_LIST_HEAD(&sdc->slave.channels); - sdc->slave.device_alloc_chan_resources = sun6i_dma_alloc_chan_resources; sdc->slave.device_free_chan_resources = sun6i_dma_free_chan_resources; sdc->slave.device_tx_status = sun6i_dma_tx_status; sdc->slave.device_issue_pending = sun6i_dma_issue_pending; diff --git a/drivers/dma/xgene-dma.c b/drivers/dma/xgene-dma.c new file mode 100755 index 000000000000..f52e37502254 --- /dev/null +++ b/drivers/dma/xgene-dma.c @@ -0,0 +1,2089 @@ +/* + * Applied Micro X-Gene SoC DMA engine Driver + * + * Copyright (c) 2015, Applied Micro Circuits Corporation + * Authors: Rameshwar Prasad Sahu <rsahu@apm.com> + * Loc Ho <lho@apm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * NOTE: PM support is currently not available. 
+ */ + +#include <linux/clk.h> +#include <linux/delay.h> +#include <linux/dma-mapping.h> +#include <linux/dmaengine.h> +#include <linux/dmapool.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of_device.h> + +#include "dmaengine.h" + +/* X-Gene DMA ring csr registers and bit definations */ +#define XGENE_DMA_RING_CONFIG 0x04 +#define XGENE_DMA_RING_ENABLE BIT(31) +#define XGENE_DMA_RING_ID 0x08 +#define XGENE_DMA_RING_ID_SETUP(v) ((v) | BIT(31)) +#define XGENE_DMA_RING_ID_BUF 0x0C +#define XGENE_DMA_RING_ID_BUF_SETUP(v) (((v) << 9) | BIT(21)) +#define XGENE_DMA_RING_THRESLD0_SET1 0x30 +#define XGENE_DMA_RING_THRESLD0_SET1_VAL 0X64 +#define XGENE_DMA_RING_THRESLD1_SET1 0x34 +#define XGENE_DMA_RING_THRESLD1_SET1_VAL 0xC8 +#define XGENE_DMA_RING_HYSTERESIS 0x68 +#define XGENE_DMA_RING_HYSTERESIS_VAL 0xFFFFFFFF +#define XGENE_DMA_RING_STATE 0x6C +#define XGENE_DMA_RING_STATE_WR_BASE 0x70 +#define XGENE_DMA_RING_NE_INT_MODE 0x017C +#define XGENE_DMA_RING_NE_INT_MODE_SET(m, v) \ + ((m) = ((m) & ~BIT(31 - (v))) | BIT(31 - (v))) +#define XGENE_DMA_RING_NE_INT_MODE_RESET(m, v) \ + ((m) &= (~BIT(31 - (v)))) +#define XGENE_DMA_RING_CLKEN 0xC208 +#define XGENE_DMA_RING_SRST 0xC200 +#define XGENE_DMA_RING_MEM_RAM_SHUTDOWN 0xD070 +#define XGENE_DMA_RING_BLK_MEM_RDY 0xD074 +#define XGENE_DMA_RING_BLK_MEM_RDY_VAL 0xFFFFFFFF +#define XGENE_DMA_RING_DESC_CNT(v) (((v) & 0x0001FFFE) >> 1) +#define XGENE_DMA_RING_ID_GET(owner, num) (((owner) << 6) | (num)) +#define XGENE_DMA_RING_DST_ID(v) ((1 << 10) | (v)) +#define XGENE_DMA_RING_CMD_OFFSET 0x2C +#define XGENE_DMA_RING_CMD_BASE_OFFSET(v) ((v) << 6) +#define XGENE_DMA_RING_COHERENT_SET(m) \ + (((u32 *)(m))[2] |= BIT(4)) +#define XGENE_DMA_RING_ADDRL_SET(m, v) \ + (((u32 *)(m))[2] |= (((v) >> 8) << 5)) +#define XGENE_DMA_RING_ADDRH_SET(m, v) \ + (((u32 *)(m))[3] |= ((v) >> 35)) +#define XGENE_DMA_RING_ACCEPTLERR_SET(m) \ + (((u32 *)(m))[3] |= BIT(19)) +#define XGENE_DMA_RING_SIZE_SET(m, v) \ + (((u32 *)(m))[3] |= ((v) << 23)) +#define XGENE_DMA_RING_RECOMBBUF_SET(m) \ + (((u32 *)(m))[3] |= BIT(27)) +#define XGENE_DMA_RING_RECOMTIMEOUTL_SET(m) \ + (((u32 *)(m))[3] |= (0x7 << 28)) +#define XGENE_DMA_RING_RECOMTIMEOUTH_SET(m) \ + (((u32 *)(m))[4] |= 0x3) +#define XGENE_DMA_RING_SELTHRSH_SET(m) \ + (((u32 *)(m))[4] |= BIT(3)) +#define XGENE_DMA_RING_TYPE_SET(m, v) \ + (((u32 *)(m))[4] |= ((v) << 19)) + +/* X-Gene DMA device csr registers and bit definitions */ +#define XGENE_DMA_IPBRR 0x0 +#define XGENE_DMA_DEV_ID_RD(v) ((v) & 0x00000FFF) +#define XGENE_DMA_BUS_ID_RD(v) (((v) >> 12) & 3) +#define XGENE_DMA_REV_NO_RD(v) (((v) >> 14) & 3) +#define XGENE_DMA_GCR 0x10 +#define XGENE_DMA_CH_SETUP(v) \ + ((v) = ((v) & ~0x000FFFFF) | 0x000AAFFF) +#define XGENE_DMA_ENABLE(v) ((v) |= BIT(31)) +#define XGENE_DMA_DISABLE(v) ((v) &= ~BIT(31)) +#define XGENE_DMA_RAID6_CONT 0x14 +#define XGENE_DMA_RAID6_MULTI_CTRL(v) ((v) << 24) +#define XGENE_DMA_INT 0x70 +#define XGENE_DMA_INT_MASK 0x74 +#define XGENE_DMA_INT_ALL_MASK 0xFFFFFFFF +#define XGENE_DMA_INT_ALL_UNMASK 0x0 +#define XGENE_DMA_INT_MASK_SHIFT 0x14 +#define XGENE_DMA_RING_INT0_MASK 0x90A0 +#define XGENE_DMA_RING_INT1_MASK 0x90A8 +#define XGENE_DMA_RING_INT2_MASK 0x90B0 +#define XGENE_DMA_RING_INT3_MASK 0x90B8 +#define XGENE_DMA_RING_INT4_MASK 0x90C0 +#define XGENE_DMA_CFG_RING_WQ_ASSOC 0x90E0 +#define XGENE_DMA_ASSOC_RING_MNGR1 0xFFFFFFFF +#define XGENE_DMA_MEM_RAM_SHUTDOWN 0xD070 +#define XGENE_DMA_BLK_MEM_RDY 0xD074 +#define XGENE_DMA_BLK_MEM_RDY_VAL 0xFFFFFFFF + +/* 
X-Gene SoC EFUSE csr register and bit defination */ +#define XGENE_SOC_JTAG1_SHADOW 0x18 +#define XGENE_DMA_PQ_DISABLE_MASK BIT(13) + +/* X-Gene DMA Descriptor format */ +#define XGENE_DMA_DESC_NV_BIT BIT_ULL(50) +#define XGENE_DMA_DESC_IN_BIT BIT_ULL(55) +#define XGENE_DMA_DESC_C_BIT BIT_ULL(63) +#define XGENE_DMA_DESC_DR_BIT BIT_ULL(61) +#define XGENE_DMA_DESC_ELERR_POS 46 +#define XGENE_DMA_DESC_RTYPE_POS 56 +#define XGENE_DMA_DESC_LERR_POS 60 +#define XGENE_DMA_DESC_FLYBY_POS 4 +#define XGENE_DMA_DESC_BUFLEN_POS 48 +#define XGENE_DMA_DESC_HOENQ_NUM_POS 48 + +#define XGENE_DMA_DESC_NV_SET(m) \ + (((u64 *)(m))[0] |= XGENE_DMA_DESC_NV_BIT) +#define XGENE_DMA_DESC_IN_SET(m) \ + (((u64 *)(m))[0] |= XGENE_DMA_DESC_IN_BIT) +#define XGENE_DMA_DESC_RTYPE_SET(m, v) \ + (((u64 *)(m))[0] |= ((u64)(v) << XGENE_DMA_DESC_RTYPE_POS)) +#define XGENE_DMA_DESC_BUFADDR_SET(m, v) \ + (((u64 *)(m))[0] |= (v)) +#define XGENE_DMA_DESC_BUFLEN_SET(m, v) \ + (((u64 *)(m))[0] |= ((u64)(v) << XGENE_DMA_DESC_BUFLEN_POS)) +#define XGENE_DMA_DESC_C_SET(m) \ + (((u64 *)(m))[1] |= XGENE_DMA_DESC_C_BIT) +#define XGENE_DMA_DESC_FLYBY_SET(m, v) \ + (((u64 *)(m))[2] |= ((v) << XGENE_DMA_DESC_FLYBY_POS)) +#define XGENE_DMA_DESC_MULTI_SET(m, v, i) \ + (((u64 *)(m))[2] |= ((u64)(v) << (((i) + 1) * 8))) +#define XGENE_DMA_DESC_DR_SET(m) \ + (((u64 *)(m))[2] |= XGENE_DMA_DESC_DR_BIT) +#define XGENE_DMA_DESC_DST_ADDR_SET(m, v) \ + (((u64 *)(m))[3] |= (v)) +#define XGENE_DMA_DESC_H0ENQ_NUM_SET(m, v) \ + (((u64 *)(m))[3] |= ((u64)(v) << XGENE_DMA_DESC_HOENQ_NUM_POS)) +#define XGENE_DMA_DESC_ELERR_RD(m) \ + (((m) >> XGENE_DMA_DESC_ELERR_POS) & 0x3) +#define XGENE_DMA_DESC_LERR_RD(m) \ + (((m) >> XGENE_DMA_DESC_LERR_POS) & 0x7) +#define XGENE_DMA_DESC_STATUS(elerr, lerr) \ + (((elerr) << 4) | (lerr)) + +/* X-Gene DMA descriptor empty s/w signature */ +#define XGENE_DMA_DESC_EMPTY_INDEX 0 +#define XGENE_DMA_DESC_EMPTY_SIGNATURE ~0ULL +#define XGENE_DMA_DESC_SET_EMPTY(m) \ + (((u64 *)(m))[XGENE_DMA_DESC_EMPTY_INDEX] = \ + XGENE_DMA_DESC_EMPTY_SIGNATURE) +#define XGENE_DMA_DESC_IS_EMPTY(m) \ + (((u64 *)(m))[XGENE_DMA_DESC_EMPTY_INDEX] == \ + XGENE_DMA_DESC_EMPTY_SIGNATURE) + +/* X-Gene DMA configurable parameters defines */ +#define XGENE_DMA_RING_NUM 512 +#define XGENE_DMA_BUFNUM 0x0 +#define XGENE_DMA_CPU_BUFNUM 0x18 +#define XGENE_DMA_RING_OWNER_DMA 0x03 +#define XGENE_DMA_RING_OWNER_CPU 0x0F +#define XGENE_DMA_RING_TYPE_REGULAR 0x01 +#define XGENE_DMA_RING_WQ_DESC_SIZE 32 /* 32 Bytes */ +#define XGENE_DMA_RING_NUM_CONFIG 5 +#define XGENE_DMA_MAX_CHANNEL 4 +#define XGENE_DMA_XOR_CHANNEL 0 +#define XGENE_DMA_PQ_CHANNEL 1 +#define XGENE_DMA_MAX_BYTE_CNT 0x4000 /* 16 KB */ +#define XGENE_DMA_MAX_64B_DESC_BYTE_CNT 0x14000 /* 80 KB */ +#define XGENE_DMA_XOR_ALIGNMENT 6 /* 64 Bytes */ +#define XGENE_DMA_MAX_XOR_SRC 5 +#define XGENE_DMA_16K_BUFFER_LEN_CODE 0x0 +#define XGENE_DMA_INVALID_LEN_CODE 0x7800 + +/* X-Gene DMA descriptor error codes */ +#define ERR_DESC_AXI 0x01 +#define ERR_BAD_DESC 0x02 +#define ERR_READ_DATA_AXI 0x03 +#define ERR_WRITE_DATA_AXI 0x04 +#define ERR_FBP_TIMEOUT 0x05 +#define ERR_ECC 0x06 +#define ERR_DIFF_SIZE 0x08 +#define ERR_SCT_GAT_LEN 0x09 +#define ERR_CRC_ERR 0x11 +#define ERR_CHKSUM 0x12 +#define ERR_DIF 0x13 + +/* X-Gene DMA error interrupt codes */ +#define ERR_DIF_SIZE_INT 0x0 +#define ERR_GS_ERR_INT 0x1 +#define ERR_FPB_TIMEO_INT 0x2 +#define ERR_WFIFO_OVF_INT 0x3 +#define ERR_RFIFO_OVF_INT 0x4 +#define ERR_WR_TIMEO_INT 0x5 +#define ERR_RD_TIMEO_INT 0x6 +#define ERR_WR_ERR_INT 0x7 +#define ERR_RD_ERR_INT 
0x8
+#define ERR_BAD_DESC_INT 0x9
+#define ERR_DESC_DST_INT 0xA
+#define ERR_DESC_SRC_INT 0xB
+
+/* X-Gene DMA flyby operation code */
+#define FLYBY_2SRC_XOR 0x8
+#define FLYBY_3SRC_XOR 0x9
+#define FLYBY_4SRC_XOR 0xA
+#define FLYBY_5SRC_XOR 0xB
+
+/* X-Gene DMA SW descriptor flags */
+#define XGENE_DMA_FLAG_64B_DESC BIT(0)
+
+/* Define to dump X-Gene DMA descriptor */
+#define XGENE_DMA_DESC_DUMP(desc, m) \
+ print_hex_dump(KERN_ERR, (m), \
+ DUMP_PREFIX_ADDRESS, 16, 8, (desc), 32, 0)
+
+#define to_dma_desc_sw(tx) \
+ container_of(tx, struct xgene_dma_desc_sw, tx)
+#define to_dma_chan(dchan) \
+ container_of(dchan, struct xgene_dma_chan, dma_chan)
+
+#define chan_dbg(chan, fmt, arg...) \
+ dev_dbg(chan->dev, "%s: " fmt, chan->name, ##arg)
+#define chan_err(chan, fmt, arg...) \
+ dev_err(chan->dev, "%s: " fmt, chan->name, ##arg)
+
+struct xgene_dma_desc_hw {
+ u64 m0;
+ u64 m1;
+ u64 m2;
+ u64 m3;
+};
+
+enum xgene_dma_ring_cfgsize {
+ XGENE_DMA_RING_CFG_SIZE_512B,
+ XGENE_DMA_RING_CFG_SIZE_2KB,
+ XGENE_DMA_RING_CFG_SIZE_16KB,
+ XGENE_DMA_RING_CFG_SIZE_64KB,
+ XGENE_DMA_RING_CFG_SIZE_512KB,
+ XGENE_DMA_RING_CFG_SIZE_INVALID
+};
+
+struct xgene_dma_ring {
+ struct xgene_dma *pdma;
+ u8 buf_num;
+ u16 id;
+ u16 num;
+ u16 head;
+ u16 owner;
+ u16 slots;
+ u16 dst_ring_num;
+ u32 size;
+ void __iomem *cmd;
+ void __iomem *cmd_base;
+ dma_addr_t desc_paddr;
+ u32 state[XGENE_DMA_RING_NUM_CONFIG];
+ enum xgene_dma_ring_cfgsize cfgsize;
+ union {
+ void *desc_vaddr;
+ struct xgene_dma_desc_hw *desc_hw;
+ };
+};
+
+struct xgene_dma_desc_sw {
+ struct xgene_dma_desc_hw desc1;
+ struct xgene_dma_desc_hw desc2;
+ u32 flags;
+ struct list_head node;
+ struct list_head tx_list;
+ struct dma_async_tx_descriptor tx;
+};
+
+/**
+ * struct xgene_dma_chan - internal representation of an X-Gene DMA channel
+ * @dma_chan: dmaengine channel object member
+ * @pdma: X-Gene DMA device structure reference
+ * @dev: struct device reference for dma mapping api
+ * @id: raw id of this channel
+ * @rx_irq: channel IRQ
+ * @name: name of X-Gene DMA channel
+ * @lock: serializes enqueue/dequeue operations to the descriptor pool
+ * @pending: number of transaction requests pushed to the DMA controller for
+ * execution, but still waiting for completion
+ * @max_outstanding: max number of outstanding requests we can push to channel
+ * @ld_pending: descriptors which are queued to run, but have not yet been
+ * submitted to the hardware for execution
+ * @ld_running: descriptors which are currently being executed by the hardware
+ * @ld_completed: descriptors which have finished execution by the hardware.
+ * These descriptors have already had their cleanup actions run. They
+ * are waiting for the ACK bit to be set by the async tx API. 
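+ * (A descriptor thus flows from ld_pending to ld_running to
+ * ld_completed, and returns to @desc_pool once acked.)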
+ * @desc_pool: descriptor pool for DMA operations
+ * @tasklet: bottom half where all completed descriptors are cleaned up
+ * @tx_ring: transmit ring descriptor that we use to prepare actual
+ * descriptors for further execution
+ * @rx_ring: receive ring descriptor that we use to get completed DMA
+ * descriptors during cleanup time
+ */
+struct xgene_dma_chan {
+ struct dma_chan dma_chan;
+ struct xgene_dma *pdma;
+ struct device *dev;
+ int id;
+ int rx_irq;
+ char name[10];
+ spinlock_t lock;
+ int pending;
+ int max_outstanding;
+ struct list_head ld_pending;
+ struct list_head ld_running;
+ struct list_head ld_completed;
+ struct dma_pool *desc_pool;
+ struct tasklet_struct tasklet;
+ struct xgene_dma_ring tx_ring;
+ struct xgene_dma_ring rx_ring;
+};
+
+/**
+ * struct xgene_dma - internal representation of an X-Gene DMA device
+ * @dev: the device which owns this DMA controller
+ * @clk: reference to the DMA engine clock
+ * @err_irq: DMA error irq number
+ * @ring_num: start id number for DMA ring
+ * @csr_dma: base for DMA register access
+ * @csr_ring: base for DMA ring register access
+ * @csr_ring_cmd: base for DMA ring command register access
+ * @csr_efuse: base for efuse register access
+ * @dma_dev: embedded struct dma_device
+ * @chan: reference to X-Gene DMA channels
+ */
+struct xgene_dma {
+ struct device *dev;
+ struct clk *clk;
+ int err_irq;
+ int ring_num;
+ void __iomem *csr_dma;
+ void __iomem *csr_ring;
+ void __iomem *csr_ring_cmd;
+ void __iomem *csr_efuse;
+ struct dma_device dma_dev[XGENE_DMA_MAX_CHANNEL];
+ struct xgene_dma_chan chan[XGENE_DMA_MAX_CHANNEL];
+};
+
+static const char * const xgene_dma_desc_err[] = {
+ [ERR_DESC_AXI] = "AXI error when reading src/dst link list",
+ [ERR_BAD_DESC] = "ERR or El_ERR fields not set to zero in desc",
+ [ERR_READ_DATA_AXI] = "AXI error when reading data",
+ [ERR_WRITE_DATA_AXI] = "AXI error when writing data",
+ [ERR_FBP_TIMEOUT] = "Timeout on bufpool fetch",
+ [ERR_ECC] = "ECC double bit error",
+ [ERR_DIFF_SIZE] = "Bufpool too small to hold all the DIF result",
+ [ERR_SCT_GAT_LEN] = "Gather and scatter data length not same",
+ [ERR_CRC_ERR] = "CRC error",
+ [ERR_CHKSUM] = "Checksum error",
+ [ERR_DIF] = "DIF error",
+};
+
+static const char * const xgene_dma_err[] = {
+ [ERR_DIF_SIZE_INT] = "DIF size error",
+ [ERR_GS_ERR_INT] = "Gather scatter not same size error",
+ [ERR_FPB_TIMEO_INT] = "Free pool time out error",
+ [ERR_WFIFO_OVF_INT] = "Write FIFO over flow error",
+ [ERR_RFIFO_OVF_INT] = "Read FIFO over flow error",
+ [ERR_WR_TIMEO_INT] = "Write time out error",
+ [ERR_RD_TIMEO_INT] = "Read time out error",
+ [ERR_WR_ERR_INT] = "HBF bus write error",
+ [ERR_RD_ERR_INT] = "HBF bus read error",
+ [ERR_BAD_DESC_INT] = "Ring descriptor HE0 not set error",
+ [ERR_DESC_DST_INT] = "HFB reading dst link address error",
+ [ERR_DESC_SRC_INT] = "HFB reading src link address error",
+};
+
+static bool is_pq_enabled(struct xgene_dma *pdma)
+{
+ u32 val;
+
+ val = ioread32(pdma->csr_efuse + XGENE_SOC_JTAG1_SHADOW);
+ return !(val & XGENE_DMA_PQ_DISABLE_MASK);
+}
+
+static void xgene_dma_cpu_to_le64(u64 *desc, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++)
+ desc[i] = cpu_to_le64(desc[i]);
+}
+
+static u16 xgene_dma_encode_len(u32 len)
+{
+ return (len < XGENE_DMA_MAX_BYTE_CNT) ? 
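+ /* lengths below 16 KB are encoded verbatim; a full 16 KB chunk
+ * (callers clamp each buffer to XGENE_DMA_MAX_BYTE_CNT) uses the
+ * special code 0x0 */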
+ len : XGENE_DMA_16K_BUFFER_LEN_CODE; +} + +static u8 xgene_dma_encode_xor_flyby(u32 src_cnt) +{ + static u8 flyby_type[] = { + FLYBY_2SRC_XOR, /* Dummy */ + FLYBY_2SRC_XOR, /* Dummy */ + FLYBY_2SRC_XOR, + FLYBY_3SRC_XOR, + FLYBY_4SRC_XOR, + FLYBY_5SRC_XOR + }; + + return flyby_type[src_cnt]; +} + +static u32 xgene_dma_ring_desc_cnt(struct xgene_dma_ring *ring) +{ + u32 __iomem *cmd_base = ring->cmd_base; + u32 ring_state = ioread32(&cmd_base[1]); + + return XGENE_DMA_RING_DESC_CNT(ring_state); +} + +static void xgene_dma_set_src_buffer(void *ext8, size_t *len, + dma_addr_t *paddr) +{ + size_t nbytes = (*len < XGENE_DMA_MAX_BYTE_CNT) ? + *len : XGENE_DMA_MAX_BYTE_CNT; + + XGENE_DMA_DESC_BUFADDR_SET(ext8, *paddr); + XGENE_DMA_DESC_BUFLEN_SET(ext8, xgene_dma_encode_len(nbytes)); + *len -= nbytes; + *paddr += nbytes; +} + +static void xgene_dma_invalidate_buffer(void *ext8) +{ + XGENE_DMA_DESC_BUFLEN_SET(ext8, XGENE_DMA_INVALID_LEN_CODE); +} + +static void *xgene_dma_lookup_ext8(u64 *desc, int idx) +{ + return (idx % 2) ? (desc + idx - 1) : (desc + idx + 1); +} + +static void xgene_dma_init_desc(void *desc, u16 dst_ring_num) +{ + XGENE_DMA_DESC_C_SET(desc); /* Coherent IO */ + XGENE_DMA_DESC_IN_SET(desc); + XGENE_DMA_DESC_H0ENQ_NUM_SET(desc, dst_ring_num); + XGENE_DMA_DESC_RTYPE_SET(desc, XGENE_DMA_RING_OWNER_DMA); +} + +static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan, + struct xgene_dma_desc_sw *desc_sw, + dma_addr_t dst, dma_addr_t src, + size_t len) +{ + void *desc1, *desc2; + int i; + + /* Get 1st descriptor */ + desc1 = &desc_sw->desc1; + xgene_dma_init_desc(desc1, chan->tx_ring.dst_ring_num); + + /* Set destination address */ + XGENE_DMA_DESC_DR_SET(desc1); + XGENE_DMA_DESC_DST_ADDR_SET(desc1, dst); + + /* Set 1st source address */ + xgene_dma_set_src_buffer(desc1 + 8, &len, &src); + + if (len <= 0) { + desc2 = NULL; + goto skip_additional_src; + } + + /* + * We need to split this source buffer, + * and need to use 2nd descriptor + */ + desc2 = &desc_sw->desc2; + XGENE_DMA_DESC_NV_SET(desc1); + + /* Set 2nd to 5th source address */ + for (i = 0; i < 4 && len; i++) + xgene_dma_set_src_buffer(xgene_dma_lookup_ext8(desc2, i), + &len, &src); + + /* Invalidate unused source address field */ + for (; i < 4; i++) + xgene_dma_invalidate_buffer(xgene_dma_lookup_ext8(desc2, i)); + + /* Updated flag that we have prepared 64B descriptor */ + desc_sw->flags |= XGENE_DMA_FLAG_64B_DESC; + +skip_additional_src: + /* Hardware stores descriptor in little endian format */ + xgene_dma_cpu_to_le64(desc1, 4); + if (desc2) + xgene_dma_cpu_to_le64(desc2, 4); +} + +static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan, + struct xgene_dma_desc_sw *desc_sw, + dma_addr_t *dst, dma_addr_t *src, + u32 src_cnt, size_t *nbytes, + const u8 *scf) +{ + void *desc1, *desc2; + size_t len = *nbytes; + int i; + + desc1 = &desc_sw->desc1; + desc2 = &desc_sw->desc2; + + /* Initialize DMA descriptor */ + xgene_dma_init_desc(desc1, chan->tx_ring.dst_ring_num); + + /* Set destination address */ + XGENE_DMA_DESC_DR_SET(desc1); + XGENE_DMA_DESC_DST_ADDR_SET(desc1, *dst); + + /* We have multiple source addresses, so need to set NV bit*/ + XGENE_DMA_DESC_NV_SET(desc1); + + /* Set flyby opcode */ + XGENE_DMA_DESC_FLYBY_SET(desc1, xgene_dma_encode_xor_flyby(src_cnt)); + + /* Set 1st to 5th source addresses */ + for (i = 0; i < src_cnt; i++) { + len = *nbytes; + xgene_dma_set_src_buffer((i == 0) ? 
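+ /* source 0 lives in the second u64 of desc1; sources 1-4 go to
+ * desc2, where xgene_dma_lookup_ext8() swaps each pair of slots:
+ * idx 0 -> desc2[1], 1 -> desc2[0], 2 -> desc2[3], 3 -> desc2[2] */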
(desc1 + 8) : + xgene_dma_lookup_ext8(desc2, i - 1), + &len, &src[i]); + XGENE_DMA_DESC_MULTI_SET(desc1, scf[i], i); + } + + /* Hardware stores descriptor in little endian format */ + xgene_dma_cpu_to_le64(desc1, 4); + xgene_dma_cpu_to_le64(desc2, 4); + + /* Update meta data */ + *nbytes = len; + *dst += XGENE_DMA_MAX_BYTE_CNT; + + /* We need always 64B descriptor to perform xor or pq operations */ + desc_sw->flags |= XGENE_DMA_FLAG_64B_DESC; +} + +static dma_cookie_t xgene_dma_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct xgene_dma_desc_sw *desc; + struct xgene_dma_chan *chan; + dma_cookie_t cookie; + + if (unlikely(!tx)) + return -EINVAL; + + chan = to_dma_chan(tx->chan); + desc = to_dma_desc_sw(tx); + + spin_lock_bh(&chan->lock); + + cookie = dma_cookie_assign(tx); + + /* Add this transaction list onto the tail of the pending queue */ + list_splice_tail_init(&desc->tx_list, &chan->ld_pending); + + spin_unlock_bh(&chan->lock); + + return cookie; +} + +static void xgene_dma_clean_descriptor(struct xgene_dma_chan *chan, + struct xgene_dma_desc_sw *desc) +{ + list_del(&desc->node); + chan_dbg(chan, "LD %p free\n", desc); + dma_pool_free(chan->desc_pool, desc, desc->tx.phys); +} + +static struct xgene_dma_desc_sw *xgene_dma_alloc_descriptor( + struct xgene_dma_chan *chan) +{ + struct xgene_dma_desc_sw *desc; + dma_addr_t phys; + + desc = dma_pool_alloc(chan->desc_pool, GFP_NOWAIT, &phys); + if (!desc) { + chan_err(chan, "Failed to allocate LDs\n"); + return NULL; + } + + memset(desc, 0, sizeof(*desc)); + + INIT_LIST_HEAD(&desc->tx_list); + desc->tx.phys = phys; + desc->tx.tx_submit = xgene_dma_tx_submit; + dma_async_tx_descriptor_init(&desc->tx, &chan->dma_chan); + + chan_dbg(chan, "LD %p allocated\n", desc); + + return desc; +} + +/** + * xgene_dma_clean_completed_descriptor - free all descriptors which + * has been completed and acked + * @chan: X-Gene DMA channel + * + * This function is used on all completed and acked descriptors. + */ +static void xgene_dma_clean_completed_descriptor(struct xgene_dma_chan *chan) +{ + struct xgene_dma_desc_sw *desc, *_desc; + + /* Run the callback for each descriptor, in order */ + list_for_each_entry_safe(desc, _desc, &chan->ld_completed, node) { + if (async_tx_test_ack(&desc->tx)) + xgene_dma_clean_descriptor(chan, desc); + } +} + +/** + * xgene_dma_run_tx_complete_actions - cleanup a single link descriptor + * @chan: X-Gene DMA channel + * @desc: descriptor to cleanup and free + * + * This function is used on a descriptor which has been executed by the DMA + * controller. It will run any callbacks, submit any dependencies. 
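+ * Intermediate descriptors of a multi-descriptor transaction carry a
+ * zero cookie and are skipped here; only the final descriptor completes
+ * the cookie and runs the client callback.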
+ */ +static void xgene_dma_run_tx_complete_actions(struct xgene_dma_chan *chan, + struct xgene_dma_desc_sw *desc) +{ + struct dma_async_tx_descriptor *tx = &desc->tx; + + /* + * If this is not the last transaction in the group, + * then no need to complete cookie and run any callback as + * this is not the tx_descriptor which had been sent to caller + * of this DMA request + */ + + if (tx->cookie == 0) + return; + + dma_cookie_complete(tx); + + /* Run the link descriptor callback function */ + if (tx->callback) + tx->callback(tx->callback_param); + + dma_descriptor_unmap(tx); + + /* Run any dependencies */ + dma_run_dependencies(tx); +} + +/** + * xgene_dma_clean_running_descriptor - move the completed descriptor from + * ld_running to ld_completed + * @chan: X-Gene DMA channel + * @desc: the descriptor which is completed + * + * Free the descriptor directly if acked by async_tx api, + * else move it to queue ld_completed. + */ +static void xgene_dma_clean_running_descriptor(struct xgene_dma_chan *chan, + struct xgene_dma_desc_sw *desc) +{ + /* Remove from the list of running transactions */ + list_del(&desc->node); + + /* + * the client is allowed to attach dependent operations + * until 'ack' is set + */ + if (!async_tx_test_ack(&desc->tx)) { + /* + * Move this descriptor to the list of descriptors which is + * completed, but still awaiting the 'ack' bit to be set. + */ + list_add_tail(&desc->node, &chan->ld_completed); + return; + } + + chan_dbg(chan, "LD %p free\n", desc); + dma_pool_free(chan->desc_pool, desc, desc->tx.phys); +} + +static int xgene_chan_xfer_request(struct xgene_dma_ring *ring, + struct xgene_dma_desc_sw *desc_sw) +{ + struct xgene_dma_desc_hw *desc_hw; + + /* Check if can push more descriptor to hw for execution */ + if (xgene_dma_ring_desc_cnt(ring) > (ring->slots - 2)) + return -EBUSY; + + /* Get hw descriptor from DMA tx ring */ + desc_hw = &ring->desc_hw[ring->head]; + + /* + * Increment the head count to point next + * descriptor for next time + */ + if (++ring->head == ring->slots) + ring->head = 0; + + /* Copy prepared sw descriptor data to hw descriptor */ + memcpy(desc_hw, &desc_sw->desc1, sizeof(*desc_hw)); + + /* + * Check if we have prepared 64B descriptor, + * in this case we need one more hw descriptor + */ + if (desc_sw->flags & XGENE_DMA_FLAG_64B_DESC) { + desc_hw = &ring->desc_hw[ring->head]; + + if (++ring->head == ring->slots) + ring->head = 0; + + memcpy(desc_hw, &desc_sw->desc2, sizeof(*desc_hw)); + } + + /* Notify the hw that we have descriptor ready for execution */ + iowrite32((desc_sw->flags & XGENE_DMA_FLAG_64B_DESC) ? 
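+ /* the value written to the ring command register is the number of
+ * hw descriptor slots consumed: two when the 64B descriptor spilled
+ * into a second ring slot, else one */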
+		  2 : 1, ring->cmd);
+
+	return 0;
+}
+
+/**
+ * xgene_chan_xfer_ld_pending - push any pending transactions to hw
+ * @chan: X-Gene DMA channel
+ *
+ * LOCKING: must hold chan->lock
+ */
+static void xgene_chan_xfer_ld_pending(struct xgene_dma_chan *chan)
+{
+	struct xgene_dma_desc_sw *desc_sw, *_desc_sw;
+	int ret;
+
+	/*
+	 * If the list of pending descriptors is empty, then we
+	 * don't need to do any work at all
+	 */
+	if (list_empty(&chan->ld_pending)) {
+		chan_dbg(chan, "No pending LDs\n");
+		return;
+	}
+
+	/*
+	 * Move elements from the queue of pending transactions onto the list
+	 * of running transactions and push them to hw for further execution
+	 */
+	list_for_each_entry_safe(desc_sw, _desc_sw, &chan->ld_pending, node) {
+		/*
+		 * Check if we have already pushed as many transactions to
+		 * hw as it can take; if so, stop here and push the
+		 * remaining elements from the pending ld queue once some
+		 * of the descriptors we have already pushed have completed
+		 */
+		if (chan->pending >= chan->max_outstanding)
+			return;
+
+		ret = xgene_chan_xfer_request(&chan->tx_ring, desc_sw);
+		if (ret)
+			return;
+
+		/*
+		 * Delete this element from the ld pending queue and append
+		 * it to the ld running queue
+		 */
+		list_move_tail(&desc_sw->node, &chan->ld_running);
+
+		/* Increment the pending transaction count */
+		chan->pending++;
+	}
+}
+
+/**
+ * xgene_dma_cleanup_descriptors - clean up completed link descriptors and
+ * move them to ld_completed, where they are freed once the 'ack' flag is set
+ * @chan: X-Gene DMA channel
+ *
+ * This function is used on descriptors which have been executed by the DMA
+ * controller. It will run any callbacks, submit any dependencies, then
+ * free these descriptors if the 'ack' flag is set.
+ */
+static void xgene_dma_cleanup_descriptors(struct xgene_dma_chan *chan)
+{
+	struct xgene_dma_ring *ring = &chan->rx_ring;
+	struct xgene_dma_desc_sw *desc_sw, *_desc_sw;
+	struct xgene_dma_desc_hw *desc_hw;
+	u8 status;
+
+	/* Clean already completed and acked descriptors */
+	xgene_dma_clean_completed_descriptor(chan);
+
+	/* Run the callback for each descriptor, in order */
+	list_for_each_entry_safe(desc_sw, _desc_sw, &chan->ld_running, node) {
+		/* Get subsequent hw descriptor from DMA rx ring */
+		desc_hw = &ring->desc_hw[ring->head];
+
+		/* Check if this descriptor has been completed */
+		if (unlikely(XGENE_DMA_DESC_IS_EMPTY(desc_hw)))
+			break;
+
+		if (++ring->head == ring->slots)
+			ring->head = 0;
+
+		/* Check if we have any error with DMA transactions */
+		status = XGENE_DMA_DESC_STATUS(
+				XGENE_DMA_DESC_ELERR_RD(le64_to_cpu(
+							desc_hw->m0)),
+				XGENE_DMA_DESC_LERR_RD(le64_to_cpu(
+						       desc_hw->m0)));
+		if (status) {
+			/* Print the DMA error type */
+			chan_err(chan, "%s\n", xgene_dma_desc_err[status]);
+
+			/*
+			 * We have a DMA transaction error here. Dump the DMA Tx
			 * and Rx descriptors for this request */
+			XGENE_DMA_DESC_DUMP(&desc_sw->desc1,
+					    "X-Gene DMA TX DESC1: ");
+
+			if (desc_sw->flags & XGENE_DMA_FLAG_64B_DESC)
+				XGENE_DMA_DESC_DUMP(&desc_sw->desc2,
+						    "X-Gene DMA TX DESC2: ");
+
+			XGENE_DMA_DESC_DUMP(desc_hw,
+					    "X-Gene DMA RX ERR DESC: ");
+		}
+
+		/* Notify the hw about this completed descriptor */
+		iowrite32(-1, ring->cmd);
+
+		/* Mark this hw descriptor as processed */
+		XGENE_DMA_DESC_SET_EMPTY(desc_hw);
+
+		xgene_dma_run_tx_complete_actions(chan, desc_sw);
+
+		xgene_dma_clean_running_descriptor(chan, desc_sw);
+
+		/*
+		 * Decrement the pending transaction count
+		 * as we have processed one
+		 */
+		chan->pending--;
+	}
+
+	/*
+	 * Start any pending transactions automatically.
+	 * In the ideal case, we keep the DMA controller busy while we go
+	 * ahead and free the descriptors below.
+	 */
+	xgene_chan_xfer_ld_pending(chan);
+}
+
+static int xgene_dma_alloc_chan_resources(struct dma_chan *dchan)
+{
+	struct xgene_dma_chan *chan = to_dma_chan(dchan);
+
+	/* Has this channel already been allocated? */
+	if (chan->desc_pool)
+		return 1;
+
+	chan->desc_pool = dma_pool_create(chan->name, chan->dev,
+					  sizeof(struct xgene_dma_desc_sw),
+					  0, 0);
+	if (!chan->desc_pool) {
+		chan_err(chan, "Failed to allocate descriptor pool\n");
+		return -ENOMEM;
+	}
+
+	chan_dbg(chan, "Allocated descriptor pool\n");
+
+	return 1;
+}
+
+/**
+ * xgene_dma_free_desc_list - Free all descriptors in a queue
+ * @chan: X-Gene DMA channel
+ * @list: the list to free
+ *
+ * LOCKING: must hold chan->lock
+ */
+static void xgene_dma_free_desc_list(struct xgene_dma_chan *chan,
+				     struct list_head *list)
+{
+	struct xgene_dma_desc_sw *desc, *_desc;
+
+	list_for_each_entry_safe(desc, _desc, list, node)
+		xgene_dma_clean_descriptor(chan, desc);
+}
+
+static void xgene_dma_free_tx_desc_list(struct xgene_dma_chan *chan,
+					struct list_head *list)
+{
+	struct xgene_dma_desc_sw *desc, *_desc;
+
+	list_for_each_entry_safe(desc, _desc, list, node)
+		xgene_dma_clean_descriptor(chan, desc);
+}
+
+static void xgene_dma_free_chan_resources(struct dma_chan *dchan)
+{
+	struct xgene_dma_chan *chan = to_dma_chan(dchan);
+
+	chan_dbg(chan, "Free all resources\n");
+
+	if (!chan->desc_pool)
+		return;
+
+	spin_lock_bh(&chan->lock);
+
+	/* Process all running descriptors */
+	xgene_dma_cleanup_descriptors(chan);
+
+	/* Clean all link descriptor queues */
+	xgene_dma_free_desc_list(chan, &chan->ld_pending);
+	xgene_dma_free_desc_list(chan, &chan->ld_running);
+	xgene_dma_free_desc_list(chan, &chan->ld_completed);
+
+	spin_unlock_bh(&chan->lock);
+
+	/* Delete this channel's DMA pool */
+	dma_pool_destroy(chan->desc_pool);
+	chan->desc_pool = NULL;
+}
+
+static struct dma_async_tx_descriptor *xgene_dma_prep_memcpy(
+	struct dma_chan *dchan, dma_addr_t dst, dma_addr_t src,
+	size_t len, unsigned long flags)
+{
+	struct xgene_dma_desc_sw *first = NULL, *new;
+	struct xgene_dma_chan *chan;
+	size_t copy;
+
+	if (unlikely(!dchan || !len))
+		return NULL;
+
+	chan = to_dma_chan(dchan);
+
+	do {
+		/* Allocate the link descriptor from DMA pool */
+		new = xgene_dma_alloc_descriptor(chan);
+		if (!new)
+			goto fail;
+
+		/* Create the largest transaction possible */
+		copy = min_t(size_t, len, XGENE_DMA_MAX_64B_DESC_BYTE_CNT);
+
+		/* Prepare DMA descriptor */
+		xgene_dma_prep_cpy_desc(chan, new, dst, src, copy);
+
+		if (!first)
+			first = new;
+
+		new->tx.cookie = 0;
+		async_tx_ack(&new->tx);
+
+		/* Update metadata */
+		len -= copy;
+		dst += copy;
+		src += copy;
+
+		/* Insert the link descriptor to the
LD ring */ + list_add_tail(&new->node, &first->tx_list); + } while (len); + + new->tx.flags = flags; /* client is in control of this ack */ + new->tx.cookie = -EBUSY; + list_splice(&first->tx_list, &new->tx_list); + + return &new->tx; + +fail: + if (!first) + return NULL; + + xgene_dma_free_tx_desc_list(chan, &first->tx_list); + return NULL; +} + +static struct dma_async_tx_descriptor *xgene_dma_prep_sg( + struct dma_chan *dchan, struct scatterlist *dst_sg, + u32 dst_nents, struct scatterlist *src_sg, + u32 src_nents, unsigned long flags) +{ + struct xgene_dma_desc_sw *first = NULL, *new = NULL; + struct xgene_dma_chan *chan; + size_t dst_avail, src_avail; + dma_addr_t dst, src; + size_t len; + + if (unlikely(!dchan)) + return NULL; + + if (unlikely(!dst_nents || !src_nents)) + return NULL; + + if (unlikely(!dst_sg || !src_sg)) + return NULL; + + chan = to_dma_chan(dchan); + + /* Get prepared for the loop */ + dst_avail = sg_dma_len(dst_sg); + src_avail = sg_dma_len(src_sg); + dst_nents--; + src_nents--; + + /* Run until we are out of scatterlist entries */ + while (true) { + /* Create the largest transaction possible */ + len = min_t(size_t, src_avail, dst_avail); + len = min_t(size_t, len, XGENE_DMA_MAX_64B_DESC_BYTE_CNT); + if (len == 0) + goto fetch; + + dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - dst_avail; + src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - src_avail; + + /* Allocate the link descriptor from DMA pool */ + new = xgene_dma_alloc_descriptor(chan); + if (!new) + goto fail; + + /* Prepare DMA descriptor */ + xgene_dma_prep_cpy_desc(chan, new, dst, src, len); + + if (!first) + first = new; + + new->tx.cookie = 0; + async_tx_ack(&new->tx); + + /* update metadata */ + dst_avail -= len; + src_avail -= len; + + /* Insert the link descriptor to the LD ring */ + list_add_tail(&new->node, &first->tx_list); + +fetch: + /* fetch the next dst scatterlist entry */ + if (dst_avail == 0) { + /* no more entries: we're done */ + if (dst_nents == 0) + break; + + /* fetch the next entry: if there are no more: done */ + dst_sg = sg_next(dst_sg); + if (!dst_sg) + break; + + dst_nents--; + dst_avail = sg_dma_len(dst_sg); + } + + /* fetch the next src scatterlist entry */ + if (src_avail == 0) { + /* no more entries: we're done */ + if (src_nents == 0) + break; + + /* fetch the next entry: if there are no more: done */ + src_sg = sg_next(src_sg); + if (!src_sg) + break; + + src_nents--; + src_avail = sg_dma_len(src_sg); + } + } + + if (!new) + return NULL; + + new->tx.flags = flags; /* client is in control of this ack */ + new->tx.cookie = -EBUSY; + list_splice(&first->tx_list, &new->tx_list); + + return &new->tx; +fail: + if (!first) + return NULL; + + xgene_dma_free_tx_desc_list(chan, &first->tx_list); + return NULL; +} + +static struct dma_async_tx_descriptor *xgene_dma_prep_xor( + struct dma_chan *dchan, dma_addr_t dst, dma_addr_t *src, + u32 src_cnt, size_t len, unsigned long flags) +{ + struct xgene_dma_desc_sw *first = NULL, *new; + struct xgene_dma_chan *chan; + static u8 multi[XGENE_DMA_MAX_XOR_SRC] = { + 0x01, 0x01, 0x01, 0x01, 0x01}; + + if (unlikely(!dchan || !len)) + return NULL; + + chan = to_dma_chan(dchan); + + do { + /* Allocate the link descriptor from DMA pool */ + new = xgene_dma_alloc_descriptor(chan); + if (!new) + goto fail; + + /* Prepare xor DMA descriptor */ + xgene_dma_prep_xor_desc(chan, new, &dst, src, + src_cnt, &len, multi); + + if (!first) + first = new; + + new->tx.cookie = 0; + async_tx_ack(&new->tx); + + /* Insert the link descriptor to the LD ring 
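		 * (the chain is collected on first->tx_list). Illustrative
+		 * only: the async_tx layer typically drives this prep
+		 * routine via something like the following, where
+		 * "dest_page"/"src_pages" are hypothetical pages supplied
+		 * by that layer:
+		 *
+		 *	struct async_submit_ctl submit;
+		 *
+		 *	init_async_submit(&submit, ASYNC_TX_ACK, NULL,
+		 *			  NULL, NULL, NULL);
+		 *	tx = async_xor(dest_page, src_pages, 0, src_cnt,
+		 *		       len, &submit);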
		 */
+		list_add_tail(&new->node, &first->tx_list);
+	} while (len);
+
+	new->tx.flags = flags; /* client is in control of this ack */
+	new->tx.cookie = -EBUSY;
+	list_splice(&first->tx_list, &new->tx_list);
+
+	return &new->tx;
+
+fail:
+	if (!first)
+		return NULL;
+
+	xgene_dma_free_tx_desc_list(chan, &first->tx_list);
+	return NULL;
+}
+
+static struct dma_async_tx_descriptor *xgene_dma_prep_pq(
+	struct dma_chan *dchan, dma_addr_t *dst, dma_addr_t *src,
+	u32 src_cnt, const u8 *scf, size_t len, unsigned long flags)
+{
+	struct xgene_dma_desc_sw *first = NULL, *new;
+	struct xgene_dma_chan *chan;
+	size_t _len = len;
+	dma_addr_t _src[XGENE_DMA_MAX_XOR_SRC];
+	static u8 multi[XGENE_DMA_MAX_XOR_SRC] = {0x01, 0x01, 0x01, 0x01, 0x01};
+
+	if (unlikely(!dchan || !len))
+		return NULL;
+
+	chan = to_dma_chan(dchan);
+
+	/*
+	 * Save the source addresses in a local variable; we may have to
+	 * prepare two descriptors to generate P and Q if both are enabled
+	 * in the flags by the client
+	 */
+	memcpy(_src, src, sizeof(*src) * src_cnt);
+
+	if (flags & DMA_PREP_PQ_DISABLE_P)
+		len = 0;
+
+	if (flags & DMA_PREP_PQ_DISABLE_Q)
+		_len = 0;
+
+	do {
+		/* Allocate the link descriptor from DMA pool */
+		new = xgene_dma_alloc_descriptor(chan);
+		if (!new)
+			goto fail;
+
+		if (!first)
+			first = new;
+
+		new->tx.cookie = 0;
+		async_tx_ack(&new->tx);
+
+		/* Insert the link descriptor to the LD ring */
+		list_add_tail(&new->node, &first->tx_list);
+
+		/*
+		 * Prepare DMA descriptor to generate P,
+		 * if the DMA_PREP_PQ_DISABLE_P flag is not set
+		 */
+		if (len) {
+			xgene_dma_prep_xor_desc(chan, new, &dst[0], src,
+						src_cnt, &len, multi);
+			continue;
+		}
+
+		/*
+		 * Prepare DMA descriptor to generate Q,
+		 * if the DMA_PREP_PQ_DISABLE_Q flag is not set
+		 */
+		if (_len) {
+			xgene_dma_prep_xor_desc(chan, new, &dst[1], _src,
+						src_cnt, &_len, scf);
+		}
+	} while (len || _len);
+
+	new->tx.flags = flags; /* client is in control of this ack */
+	new->tx.cookie = -EBUSY;
+	list_splice(&first->tx_list, &new->tx_list);
+
+	return &new->tx;
+
+fail:
+	if (!first)
+		return NULL;
+
+	xgene_dma_free_tx_desc_list(chan, &first->tx_list);
+	return NULL;
+}
+
+static void xgene_dma_issue_pending(struct dma_chan *dchan)
+{
+	struct xgene_dma_chan *chan = to_dma_chan(dchan);
+
+	spin_lock_bh(&chan->lock);
+	xgene_chan_xfer_ld_pending(chan);
+	spin_unlock_bh(&chan->lock);
+}
+
+static enum dma_status xgene_dma_tx_status(struct dma_chan *dchan,
+					   dma_cookie_t cookie,
+					   struct dma_tx_state *txstate)
+{
+	return dma_cookie_status(dchan, cookie, txstate);
+}
+
+static void xgene_dma_tasklet_cb(unsigned long data)
+{
+	struct xgene_dma_chan *chan = (struct xgene_dma_chan *)data;
+
+	spin_lock_bh(&chan->lock);
+
+	/* Run all cleanup for descriptors which have been completed */
+	xgene_dma_cleanup_descriptors(chan);
+
+	/* Re-enable DMA channel IRQ */
+	enable_irq(chan->rx_irq);
+
+	spin_unlock_bh(&chan->lock);
+}
+
+static irqreturn_t xgene_dma_chan_ring_isr(int irq, void *id)
+{
+	struct xgene_dma_chan *chan = (struct xgene_dma_chan *)id;
+
+	BUG_ON(!chan);
+
+	/*
+	 * Disable DMA channel IRQ until we process completed
+	 * descriptors
+	 */
+	disable_irq_nosync(chan->rx_irq);
+
+	/*
+	 * Schedule the tasklet to handle all cleanup of the current
+	 * transaction. It will start a new transaction if there is
+	 * one pending.
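+	 *
+	 * The irq/tasklet split implemented by this driver, in short:
+	 *
+	 *	hard irq:  disable_irq_nosync(chan->rx_irq);
+	 *	           tasklet_schedule(&chan->tasklet);
+	 *	tasklet:   xgene_dma_cleanup_descriptors(chan);
+	 *	           enable_irq(chan->rx_irq);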
+ */ + tasklet_schedule(&chan->tasklet); + + return IRQ_HANDLED; +} + +static irqreturn_t xgene_dma_err_isr(int irq, void *id) +{ + struct xgene_dma *pdma = (struct xgene_dma *)id; + unsigned long int_mask; + u32 val, i; + + val = ioread32(pdma->csr_dma + XGENE_DMA_INT); + + /* Clear DMA interrupts */ + iowrite32(val, pdma->csr_dma + XGENE_DMA_INT); + + /* Print DMA error info */ + int_mask = val >> XGENE_DMA_INT_MASK_SHIFT; + for_each_set_bit(i, &int_mask, ARRAY_SIZE(xgene_dma_err)) + dev_err(pdma->dev, + "Interrupt status 0x%08X %s\n", val, xgene_dma_err[i]); + + return IRQ_HANDLED; +} + +static void xgene_dma_wr_ring_state(struct xgene_dma_ring *ring) +{ + int i; + + iowrite32(ring->num, ring->pdma->csr_ring + XGENE_DMA_RING_STATE); + + for (i = 0; i < XGENE_DMA_RING_NUM_CONFIG; i++) + iowrite32(ring->state[i], ring->pdma->csr_ring + + XGENE_DMA_RING_STATE_WR_BASE + (i * 4)); +} + +static void xgene_dma_clr_ring_state(struct xgene_dma_ring *ring) +{ + memset(ring->state, 0, sizeof(u32) * XGENE_DMA_RING_NUM_CONFIG); + xgene_dma_wr_ring_state(ring); +} + +static void xgene_dma_setup_ring(struct xgene_dma_ring *ring) +{ + void *ring_cfg = ring->state; + u64 addr = ring->desc_paddr; + void *desc; + u32 i, val; + + ring->slots = ring->size / XGENE_DMA_RING_WQ_DESC_SIZE; + + /* Clear DMA ring state */ + xgene_dma_clr_ring_state(ring); + + /* Set DMA ring type */ + XGENE_DMA_RING_TYPE_SET(ring_cfg, XGENE_DMA_RING_TYPE_REGULAR); + + if (ring->owner == XGENE_DMA_RING_OWNER_DMA) { + /* Set recombination buffer and timeout */ + XGENE_DMA_RING_RECOMBBUF_SET(ring_cfg); + XGENE_DMA_RING_RECOMTIMEOUTL_SET(ring_cfg); + XGENE_DMA_RING_RECOMTIMEOUTH_SET(ring_cfg); + } + + /* Initialize DMA ring state */ + XGENE_DMA_RING_SELTHRSH_SET(ring_cfg); + XGENE_DMA_RING_ACCEPTLERR_SET(ring_cfg); + XGENE_DMA_RING_COHERENT_SET(ring_cfg); + XGENE_DMA_RING_ADDRL_SET(ring_cfg, addr); + XGENE_DMA_RING_ADDRH_SET(ring_cfg, addr); + XGENE_DMA_RING_SIZE_SET(ring_cfg, ring->cfgsize); + + /* Write DMA ring configurations */ + xgene_dma_wr_ring_state(ring); + + /* Set DMA ring id */ + iowrite32(XGENE_DMA_RING_ID_SETUP(ring->id), + ring->pdma->csr_ring + XGENE_DMA_RING_ID); + + /* Set DMA ring buffer */ + iowrite32(XGENE_DMA_RING_ID_BUF_SETUP(ring->num), + ring->pdma->csr_ring + XGENE_DMA_RING_ID_BUF); + + if (ring->owner != XGENE_DMA_RING_OWNER_CPU) + return; + + /* Set empty signature to DMA Rx ring descriptors */ + for (i = 0; i < ring->slots; i++) { + desc = &ring->desc_hw[i]; + XGENE_DMA_DESC_SET_EMPTY(desc); + } + + /* Enable DMA Rx ring interrupt */ + val = ioread32(ring->pdma->csr_ring + XGENE_DMA_RING_NE_INT_MODE); + XGENE_DMA_RING_NE_INT_MODE_SET(val, ring->buf_num); + iowrite32(val, ring->pdma->csr_ring + XGENE_DMA_RING_NE_INT_MODE); +} + +static void xgene_dma_clear_ring(struct xgene_dma_ring *ring) +{ + u32 ring_id, val; + + if (ring->owner == XGENE_DMA_RING_OWNER_CPU) { + /* Disable DMA Rx ring interrupt */ + val = ioread32(ring->pdma->csr_ring + + XGENE_DMA_RING_NE_INT_MODE); + XGENE_DMA_RING_NE_INT_MODE_RESET(val, ring->buf_num); + iowrite32(val, ring->pdma->csr_ring + + XGENE_DMA_RING_NE_INT_MODE); + } + + /* Clear DMA ring state */ + ring_id = XGENE_DMA_RING_ID_SETUP(ring->id); + iowrite32(ring_id, ring->pdma->csr_ring + XGENE_DMA_RING_ID); + + iowrite32(0, ring->pdma->csr_ring + XGENE_DMA_RING_ID_BUF); + xgene_dma_clr_ring_state(ring); +} + +static void xgene_dma_set_ring_cmd(struct xgene_dma_ring *ring) +{ + ring->cmd_base = ring->pdma->csr_ring_cmd + + XGENE_DMA_RING_CMD_BASE_OFFSET((ring->num - + 
XGENE_DMA_RING_NUM)); + + ring->cmd = ring->cmd_base + XGENE_DMA_RING_CMD_OFFSET; +} + +static int xgene_dma_get_ring_size(struct xgene_dma_chan *chan, + enum xgene_dma_ring_cfgsize cfgsize) +{ + int size; + + switch (cfgsize) { + case XGENE_DMA_RING_CFG_SIZE_512B: + size = 0x200; + break; + case XGENE_DMA_RING_CFG_SIZE_2KB: + size = 0x800; + break; + case XGENE_DMA_RING_CFG_SIZE_16KB: + size = 0x4000; + break; + case XGENE_DMA_RING_CFG_SIZE_64KB: + size = 0x10000; + break; + case XGENE_DMA_RING_CFG_SIZE_512KB: + size = 0x80000; + break; + default: + chan_err(chan, "Unsupported cfg ring size %d\n", cfgsize); + return -EINVAL; + } + + return size; +} + +static void xgene_dma_delete_ring_one(struct xgene_dma_ring *ring) +{ + /* Clear DMA ring configurations */ + xgene_dma_clear_ring(ring); + + /* De-allocate DMA ring descriptor */ + if (ring->desc_vaddr) { + dma_free_coherent(ring->pdma->dev, ring->size, + ring->desc_vaddr, ring->desc_paddr); + ring->desc_vaddr = NULL; + } +} + +static void xgene_dma_delete_chan_rings(struct xgene_dma_chan *chan) +{ + xgene_dma_delete_ring_one(&chan->rx_ring); + xgene_dma_delete_ring_one(&chan->tx_ring); +} + +static int xgene_dma_create_ring_one(struct xgene_dma_chan *chan, + struct xgene_dma_ring *ring, + enum xgene_dma_ring_cfgsize cfgsize) +{ + /* Setup DMA ring descriptor variables */ + ring->pdma = chan->pdma; + ring->cfgsize = cfgsize; + ring->num = chan->pdma->ring_num++; + ring->id = XGENE_DMA_RING_ID_GET(ring->owner, ring->buf_num); + + ring->size = xgene_dma_get_ring_size(chan, cfgsize); + if (ring->size <= 0) + return ring->size; + + /* Allocate memory for DMA ring descriptor */ + ring->desc_vaddr = dma_zalloc_coherent(chan->dev, ring->size, + &ring->desc_paddr, GFP_KERNEL); + if (!ring->desc_vaddr) { + chan_err(chan, "Failed to allocate ring desc\n"); + return -ENOMEM; + } + + /* Configure and enable DMA ring */ + xgene_dma_set_ring_cmd(ring); + xgene_dma_setup_ring(ring); + + return 0; +} + +static int xgene_dma_create_chan_rings(struct xgene_dma_chan *chan) +{ + struct xgene_dma_ring *rx_ring = &chan->rx_ring; + struct xgene_dma_ring *tx_ring = &chan->tx_ring; + int ret; + + /* Create DMA Rx ring descriptor */ + rx_ring->owner = XGENE_DMA_RING_OWNER_CPU; + rx_ring->buf_num = XGENE_DMA_CPU_BUFNUM + chan->id; + + ret = xgene_dma_create_ring_one(chan, rx_ring, + XGENE_DMA_RING_CFG_SIZE_64KB); + if (ret) + return ret; + + chan_dbg(chan, "Rx ring id 0x%X num %d desc 0x%p\n", + rx_ring->id, rx_ring->num, rx_ring->desc_vaddr); + + /* Create DMA Tx ring descriptor */ + tx_ring->owner = XGENE_DMA_RING_OWNER_DMA; + tx_ring->buf_num = XGENE_DMA_BUFNUM + chan->id; + + ret = xgene_dma_create_ring_one(chan, tx_ring, + XGENE_DMA_RING_CFG_SIZE_64KB); + if (ret) { + xgene_dma_delete_ring_one(rx_ring); + return ret; + } + + tx_ring->dst_ring_num = XGENE_DMA_RING_DST_ID(rx_ring->num); + + chan_dbg(chan, + "Tx ring id 0x%X num %d desc 0x%p\n", + tx_ring->id, tx_ring->num, tx_ring->desc_vaddr); + + /* Set the max outstanding request possible to this channel */ + chan->max_outstanding = rx_ring->slots; + + return ret; +} + +static int xgene_dma_init_rings(struct xgene_dma *pdma) +{ + int ret, i, j; + + for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) { + ret = xgene_dma_create_chan_rings(&pdma->chan[i]); + if (ret) { + for (j = 0; j < i; j++) + xgene_dma_delete_chan_rings(&pdma->chan[j]); + return ret; + } + } + + return ret; +} + +static void xgene_dma_enable(struct xgene_dma *pdma) +{ + u32 val; + + /* Configure and enable DMA engine */ + val = ioread32(pdma->csr_dma 
+ XGENE_DMA_GCR); + XGENE_DMA_CH_SETUP(val); + XGENE_DMA_ENABLE(val); + iowrite32(val, pdma->csr_dma + XGENE_DMA_GCR); +} + +static void xgene_dma_disable(struct xgene_dma *pdma) +{ + u32 val; + + val = ioread32(pdma->csr_dma + XGENE_DMA_GCR); + XGENE_DMA_DISABLE(val); + iowrite32(val, pdma->csr_dma + XGENE_DMA_GCR); +} + +static void xgene_dma_mask_interrupts(struct xgene_dma *pdma) +{ + /* + * Mask DMA ring overflow, underflow and + * AXI write/read error interrupts + */ + iowrite32(XGENE_DMA_INT_ALL_MASK, + pdma->csr_dma + XGENE_DMA_RING_INT0_MASK); + iowrite32(XGENE_DMA_INT_ALL_MASK, + pdma->csr_dma + XGENE_DMA_RING_INT1_MASK); + iowrite32(XGENE_DMA_INT_ALL_MASK, + pdma->csr_dma + XGENE_DMA_RING_INT2_MASK); + iowrite32(XGENE_DMA_INT_ALL_MASK, + pdma->csr_dma + XGENE_DMA_RING_INT3_MASK); + iowrite32(XGENE_DMA_INT_ALL_MASK, + pdma->csr_dma + XGENE_DMA_RING_INT4_MASK); + + /* Mask DMA error interrupts */ + iowrite32(XGENE_DMA_INT_ALL_MASK, pdma->csr_dma + XGENE_DMA_INT_MASK); +} + +static void xgene_dma_unmask_interrupts(struct xgene_dma *pdma) +{ + /* + * Unmask DMA ring overflow, underflow and + * AXI write/read error interrupts + */ + iowrite32(XGENE_DMA_INT_ALL_UNMASK, + pdma->csr_dma + XGENE_DMA_RING_INT0_MASK); + iowrite32(XGENE_DMA_INT_ALL_UNMASK, + pdma->csr_dma + XGENE_DMA_RING_INT1_MASK); + iowrite32(XGENE_DMA_INT_ALL_UNMASK, + pdma->csr_dma + XGENE_DMA_RING_INT2_MASK); + iowrite32(XGENE_DMA_INT_ALL_UNMASK, + pdma->csr_dma + XGENE_DMA_RING_INT3_MASK); + iowrite32(XGENE_DMA_INT_ALL_UNMASK, + pdma->csr_dma + XGENE_DMA_RING_INT4_MASK); + + /* Unmask DMA error interrupts */ + iowrite32(XGENE_DMA_INT_ALL_UNMASK, + pdma->csr_dma + XGENE_DMA_INT_MASK); +} + +static void xgene_dma_init_hw(struct xgene_dma *pdma) +{ + u32 val; + + /* Associate DMA ring to corresponding ring HW */ + iowrite32(XGENE_DMA_ASSOC_RING_MNGR1, + pdma->csr_dma + XGENE_DMA_CFG_RING_WQ_ASSOC); + + /* Configure RAID6 polynomial control setting */ + if (is_pq_enabled(pdma)) + iowrite32(XGENE_DMA_RAID6_MULTI_CTRL(0x1D), + pdma->csr_dma + XGENE_DMA_RAID6_CONT); + else + dev_info(pdma->dev, "PQ is disabled in HW\n"); + + xgene_dma_enable(pdma); + xgene_dma_unmask_interrupts(pdma); + + /* Get DMA id and version info */ + val = ioread32(pdma->csr_dma + XGENE_DMA_IPBRR); + + /* DMA device info */ + dev_info(pdma->dev, + "X-Gene DMA v%d.%02d.%02d driver registered %d channels", + XGENE_DMA_REV_NO_RD(val), XGENE_DMA_BUS_ID_RD(val), + XGENE_DMA_DEV_ID_RD(val), XGENE_DMA_MAX_CHANNEL); +} + +static int xgene_dma_init_ring_mngr(struct xgene_dma *pdma) +{ + if (ioread32(pdma->csr_ring + XGENE_DMA_RING_CLKEN) && + (!ioread32(pdma->csr_ring + XGENE_DMA_RING_SRST))) + return 0; + + iowrite32(0x3, pdma->csr_ring + XGENE_DMA_RING_CLKEN); + iowrite32(0x0, pdma->csr_ring + XGENE_DMA_RING_SRST); + + /* Bring up memory */ + iowrite32(0x0, pdma->csr_ring + XGENE_DMA_RING_MEM_RAM_SHUTDOWN); + + /* Force a barrier */ + ioread32(pdma->csr_ring + XGENE_DMA_RING_MEM_RAM_SHUTDOWN); + + /* reset may take up to 1ms */ + usleep_range(1000, 1100); + + if (ioread32(pdma->csr_ring + XGENE_DMA_RING_BLK_MEM_RDY) + != XGENE_DMA_RING_BLK_MEM_RDY_VAL) { + dev_err(pdma->dev, + "Failed to release ring mngr memory from shutdown\n"); + return -ENODEV; + } + + /* program threshold set 1 and all hysteresis */ + iowrite32(XGENE_DMA_RING_THRESLD0_SET1_VAL, + pdma->csr_ring + XGENE_DMA_RING_THRESLD0_SET1); + iowrite32(XGENE_DMA_RING_THRESLD1_SET1_VAL, + pdma->csr_ring + XGENE_DMA_RING_THRESLD1_SET1); + iowrite32(XGENE_DMA_RING_HYSTERESIS_VAL, + pdma->csr_ring + 
XGENE_DMA_RING_HYSTERESIS);
+
+	/* Enable QPcore and assign error queue */
+	iowrite32(XGENE_DMA_RING_ENABLE,
+		  pdma->csr_ring + XGENE_DMA_RING_CONFIG);
+
+	return 0;
+}
+
+static int xgene_dma_init_mem(struct xgene_dma *pdma)
+{
+	int ret;
+
+	ret = xgene_dma_init_ring_mngr(pdma);
+	if (ret)
+		return ret;
+
+	/* Bring up memory */
+	iowrite32(0x0, pdma->csr_dma + XGENE_DMA_MEM_RAM_SHUTDOWN);
+
+	/* Force a barrier */
+	ioread32(pdma->csr_dma + XGENE_DMA_MEM_RAM_SHUTDOWN);
+
+	/* reset may take up to 1ms */
+	usleep_range(1000, 1100);
+
+	if (ioread32(pdma->csr_dma + XGENE_DMA_BLK_MEM_RDY)
+		!= XGENE_DMA_BLK_MEM_RDY_VAL) {
+		dev_err(pdma->dev,
+			"Failed to release DMA memory from shutdown\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static int xgene_dma_request_irqs(struct xgene_dma *pdma)
+{
+	struct xgene_dma_chan *chan;
+	int ret, i, j;
+
+	/* Register DMA error irq */
+	ret = devm_request_irq(pdma->dev, pdma->err_irq, xgene_dma_err_isr,
+			       0, "dma_error", pdma);
+	if (ret) {
+		dev_err(pdma->dev,
+			"Failed to register error IRQ %d\n", pdma->err_irq);
+		return ret;
+	}
+
+	/* Register DMA channel rx irq */
+	for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) {
+		chan = &pdma->chan[i];
+		ret = devm_request_irq(chan->dev, chan->rx_irq,
+				       xgene_dma_chan_ring_isr,
+				       0, chan->name, chan);
+		if (ret) {
+			chan_err(chan, "Failed to register Rx IRQ %d\n",
+				 chan->rx_irq);
+			devm_free_irq(pdma->dev, pdma->err_irq, pdma);
+
+			for (j = 0; j < i; j++) {
+				chan = &pdma->chan[j];
+				devm_free_irq(chan->dev, chan->rx_irq, chan);
+			}
+
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void xgene_dma_free_irqs(struct xgene_dma *pdma)
+{
+	struct xgene_dma_chan *chan;
+	int i;
+
+	/* Free DMA device error irq */
+	devm_free_irq(pdma->dev, pdma->err_irq, pdma);
+
+	for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) {
+		chan = &pdma->chan[i];
+		devm_free_irq(chan->dev, chan->rx_irq, chan);
+	}
+}
+
+static void xgene_dma_set_caps(struct xgene_dma_chan *chan,
+			       struct dma_device *dma_dev)
+{
+	/* Initialize DMA device capability mask */
+	dma_cap_zero(dma_dev->cap_mask);
+
+	/* Set DMA device capability */
+	dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask);
+	dma_cap_set(DMA_SG, dma_dev->cap_mask);
+
+	/* The X-Gene SoC DMA engine has channel 0 supporting XOR and
+	 * channel 1 supporting both XOR and PQ. First, the hw provides a
+	 * mechanism to enable/disable PQ/XOR support on channel 1; we can
+	 * check it by reading the SoC efuse register. Second, there is a
+	 * hw errata: if channel 0 and channel 1 simultaneously execute
+	 * XOR and PQ requests, the DMA engine can suddenly hang. So we
+	 * enable XOR on channel 0 only if XOR and PQ support on channel 1
+	 * is disabled.
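+	 *
+	 * Net effect, as implemented below (assuming, as in this driver,
+	 * that XGENE_DMA_XOR_CHANNEL is 0 and XGENE_DMA_PQ_CHANNEL is 1):
+	 *
+	 *	efuse PQ enabled:  chan 0 -> MEMCPY, SG
+	 *	                   chan 1 -> MEMCPY, SG, XOR, PQ
+	 *	efuse PQ disabled: chan 0 -> MEMCPY, SG, XOR
+	 *	                   chan 1 -> MEMCPY, SG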
+ */ + if ((chan->id == XGENE_DMA_PQ_CHANNEL) && + is_pq_enabled(chan->pdma)) { + dma_cap_set(DMA_PQ, dma_dev->cap_mask); + dma_cap_set(DMA_XOR, dma_dev->cap_mask); + } else if ((chan->id == XGENE_DMA_XOR_CHANNEL) && + !is_pq_enabled(chan->pdma)) { + dma_cap_set(DMA_XOR, dma_dev->cap_mask); + } + + /* Set base and prep routines */ + dma_dev->dev = chan->dev; + dma_dev->device_alloc_chan_resources = xgene_dma_alloc_chan_resources; + dma_dev->device_free_chan_resources = xgene_dma_free_chan_resources; + dma_dev->device_issue_pending = xgene_dma_issue_pending; + dma_dev->device_tx_status = xgene_dma_tx_status; + dma_dev->device_prep_dma_memcpy = xgene_dma_prep_memcpy; + dma_dev->device_prep_dma_sg = xgene_dma_prep_sg; + + if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { + dma_dev->device_prep_dma_xor = xgene_dma_prep_xor; + dma_dev->max_xor = XGENE_DMA_MAX_XOR_SRC; + dma_dev->xor_align = XGENE_DMA_XOR_ALIGNMENT; + } + + if (dma_has_cap(DMA_PQ, dma_dev->cap_mask)) { + dma_dev->device_prep_dma_pq = xgene_dma_prep_pq; + dma_dev->max_pq = XGENE_DMA_MAX_XOR_SRC; + dma_dev->pq_align = XGENE_DMA_XOR_ALIGNMENT; + } +} + +static int xgene_dma_async_register(struct xgene_dma *pdma, int id) +{ + struct xgene_dma_chan *chan = &pdma->chan[id]; + struct dma_device *dma_dev = &pdma->dma_dev[id]; + int ret; + + chan->dma_chan.device = dma_dev; + + spin_lock_init(&chan->lock); + INIT_LIST_HEAD(&chan->ld_pending); + INIT_LIST_HEAD(&chan->ld_running); + INIT_LIST_HEAD(&chan->ld_completed); + tasklet_init(&chan->tasklet, xgene_dma_tasklet_cb, + (unsigned long)chan); + + chan->pending = 0; + chan->desc_pool = NULL; + dma_cookie_init(&chan->dma_chan); + + /* Setup dma device capabilities and prep routines */ + xgene_dma_set_caps(chan, dma_dev); + + /* Initialize DMA device list head */ + INIT_LIST_HEAD(&dma_dev->channels); + list_add_tail(&chan->dma_chan.device_node, &dma_dev->channels); + + /* Register with Linux async DMA framework*/ + ret = dma_async_device_register(dma_dev); + if (ret) { + chan_err(chan, "Failed to register async device %d", ret); + tasklet_kill(&chan->tasklet); + + return ret; + } + + /* DMA capability info */ + dev_info(pdma->dev, + "%s: CAPABILITY ( %s%s%s%s)\n", dma_chan_name(&chan->dma_chan), + dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "MEMCPY " : "", + dma_has_cap(DMA_SG, dma_dev->cap_mask) ? "SGCPY " : "", + dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "XOR " : "", + dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? 
"PQ " : ""); + + return 0; +} + +static int xgene_dma_init_async(struct xgene_dma *pdma) +{ + int ret, i, j; + + for (i = 0; i < XGENE_DMA_MAX_CHANNEL ; i++) { + ret = xgene_dma_async_register(pdma, i); + if (ret) { + for (j = 0; j < i; j++) { + dma_async_device_unregister(&pdma->dma_dev[j]); + tasklet_kill(&pdma->chan[j].tasklet); + } + + return ret; + } + } + + return ret; +} + +static void xgene_dma_async_unregister(struct xgene_dma *pdma) +{ + int i; + + for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) + dma_async_device_unregister(&pdma->dma_dev[i]); +} + +static void xgene_dma_init_channels(struct xgene_dma *pdma) +{ + struct xgene_dma_chan *chan; + int i; + + pdma->ring_num = XGENE_DMA_RING_NUM; + + for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) { + chan = &pdma->chan[i]; + chan->dev = pdma->dev; + chan->pdma = pdma; + chan->id = i; + snprintf(chan->name, sizeof(chan->name), "dmachan%d", chan->id); + } +} + +static int xgene_dma_get_resources(struct platform_device *pdev, + struct xgene_dma *pdma) +{ + struct resource *res; + int irq, i; + + /* Get DMA csr region */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "Failed to get csr region\n"); + return -ENXIO; + } + + pdma->csr_dma = devm_ioremap(&pdev->dev, res->start, + resource_size(res)); + if (!pdma->csr_dma) { + dev_err(&pdev->dev, "Failed to ioremap csr region"); + return -ENOMEM; + } + + /* Get DMA ring csr region */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (!res) { + dev_err(&pdev->dev, "Failed to get ring csr region\n"); + return -ENXIO; + } + + pdma->csr_ring = devm_ioremap(&pdev->dev, res->start, + resource_size(res)); + if (!pdma->csr_ring) { + dev_err(&pdev->dev, "Failed to ioremap ring csr region"); + return -ENOMEM; + } + + /* Get DMA ring cmd csr region */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 2); + if (!res) { + dev_err(&pdev->dev, "Failed to get ring cmd csr region\n"); + return -ENXIO; + } + + pdma->csr_ring_cmd = devm_ioremap(&pdev->dev, res->start, + resource_size(res)); + if (!pdma->csr_ring_cmd) { + dev_err(&pdev->dev, "Failed to ioremap ring cmd csr region"); + return -ENOMEM; + } + + /* Get efuse csr region */ + res = platform_get_resource(pdev, IORESOURCE_MEM, 3); + if (!res) { + dev_err(&pdev->dev, "Failed to get efuse csr region\n"); + return -ENXIO; + } + + pdma->csr_efuse = devm_ioremap(&pdev->dev, res->start, + resource_size(res)); + if (!pdma->csr_efuse) { + dev_err(&pdev->dev, "Failed to ioremap efuse csr region"); + return -ENOMEM; + } + + /* Get DMA error interrupt */ + irq = platform_get_irq(pdev, 0); + if (irq <= 0) { + dev_err(&pdev->dev, "Failed to get Error IRQ\n"); + return -ENXIO; + } + + pdma->err_irq = irq; + + /* Get DMA Rx ring descriptor interrupts for all DMA channels */ + for (i = 1; i <= XGENE_DMA_MAX_CHANNEL; i++) { + irq = platform_get_irq(pdev, i); + if (irq <= 0) { + dev_err(&pdev->dev, "Failed to get Rx IRQ\n"); + return -ENXIO; + } + + pdma->chan[i - 1].rx_irq = irq; + } + + return 0; +} + +static int xgene_dma_probe(struct platform_device *pdev) +{ + struct xgene_dma *pdma; + int ret, i; + + pdma = devm_kzalloc(&pdev->dev, sizeof(*pdma), GFP_KERNEL); + if (!pdma) + return -ENOMEM; + + pdma->dev = &pdev->dev; + platform_set_drvdata(pdev, pdma); + + ret = xgene_dma_get_resources(pdev, pdma); + if (ret) + return ret; + + pdma->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(pdma->clk)) { + dev_err(&pdev->dev, "Failed to get clk\n"); + return PTR_ERR(pdma->clk); + } + + /* Enable clk before accessing 
registers */
+	ret = clk_prepare_enable(pdma->clk);
+	if (ret) {
+		dev_err(&pdev->dev, "Failed to enable clk %d\n", ret);
+		return ret;
+	}
+
+	/* Bring DMA RAM out of shutdown */
+	ret = xgene_dma_init_mem(pdma);
+	if (ret)
+		goto err_clk_enable;
+
+	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(42));
+	if (ret) {
+		dev_err(&pdev->dev, "No usable DMA configuration\n");
+		goto err_dma_mask;
+	}
+
+	/* Initialize DMA channels software state */
+	xgene_dma_init_channels(pdma);
+
+	/* Configure DMA rings */
+	ret = xgene_dma_init_rings(pdma);
+	if (ret)
+		goto err_clk_enable;
+
+	ret = xgene_dma_request_irqs(pdma);
+	if (ret)
+		goto err_request_irq;
+
+	/* Configure and enable DMA engine */
+	xgene_dma_init_hw(pdma);
+
+	/* Register DMA device with linux async framework */
+	ret = xgene_dma_init_async(pdma);
+	if (ret)
+		goto err_async_init;
+
+	return 0;
+
+err_async_init:
+	xgene_dma_free_irqs(pdma);
+
+err_request_irq:
+	for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++)
+		xgene_dma_delete_chan_rings(&pdma->chan[i]);
+
+err_dma_mask:
+err_clk_enable:
+	clk_disable_unprepare(pdma->clk);
+
+	return ret;
+}
+
+static int xgene_dma_remove(struct platform_device *pdev)
+{
+	struct xgene_dma *pdma = platform_get_drvdata(pdev);
+	struct xgene_dma_chan *chan;
+	int i;
+
+	xgene_dma_async_unregister(pdma);
+
+	/* Mask interrupts and disable DMA engine */
+	xgene_dma_mask_interrupts(pdma);
+	xgene_dma_disable(pdma);
+	xgene_dma_free_irqs(pdma);
+
+	for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) {
+		chan = &pdma->chan[i];
+		tasklet_kill(&chan->tasklet);
+		xgene_dma_delete_chan_rings(chan);
+	}
+
+	clk_disable_unprepare(pdma->clk);
+
+	return 0;
+}
+
+static const struct of_device_id xgene_dma_of_match_ptr[] = {
+	{.compatible = "apm,xgene-storm-dma",},
+	{},
+};
+MODULE_DEVICE_TABLE(of, xgene_dma_of_match_ptr);
+
+static struct platform_driver xgene_dma_driver = {
+	.probe = xgene_dma_probe,
+	.remove = xgene_dma_remove,
+	.driver = {
+		.name = "X-Gene-DMA",
+		.of_match_table = xgene_dma_of_match_ptr,
+	},
+};
+
+module_platform_driver(xgene_dma_driver);
+
+MODULE_DESCRIPTION("APM X-Gene SoC DMA driver");
+MODULE_AUTHOR("Rameshwar Prasad Sahu <rsahu@apm.com>");
+MODULE_AUTHOR("Loc Ho <lho@apm.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
diff --git a/drivers/dma/xilinx/xilinx_vdma.c b/drivers/dma/xilinx/xilinx_vdma.c
index bdd2a5dd7220..d8434d465885 100644
--- a/drivers/dma/xilinx/xilinx_vdma.c
+++ b/drivers/dma/xilinx/xilinx_vdma.c
@@ -22,9 +22,9 @@
 * (at your option) any later version.
*/ -#include <linux/amba/xilinx_dma.h> #include <linux/bitops.h> #include <linux/dmapool.h> +#include <linux/dma/xilinx_dma.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index ef5feeecec84..580e10acaa3a 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -538,8 +538,14 @@ struct dma_buf * armada_gem_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags) { - return dma_buf_export(obj, &armada_gem_prime_dmabuf_ops, obj->size, - O_RDWR, NULL); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &armada_gem_prime_dmabuf_ops; + exp_info.size = obj->size; + exp_info.flags = O_RDWR; + exp_info.priv = obj; + + return dma_buf_export(&exp_info); } struct drm_gem_object * diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 7482b06cd08f..7fec191b45f7 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -339,13 +339,17 @@ static const struct dma_buf_ops drm_gem_prime_dmabuf_ops = { struct dma_buf *drm_gem_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags) { - struct reservation_object *robj = NULL; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &drm_gem_prime_dmabuf_ops; + exp_info.size = obj->size; + exp_info.flags = flags; + exp_info.priv = obj; if (dev->driver->gem_prime_res_obj) - robj = dev->driver->gem_prime_res_obj(obj); + exp_info.resv = dev->driver->gem_prime_res_obj(obj); - return dma_buf_export(obj, &drm_gem_prime_dmabuf_ops, obj->size, - flags, robj); + return dma_buf_export(&exp_info); } EXPORT_SYMBOL(drm_gem_prime_export); diff --git a/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c b/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c index 3833bf8ca025..cd485c091b30 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c @@ -185,9 +185,14 @@ struct dma_buf *exynos_dmabuf_prime_export(struct drm_device *drm_dev, struct drm_gem_object *obj, int flags) { struct exynos_drm_gem_obj *exynos_gem_obj = to_exynos_gem_obj(obj); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); - return dma_buf_export(obj, &exynos_dmabuf_ops, - exynos_gem_obj->base.size, flags, NULL); + exp_info.ops = &exynos_dmabuf_ops; + exp_info.size = exynos_gem_obj->base.size; + exp_info.flags = flags; + exp_info.priv = obj; + + return dma_buf_export(&exp_info); } struct drm_gem_object *exynos_dmabuf_prime_import(struct drm_device *drm_dev, diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index c24c3f1ff8a3..c302ffb5a168 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1038,7 +1038,7 @@ static void vlv_save_gunit_s0ix_state(struct drm_i915_private *dev_priv) s->lra_limits[i] = I915_READ(GEN7_LRA_LIMITS_BASE + i * 4); s->media_max_req_count = I915_READ(GEN7_MEDIA_MAX_REQ_COUNT); - s->gfx_max_req_count = I915_READ(GEN7_MEDIA_MAX_REQ_COUNT); + s->gfx_max_req_count = I915_READ(GEN7_GFX_MAX_REQ_COUNT); s->render_hwsp = I915_READ(RENDER_HWS_PGA_GEN7); s->ecochk = I915_READ(GAM_ECOCHK); @@ -1120,7 +1120,7 @@ static void vlv_restore_gunit_s0ix_state(struct drm_i915_private *dev_priv) I915_WRITE(GEN7_LRA_LIMITS_BASE + i * 4, s->lra_limits[i]); I915_WRITE(GEN7_MEDIA_MAX_REQ_COUNT, s->media_max_req_count); - I915_WRITE(GEN7_MEDIA_MAX_REQ_COUNT, s->gfx_max_req_count); + I915_WRITE(GEN7_GFX_MAX_REQ_COUNT, s->gfx_max_req_count); I915_WRITE(RENDER_HWS_PGA_GEN7, s->render_hwsp); 
I915_WRITE(GAM_ECOCHK, s->ecochk); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d07c0b1fb498..53394f998a1f 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2377,10 +2377,11 @@ int __i915_add_request(struct intel_engine_cs *ring, ret = ring->add_request(ring); if (ret) return ret; + + request->tail = intel_ring_get_tail(ringbuf); } request->head = request_start; - request->tail = intel_ring_get_tail(ringbuf); /* Whilst this request exists, batch_obj will be on the * active_list, and so will hold the active reference. Only when this diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c index 82a1f4b57778..7998da27c500 100644 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c @@ -230,6 +230,13 @@ struct dma_buf *i915_gem_prime_export(struct drm_device *dev, struct drm_gem_object *gem_obj, int flags) { struct drm_i915_gem_object *obj = to_intel_bo(gem_obj); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &i915_dmabuf_ops; + exp_info.size = gem_obj->size; + exp_info.flags = flags; + exp_info.priv = gem_obj; + if (obj->ops->dmabuf_export) { int ret = obj->ops->dmabuf_export(obj); @@ -237,8 +244,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_device *dev, return ERR_PTR(ret); } - return dma_buf_export(gem_obj, &i915_dmabuf_ops, gem_obj->size, flags, - NULL); + return dma_buf_export(&exp_info); } static int i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index b522eb6e59a4..3da1af46625c 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -1807,6 +1807,7 @@ enum skl_disp_power_wells { #define GMBUS_CYCLE_INDEX (2<<25) #define GMBUS_CYCLE_STOP (4<<25) #define GMBUS_BYTE_COUNT_SHIFT 16 +#define GMBUS_BYTE_COUNT_MAX 256U #define GMBUS_SLAVE_INDEX_SHIFT 8 #define GMBUS_SLAVE_ADDR_SHIFT 1 #define GMBUS_SLAVE_READ (1<<0) diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c index b31088a551f2..56e437e31580 100644 --- a/drivers/gpu/drm/i915/intel_i2c.c +++ b/drivers/gpu/drm/i915/intel_i2c.c @@ -270,18 +270,17 @@ gmbus_wait_idle(struct drm_i915_private *dev_priv) } static int -gmbus_xfer_read(struct drm_i915_private *dev_priv, struct i2c_msg *msg, - u32 gmbus1_index) +gmbus_xfer_read_chunk(struct drm_i915_private *dev_priv, + unsigned short addr, u8 *buf, unsigned int len, + u32 gmbus1_index) { int reg_offset = dev_priv->gpio_mmio_base; - u16 len = msg->len; - u8 *buf = msg->buf; I915_WRITE(GMBUS1 + reg_offset, gmbus1_index | GMBUS_CYCLE_WAIT | (len << GMBUS_BYTE_COUNT_SHIFT) | - (msg->addr << GMBUS_SLAVE_ADDR_SHIFT) | + (addr << GMBUS_SLAVE_ADDR_SHIFT) | GMBUS_SLAVE_READ | GMBUS_SW_RDY); while (len) { int ret; @@ -303,11 +302,35 @@ gmbus_xfer_read(struct drm_i915_private *dev_priv, struct i2c_msg *msg, } static int -gmbus_xfer_write(struct drm_i915_private *dev_priv, struct i2c_msg *msg) +gmbus_xfer_read(struct drm_i915_private *dev_priv, struct i2c_msg *msg, + u32 gmbus1_index) { - int reg_offset = dev_priv->gpio_mmio_base; - u16 len = msg->len; u8 *buf = msg->buf; + unsigned int rx_size = msg->len; + unsigned int len; + int ret; + + do { + len = min(rx_size, GMBUS_BYTE_COUNT_MAX); + + ret = gmbus_xfer_read_chunk(dev_priv, msg->addr, + buf, len, gmbus1_index); + if (ret) + return ret; + + rx_size -= len; + buf += len; + } while (rx_size != 0); + + return 0; +} + +static int 
+gmbus_xfer_write_chunk(struct drm_i915_private *dev_priv, + unsigned short addr, u8 *buf, unsigned int len) +{ + int reg_offset = dev_priv->gpio_mmio_base; + unsigned int chunk_size = len; u32 val, loop; val = loop = 0; @@ -319,8 +342,8 @@ gmbus_xfer_write(struct drm_i915_private *dev_priv, struct i2c_msg *msg) I915_WRITE(GMBUS3 + reg_offset, val); I915_WRITE(GMBUS1 + reg_offset, GMBUS_CYCLE_WAIT | - (msg->len << GMBUS_BYTE_COUNT_SHIFT) | - (msg->addr << GMBUS_SLAVE_ADDR_SHIFT) | + (chunk_size << GMBUS_BYTE_COUNT_SHIFT) | + (addr << GMBUS_SLAVE_ADDR_SHIFT) | GMBUS_SLAVE_WRITE | GMBUS_SW_RDY); while (len) { int ret; @@ -337,6 +360,29 @@ gmbus_xfer_write(struct drm_i915_private *dev_priv, struct i2c_msg *msg) if (ret) return ret; } + + return 0; +} + +static int +gmbus_xfer_write(struct drm_i915_private *dev_priv, struct i2c_msg *msg) +{ + u8 *buf = msg->buf; + unsigned int tx_size = msg->len; + unsigned int len; + int ret; + + do { + len = min(tx_size, GMBUS_BYTE_COUNT_MAX); + + ret = gmbus_xfer_write_chunk(dev_priv, msg->addr, buf, len); + if (ret) + return ret; + + buf += len; + tx_size -= len; + } while (tx_size != 0); + return 0; } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index fcb074bd55dc..09df74b8e917 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -393,6 +393,26 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring) } } + if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) { + /* + * WaIdleLiteRestore: make sure we never cause a lite + * restore with HEAD==TAIL + */ + if (req0 && req0->elsp_submitted) { + /* + * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL + * as we resubmit the request. See gen8_emit_request() + * for where we prepare the padding after the end of the + * request. + */ + struct intel_ringbuffer *ringbuf; + + ringbuf = req0->ctx->engine[ring->id].ringbuf; + req0->tail += 8; + req0->tail &= ringbuf->size - 1; + } + } + WARN_ON(req1 && req1->elsp_submitted); execlists_submit_contexts(ring, req0->ctx, req0->tail, @@ -1315,7 +1335,12 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf, u32 cmd; int ret; - ret = intel_logical_ring_begin(ringbuf, request->ctx, 6); + /* + * Reserve space for 2 NOOPs at the end of each request to be + * used as a workaround for not being allowed to do lite + * restore with HEAD==TAIL (WaIdleLiteRestore). + */ + ret = intel_logical_ring_begin(ringbuf, request->ctx, 8); if (ret) return ret; @@ -1333,6 +1358,14 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf, intel_logical_ring_emit(ringbuf, MI_NOOP); intel_logical_ring_advance_and_submit(ringbuf, request->ctx, request); + /* + * Here we add two extra NOOPs as padding to avoid + * lite restore of a context with HEAD==TAIL. 
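+	 *
+	 * On resubmission, execlists_context_unqueue() steps over this
+	 * padding (restating the hunk earlier in this patch; 8 bytes is
+	 * two MI_NOOP dwords):
+	 *
+	 *	req0->tail += 8;
+	 *	req0->tail &= ringbuf->size - 1;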
+ */ + intel_logical_ring_emit(ringbuf, MI_NOOP); + intel_logical_ring_emit(ringbuf, MI_NOOP); + intel_logical_ring_advance(ringbuf); + return 0; } diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c index b46dabd9faf7..344fd789170d 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c @@ -171,7 +171,14 @@ static struct dma_buf_ops omap_dmabuf_ops = { struct dma_buf *omap_gem_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags) { - return dma_buf_export(obj, &omap_dmabuf_ops, obj->size, flags, NULL); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &omap_dmabuf_ops; + exp_info.size = obj->size; + exp_info.flags = flags; + exp_info.priv = obj; + + return dma_buf_export(&exp_info); } struct drm_gem_object *omap_gem_prime_import(struct drm_device *dev, diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index cfb481943b6b..1217272a51f2 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -627,8 +627,14 @@ struct dma_buf *tegra_gem_prime_export(struct drm_device *drm, struct drm_gem_object *gem, int flags) { - return dma_buf_export(gem, &tegra_gem_prime_dmabuf_ops, gem->size, - flags, NULL); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &tegra_gem_prime_dmabuf_ops; + exp_info.size = gem->size; + exp_info.flags = flags; + exp_info.priv = gem; + + return dma_buf_export(&exp_info); } struct drm_gem_object *tegra_gem_prime_import(struct drm_device *drm, diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c index 12c87110db3a..4f5fa8d65fe9 100644 --- a/drivers/gpu/drm/ttm/ttm_object.c +++ b/drivers/gpu/drm/ttm/ttm_object.c @@ -683,6 +683,12 @@ int ttm_prime_handle_to_fd(struct ttm_object_file *tfile, dma_buf = prime->dma_buf; if (!dma_buf || !get_dma_buf_unless_doomed(dma_buf)) { + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &tdev->ops; + exp_info.size = prime->size; + exp_info.flags = flags; + exp_info.priv = prime; /* * Need to create a new dma_buf, with memory accounting. 
@@ -694,8 +700,7 @@ int ttm_prime_handle_to_fd(struct ttm_object_file *tfile, goto out_unref; } - dma_buf = dma_buf_export(prime, &tdev->ops, - prime->size, flags, NULL); + dma_buf = dma_buf_export(&exp_info); if (IS_ERR(dma_buf)) { ret = PTR_ERR(dma_buf); ttm_mem_global_free(tdev->mem_glob, diff --git a/drivers/gpu/drm/udl/udl_dmabuf.c b/drivers/gpu/drm/udl/udl_dmabuf.c index ac8a66b4dfc2..e2243edd1ce3 100644 --- a/drivers/gpu/drm/udl/udl_dmabuf.c +++ b/drivers/gpu/drm/udl/udl_dmabuf.c @@ -202,7 +202,14 @@ static struct dma_buf_ops udl_dmabuf_ops = { struct dma_buf *udl_gem_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags) { - return dma_buf_export(obj, &udl_dmabuf_ops, obj->size, flags, NULL); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &udl_dmabuf_ops; + exp_info.size = obj->size; + exp_info.flags = flags; + exp_info.priv = obj; + + return dma_buf_export(&exp_info); } static int udl_prime_create(struct drm_device *dev, diff --git a/drivers/i2c/busses/i2c-cros-ec-tunnel.c b/drivers/i2c/busses/i2c-cros-ec-tunnel.c index 875c22ae5400..fa8dedd8c3a2 100644 --- a/drivers/i2c/busses/i2c-cros-ec-tunnel.c +++ b/drivers/i2c/busses/i2c-cros-ec-tunnel.c @@ -182,72 +182,41 @@ static int ec_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg i2c_msgs[], const u16 bus_num = bus->remote_bus; int request_len; int response_len; - u8 *request = NULL; - u8 *response = NULL; int result; - struct cros_ec_command msg; + struct cros_ec_command msg = { }; request_len = ec_i2c_count_message(i2c_msgs, num); if (request_len < 0) { dev_warn(dev, "Error constructing message %d\n", request_len); - result = request_len; - goto exit; + return request_len; } + response_len = ec_i2c_count_response(i2c_msgs, num); if (response_len < 0) { /* Unexpected; no errors should come when NULL response */ dev_warn(dev, "Error preparing response %d\n", response_len); - result = response_len; - goto exit; - } - - if (request_len <= ARRAY_SIZE(bus->request_buf)) { - request = bus->request_buf; - } else { - request = kzalloc(request_len, GFP_KERNEL); - if (request == NULL) { - result = -ENOMEM; - goto exit; - } - } - if (response_len <= ARRAY_SIZE(bus->response_buf)) { - response = bus->response_buf; - } else { - response = kzalloc(response_len, GFP_KERNEL); - if (response == NULL) { - result = -ENOMEM; - goto exit; - } + return response_len; } - result = ec_i2c_construct_message(request, i2c_msgs, num, bus_num); + result = ec_i2c_construct_message(msg.outdata, i2c_msgs, num, bus_num); if (result) - goto exit; + return result; msg.version = 0; msg.command = EC_CMD_I2C_PASSTHRU; - msg.outdata = request; msg.outsize = request_len; - msg.indata = response; msg.insize = response_len; result = cros_ec_cmd_xfer(bus->ec, &msg); if (result < 0) - goto exit; + return result; - result = ec_i2c_parse_response(response, i2c_msgs, &num); + result = ec_i2c_parse_response(msg.indata, i2c_msgs, &num); if (result < 0) - goto exit; + return result; /* Indicate success by saying how many messages were sent */ - result = num; -exit: - if (request != bus->request_buf) - kfree(request); - if (response != bus->response_buf) - kfree(response); - - return result; + return num; } static u32 ec_i2c_functionality(struct i2c_adapter *adap) diff --git a/drivers/i2c/busses/i2c-digicolor.c b/drivers/i2c/busses/i2c-digicolor.c index 03f1e5549896..9604024e0eb0 100644 --- a/drivers/i2c/busses/i2c-digicolor.c +++ b/drivers/i2c/busses/i2c-digicolor.c @@ -12,11 +12,10 @@ #include <linux/clk.h> #include <linux/completion.h> +#include 
<linux/delay.h> #include <linux/i2c.h> #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/clk.h> -#include <linux/delay.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/of.h> diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index 56fceff6ba14..3e84f6c090a5 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -913,7 +913,7 @@ static void __exit mxs_i2c_exit(void) module_exit(mxs_i2c_exit); MODULE_AUTHOR("Marek Vasut <marex@denx.de>"); -MODULE_AUTHOR("Wolfram Sang <w.sang@pengutronix.de>"); +MODULE_AUTHOR("Wolfram Sang <kernel@pengutronix.de>"); MODULE_DESCRIPTION("MXS I2C Bus Driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:" DRIVER_NAME); diff --git a/drivers/i2c/busses/i2c-pca-platform.c b/drivers/i2c/busses/i2c-pca-platform.c index 6336f02ec566..3bd2e7d06e4b 100644 --- a/drivers/i2c/busses/i2c-pca-platform.c +++ b/drivers/i2c/busses/i2c-pca-platform.c @@ -285,6 +285,6 @@ static struct platform_driver i2c_pca_pf_driver = { module_platform_driver(i2c_pca_pf_driver); -MODULE_AUTHOR("Wolfram Sang <w.sang@pengutronix.de>"); +MODULE_AUTHOR("Wolfram Sang <kernel@pengutronix.de>"); MODULE_DESCRIPTION("I2C-PCA9564/PCA9665 platform driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/i2c/busses/i2c-rk3x.c b/drivers/i2c/busses/i2c-rk3x.c index 5f96b1b3e3a5..019d5426fe52 100644 --- a/drivers/i2c/busses/i2c-rk3x.c +++ b/drivers/i2c/busses/i2c-rk3x.c @@ -833,7 +833,7 @@ static int rk3x_i2c_xfer(struct i2c_adapter *adap, clk_disable(i2c->clk); spin_unlock_irqrestore(&i2c->lock, flags); - return ret; + return ret < 0 ? ret : num; } static u32 rk3x_i2c_func(struct i2c_adapter *adap) diff --git a/drivers/i2c/busses/i2c-st.c b/drivers/i2c/busses/i2c-st.c index 88057fad9dfe..ea72dca32fdf 100644 --- a/drivers/i2c/busses/i2c-st.c +++ b/drivers/i2c/busses/i2c-st.c @@ -10,17 +10,18 @@ * published by the Free Software Foundation. 
*/ -#include <linux/module.h> -#include <linux/platform_device.h> -#include <linux/i2c.h> #include <linux/clk.h> -#include <linux/io.h> #include <linux/delay.h> -#include <linux/interrupt.h> #include <linux/err.h> -#include <linux/of.h> +#include <linux/i2c.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> #include <linux/of_address.h> #include <linux/of_irq.h> +#include <linux/of.h> +#include <linux/pinctrl/consumer.h> +#include <linux/platform_device.h> /* SSC registers */ #define SSC_BRG 0x000 diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 098f698fe8f4..987c124432c5 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1413,6 +1413,8 @@ static int i2c_register_adapter(struct i2c_adapter *adap) dev_dbg(&adap->dev, "adapter [%s] registered\n", adap->name); + pm_runtime_no_callbacks(&adap->dev); + #ifdef CONFIG_I2C_COMPAT res = class_compat_create_link(i2c_adapter_compat_class, &adap->dev, adap->dev.parent); diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index 593f7ca9adc7..06cc1ff088f1 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -32,8 +32,9 @@ struct i2c_mux_priv { struct i2c_algorithm algo; struct i2c_adapter *parent; - void *mux_priv; /* the mux chip/device */ - u32 chan_id; /* the channel id */ + struct device *mux_dev; + void *mux_priv; + u32 chan_id; int (*select)(struct i2c_adapter *, void *mux_priv, u32 chan_id); int (*deselect)(struct i2c_adapter *, void *mux_priv, u32 chan_id); @@ -119,6 +120,7 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, /* Set up private adapter data */ priv->parent = parent; + priv->mux_dev = mux_dev; priv->mux_priv = mux_priv; priv->chan_id = chan_id; priv->select = select; @@ -203,7 +205,7 @@ void i2c_del_mux_adapter(struct i2c_adapter *adap) char symlink_name[20]; snprintf(symlink_name, sizeof(symlink_name), "channel-%u", priv->chan_id); - sysfs_remove_link(&adap->dev.parent->kobj, symlink_name); + sysfs_remove_link(&priv->mux_dev->kobj, symlink_name); sysfs_remove_link(&priv->adap.dev.kobj, "mux_device"); i2c_del_adapter(adap); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 8c014b5dab4c..38acb3cfc545 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -99,12 +99,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (dmasync) dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + if (!size) + return ERR_PTR(-EINVAL); + /* * If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. 
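+	 * A concrete case the second test catches (illustrative, 64-bit
+	 * kernel, 4K pages): addr = 0x1000, size = 0xFFFFFFFFFFFFE001.
+	 * Then addr + size = 0xFFFFFFFFFFFFF001 does not wrap, but
+	 * PAGE_ALIGN(addr + size) rounds up past 2^64 to 0, which is
+	 * smaller than addr + size, so we return -EINVAL.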
*/ - if ((PAGE_ALIGN(addr + size) <= size) || - (PAGE_ALIGN(addr + size) <= addr)) + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) return ERR_PTR(-EINVAL); if (!can_do_mlock()) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 259dcc7779f5..88cce9bb72fe 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -246,6 +246,17 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uqp); } + list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { + struct ib_srq *srq = uobj->object; + struct ib_uevent_object *uevent = + container_of(uobj, struct ib_uevent_object, uobject); + + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); + ib_destroy_srq(srq); + ib_uverbs_release_uevent(file, uevent); + kfree(uevent); + } + list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) { struct ib_cq *cq = uobj->object; struct ib_uverbs_event_file *ev_file = cq->cq_context; @@ -258,17 +269,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(ucq); } - list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { - struct ib_srq *srq = uobj->object; - struct ib_uevent_object *uevent = - container_of(uobj, struct ib_uevent_object, uobject); - - idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - ib_destroy_srq(srq); - ib_uverbs_release_uevent(file, uevent); - kfree(uevent); - } - list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { struct ib_mr *mr = uobj->object; diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index 33c45dfcbd88..1ca8e32a9592 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -82,14 +82,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) - error = ipathfs_mknod(parent->d_inode, *dentry, + error = ipathfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return error; } @@ -277,11 +277,11 @@ static int remove_file(struct dentry *parent, char *name) } spin_lock(&tmp->d_lock); - if (!d_unhashed(tmp) && tmp->d_inode) { + if (!d_unhashed(tmp) && d_really_is_positive(tmp)) { dget_dlock(tmp); __d_drop(tmp); spin_unlock(&tmp->d_lock); - simple_unlink(parent->d_inode, tmp); + simple_unlink(d_inode(parent), tmp); } else spin_unlock(&tmp->d_lock); @@ -302,7 +302,7 @@ static int remove_device_files(struct super_block *sb, int ret; root = dget(sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ -315,10 +315,10 @@ static int remove_device_files(struct super_block *sb, remove_file(dir, "flash"); remove_file(dir, "atomic_counters"); d_delete(dir); - ret = simple_rmdir(root->d_inode, dir); + ret = simple_rmdir(d_inode(root), dir); bail: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); return ret; } diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index a31e031afd87..0f00204d2ece 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -58,14 +58,19 @@ struct mlx4_alias_guid_work_context 
{ int query_id; struct list_head list; int block_num; + ib_sa_comp_mask guid_indexes; + u8 method; }; struct mlx4_next_alias_guid_work { u8 port; u8 block_num; + u8 method; struct mlx4_sriov_alias_guid_info_rec_det rec_det; }; +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec); void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8 *p_data) @@ -118,6 +123,57 @@ ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) return IB_SA_COMP_MASK(4 + index); } +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init) +{ + __be64 curr_guid, required_guid; + int record_num = slave / 8; + int index = slave % 8; + int port_index = port - 1; + unsigned long flags; + int do_work = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port_index].state_flags & + GUID_STATE_NEED_PORT_INIT) + goto unlock; + if (!slave_init) { + curr_guid = *(__be64 *)&dev->sriov. + alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index]; + if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) || + !curr_guid) + goto unlock; + required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } else { + required_guid = mlx4_get_admin_guid(dev->dev, slave, port); + if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto unlock; + } + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index] = required_guid; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(index); + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE; + /* set to run immediately */ + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].time_to_run = 0; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + guids_retry_schedule[index] = 0; + do_work = 1; +unlock: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + + if (do_work) + mlx4_ib_init_alias_guid_work(dev, port_index); +} + /* * Whenever new GUID is set/unset (guid table change) create event and * notify the relevant slave (master also should be notified). @@ -138,10 +194,15 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, enum slave_port_state prev_state; __be64 tmp_cur_ag, form_cache_ag; enum slave_port_gen_event gen_event; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags; + __be64 required_value; if (!mlx4_is_master(dev->dev)) return; + rec = &dev->sriov.alias_guid.ports_guid[port_num - 1]. + all_rec_per_port[block_num]; guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. ports_guid[port_num - 1]. 
all_rec_per_port[block_num].guid_indexes);
@@ -166,8 +227,27 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
 */
 if (tmp_cur_ag != form_cache_ag)
 continue;
- mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+
+ if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ required_value = 0;
+
+ if (tmp_cur_ag == required_value) {
+ rec->guid_indexes = rec->guid_indexes &
+ ~mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ } else {
+ /* may notify port down if value is 0 */
+ if (tmp_cur_ag != MLX4_NOT_SET_GUID) {
+ spin_unlock_irqrestore(&dev->sriov.
+ alias_guid.ag_work_lock, flags);
+ continue;
+ }
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock,
+ flags);
+ mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
 /*2 cases: Valid GUID, and Invalid Guid*/
 if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/
@@ -188,10 +268,14 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
 set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
 MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
 &gen_event);
- pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
- slave_id, port_num);
- mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num,
- MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+ if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) {
+ pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev,
+ slave_id,
+ port_num,
+ MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+ }
 }
 }
 }
@@ -206,6 +290,9 @@ static void aliasguid_query_handler(int status,
 int i;
 struct mlx4_sriov_alias_guid_info_rec_det *rec;
 unsigned long flags, flags1;
+ ib_sa_comp_mask declined_guid_indexes = 0;
+ ib_sa_comp_mask applied_guid_indexes = 0;
+ unsigned int resched_delay_sec = 0;
 if (!context)
 return;
@@ -216,9 +303,9 @@ static void aliasguid_query_handler(int status,
 all_rec_per_port[cb_ctx->block_num];
 if (status) {
- rec->status = MLX4_GUID_INFO_STATUS_IDLE;
 pr_debug("(port: %d) failed: status = %d\n",
 cb_ctx->port, status);
+ rec->time_to_run = ktime_get_real_ns() + 1 * NSEC_PER_SEC;
 goto out;
 }
@@ -235,57 +322,101 @@ static void aliasguid_query_handler(int status,
 rec = &dev->sriov.alias_guid.ports_guid[port_index].
 all_rec_per_port[guid_rec->block_num];
- rec->status = MLX4_GUID_INFO_STATUS_SET;
- rec->method = MLX4_GUID_INFO_RECORD_SET;
-
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
 for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) {
- __be64 tmp_cur_ag;
- tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE];
+ __be64 sm_response, required_val;
+
+ if (!(cb_ctx->guid_indexes &
+ mlx4_ib_get_aguid_comp_mask_from_ix(i)))
+ continue;
+ sm_response = *(__be64 *)&guid_rec->guid_info_list
+ [i * GUID_REC_SIZE];
+ required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+ if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) {
+ if (required_val ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ goto next_entry;
+
+ /* A new value was set before we got the response */
+ pr_debug("need to set new value %llx, record num %d, block_num:%d\n",
+ be64_to_cpu(required_val),
+ i, guid_rec->block_num);
+ goto entry_declined;
+ }
+
 /* check if the SM didn't assign one of the records.
- * if it didn't, if it was not sysadmin request:
- * ask the SM to give a new GUID, (instead of the driver request).
+ * if it didn't, re-ask for it.
*/ - if (tmp_cur_ag == MLX4_NOT_SET_GUID) { - mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in " - "block_num: %d was declined by SM, " - "ownership by %d (0 = driver, 1=sysAdmin," - " 2=None)\n", __func__, i, - guid_rec->block_num, rec->ownership); - if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) { - /* if it is driver assign, asks for new GUID from SM*/ - *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = - MLX4_NOT_SET_GUID; - - /* Mark the record as not assigned, and let it - * be sent again in the next work sched.*/ - rec->status = MLX4_GUID_INFO_STATUS_IDLE; - rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); - } + if (sm_response == MLX4_NOT_SET_GUID) { + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, + "%s:Record num %d in block_num: %d was declined by SM\n", + __func__, i, + guid_rec->block_num); + goto entry_declined; } else { /* properly assigned record. */ /* We save the GUID we just got from the SM in the * admin_guid in order to be persistent, and in the * request from the sm the process will ask for the same GUID */ - if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN && - tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) { - /* the sysadmin assignment failed.*/ - mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" - " admin guid after SysAdmin " - "configuration. " - "Record num %d in block_num:%d " - "was declined by SM, " - "new val(0x%llx) was kept\n", - __func__, i, - guid_rec->block_num, - be64_to_cpu(*(__be64 *) & - rec->all_recs[i * GUID_REC_SIZE])); + if (required_val && + sm_response != required_val) { + /* Warn only on first retry */ + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept, SM returned (0x%llx)\n", + __func__, i, + guid_rec->block_num, + be64_to_cpu(required_val), + be64_to_cpu(sm_response)); + goto entry_declined; } else { - memcpy(&rec->all_recs[i * GUID_REC_SIZE], - &guid_rec->guid_info_list[i * GUID_REC_SIZE], - GUID_REC_SIZE); + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + sm_response; + if (required_val == 0) + mlx4_set_admin_guid(dev->dev, + sm_response, + (guid_rec->block_num + * NUM_ALIAS_GUID_IN_REC) + i, + cb_ctx->port); + goto next_entry; } } +entry_declined: + declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + rec->guids_retry_schedule[i] = + (rec->guids_retry_schedule[i] == 0) ? 1 : + min((unsigned int)60, + rec->guids_retry_schedule[i] * 2); + /* using the minimum value among all entries in that record */ + resched_delay_sec = (resched_delay_sec == 0) ? 
+ rec->guids_retry_schedule[i] : + min(resched_delay_sec, + rec->guids_retry_schedule[i]); + continue; + +next_entry: + rec->guids_retry_schedule[i] = 0; } + + applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes; + if (declined_guid_indexes || + rec->guid_indexes & ~(applied_guid_indexes)) { + pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n", + guid_rec->block_num, + be64_to_cpu((__force __be64)rec->guid_indexes), + be64_to_cpu((__force __be64)applied_guid_indexes), + be64_to_cpu((__force __be64)declined_guid_indexes)); + rec->time_to_run = ktime_get_real_ns() + + resched_delay_sec * NSEC_PER_SEC; + } else { + rec->status = MLX4_GUID_INFO_STATUS_SET; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); /* The func is call here to close the cases when the sm doesn't send smp, so in the sa response the driver @@ -297,10 +428,13 @@ static void aliasguid_query_handler(int status, out: spin_lock_irqsave(&dev->sriov.going_down_lock, flags); spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); - if (!dev->sriov.is_going_down) + if (!dev->sriov.is_going_down) { + get_low_record_time_index(dev, port_index, &resched_delay_sec); queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, &dev->sriov.alias_guid.ports_guid[port_index]. - alias_guid_work, 0); + alias_guid_work, + msecs_to_jiffies(resched_delay_sec * 1000)); + } if (cb_ctx->sa_query) { list_del(&cb_ctx->list); kfree(cb_ctx); @@ -317,9 +451,7 @@ static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) ib_sa_comp_mask comp_mask = 0; dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status - = MLX4_GUID_INFO_STATUS_IDLE; - dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method - = MLX4_GUID_INFO_RECORD_SET; + = MLX4_GUID_INFO_STATUS_SET; /* calculate the comp_mask for that record.*/ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { @@ -333,19 +465,21 @@ static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) need to assign GUIDs, then don't put it up for assignment. */ if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || - (!index && !i) || - MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid. - ports_guid[port - 1].all_rec_per_port[index].ownership) + (!index && !i)) continue; comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); } dev->sriov.alias_guid.ports_guid[port - 1]. - all_rec_per_port[index].guid_indexes = comp_mask; + all_rec_per_port[index].guid_indexes |= comp_mask; + if (dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes) + dev->sriov.alias_guid.ports_guid[port - 1]. 
+ all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE; + } static int set_guid_rec(struct ib_device *ibdev, - u8 port, int index, - struct mlx4_sriov_alias_guid_info_rec_det *rec_det) + struct mlx4_next_alias_guid_work *rec) { int err; struct mlx4_ib_dev *dev = to_mdev(ibdev); @@ -354,6 +488,9 @@ static int set_guid_rec(struct ib_device *ibdev, struct ib_port_attr attr; struct mlx4_alias_guid_work_context *callback_context; unsigned long resched_delay, flags, flags1; + u8 port = rec->port + 1; + int index = rec->block_num; + struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det; struct list_head *head = &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; @@ -380,6 +517,8 @@ static int set_guid_rec(struct ib_device *ibdev, callback_context->port = port; callback_context->dev = dev; callback_context->block_num = index; + callback_context->guid_indexes = rec_det->guid_indexes; + callback_context->method = rec->method; memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); @@ -399,7 +538,7 @@ static int set_guid_rec(struct ib_device *ibdev, callback_context->query_id = ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, ibdev, port, &guid_info_rec, - comp_mask, rec_det->method, 1000, + comp_mask, rec->method, 1000, GFP_KERNEL, aliasguid_query_handler, callback_context, &callback_context->sa_query); @@ -434,6 +573,30 @@ out: return err; } +static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port) +{ + int j, k, entry; + __be64 guid; + + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + entry = j * NUM_ALIAS_GUID_IN_REC + k; + /* no request for the 0 entry (hw guid) */ + if (!entry || entry > dev->dev->persist->num_vfs || + !mlx4_is_slave_active(dev->dev, entry)) + continue; + guid = mlx4_get_admin_guid(dev->dev, entry, port); + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[j].all_recs + [GUID_REC_SIZE * k] = guid; + pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n", + entry, + be64_to_cpu(guid), + port); + } + } +} void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) { int i; @@ -443,6 +606,13 @@ void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) spin_lock_irqsave(&dev->sriov.going_down_lock, flags); spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + + if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags & + GUID_STATE_NEED_PORT_INIT) { + mlx4_ib_guid_port_init(dev, port); + dev->sriov.alias_guid.ports_guid[port - 1].state_flags &= + (~GUID_STATE_NEED_PORT_INIT); + } for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) invalidate_guid_record(dev, port, i); @@ -462,60 +632,107 @@ void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); } -/* The function returns the next record that was - * not configured (or failed to be configured) */ -static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, - struct mlx4_next_alias_guid_work *rec) +static void set_required_record(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *next_rec, + int record_index) { - int j; - unsigned long flags; + int i; + int lowset_time_entry = -1; + int lowest_time = 0; + ib_sa_comp_mask delete_guid_indexes = 0; + ib_sa_comp_mask set_guid_indexes = 0; + struct mlx4_sriov_alias_guid_info_rec_det *rec = + &dev->sriov.alias_guid.ports_guid[port]. 
+ all_rec_per_port[record_index]; - for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { - spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); - if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status == - MLX4_GUID_INFO_STATUS_IDLE) { - memcpy(&rec->rec_det, - &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j], - sizeof (struct mlx4_sriov_alias_guid_info_rec_det)); - rec->port = port; - rec->block_num = j; - dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status = - MLX4_GUID_INFO_STATUS_PENDING; - spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); - return 0; + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + if (!(rec->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + + if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + delete_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + else + set_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + + if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <= + lowest_time) { + lowset_time_entry = i; + lowest_time = rec->guids_retry_schedule[i]; } - spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); } - return -ENOENT; + + memcpy(&next_rec->rec_det, rec, sizeof(*rec)); + next_rec->port = port; + next_rec->block_num = record_index; + + if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) { + next_rec->rec_det.guid_indexes = delete_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_DELETE; + } else { + next_rec->rec_det.guid_indexes = set_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_SET; + } } -static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port, - int rec_index, - struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +/* return index of record that should be updated based on lowest + * rescheduled time + */ +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec) { - dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes = - rec_det->guid_indexes; - memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs, - rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); - dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status = - rec_det->status; + int record_index = -1; + u64 low_record_time = 0; + struct mlx4_sriov_alias_guid_info_rec_det rec; + int j; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + rec = dev->sriov.alias_guid.ports_guid[port]. + all_rec_per_port[j]; + if (rec.status == MLX4_GUID_INFO_STATUS_IDLE && + rec.guid_indexes) { + if (record_index == -1 || + rec.time_to_run < low_record_time) { + record_index = j; + low_record_time = rec.time_to_run; + } + } + } + if (resched_delay_sec) { + u64 curr_time = ktime_get_real_ns(); + + *resched_delay_sec = (low_record_time < curr_time) ? 
0 :
+ div_u64((low_record_time - curr_time), NSEC_PER_SEC);
+ }
+
+ return record_index;
 }
-static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port)
+/* The function returns the next record that was
+ * not configured (or failed to be configured) */
+static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *rec)
 {
- int j;
- struct mlx4_sriov_alias_guid_info_rec_det rec_det ;
-
- for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) {
- memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
- rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) |
- IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 |
- IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 |
- IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 |
- IB_SA_GUIDINFO_REC_GID7;
- rec_det.status = MLX4_GUID_INFO_STATUS_IDLE;
- set_administratively_guid_record(dev, port, j, &rec_det);
+ unsigned long flags;
+ int record_index;
+ int ret = 0;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ record_index = get_low_record_time_index(dev, port, NULL);
+
+ if (record_index < 0) {
+ ret = -ENOENT;
+ goto out;
 }
+
+ set_required_record(dev, port, rec, record_index);
+out:
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ return ret;
 }
 static void alias_guid_work(struct work_struct *work)
@@ -545,9 +762,7 @@ static void alias_guid_work(struct work_struct *work)
 goto out;
 }
- set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num,
- &rec->rec_det);
-
+ set_guid_rec(&dev->ib_dev, rec);
 out:
 kfree(rec);
 }
@@ -562,6 +777,12 @@ void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port)
 spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
 spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
 if (!dev->sriov.is_going_down) {
+ /* If there is a pending work, cancel it and then run; otherwise
+ * the new request won't run until the previous one ends, as the
+ * same work struct is used.
+ */
+ cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port].
+ alias_guid_work);
 queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq,
 &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0);
 }
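The hunk above relies on a property of delayed work worth spelling out: a delayed_work that is already queued cannot be queued a second time, so an "update now" request would otherwise be silently absorbed by a still-pending delayed run. Below is a minimal sketch of the cancel-then-requeue pattern in isolation; the names my_dev and rescan_work are hypothetical stand-ins, not the driver's structures.

	#include <linux/workqueue.h>

	struct my_dev {
		struct workqueue_struct *wq;
		struct delayed_work rescan_work; /* one instance, reused for every request */
	};

	/* Request an immediate run even if a delayed run is already pending. */
	static void my_dev_rescan_now(struct my_dev *dev)
	{
		/*
		 * cancel_delayed_work() removes a pending (not yet running)
		 * instance; without it, queue_delayed_work() would return
		 * false and the request would wait out the old delay.
		 */
		cancel_delayed_work(&dev->rescan_work);
		queue_delayed_work(dev->wq, &dev->rescan_work, 0);
	}

If the work is already executing (rather than merely pending), cancel_delayed_work() does not wait for it; the driver tolerates that because the handler reschedules itself from its completion path.

@@ -609,7 +830,7 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
 {
 char alias_wq_name[15];
 int ret = 0;
- int i, j, k;
+ int i, j;
 union ib_gid gid;
 if (!mlx4_is_master(dev->dev))
@@ -633,33 +854,25 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
 for (i = 0 ; i < dev->num_ports; i++) {
 memset(&dev->sriov.alias_guid.ports_guid[i], 0,
 sizeof (struct mlx4_sriov_alias_guid_port_rec_det));
- /*Check if the SM doesn't need to assign the GUIDs*/
+ dev->sriov.alias_guid.ports_guid[i].state_flags |=
+ GUID_STATE_NEED_PORT_INIT;
 for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
- if (mlx4_ib_sm_guid_assign) {
- dev->sriov.alias_guid.ports_guid[i].
- all_rec_per_port[j].
- ownership = MLX4_GUID_DRIVER_ASSIGN;
- continue;
- }
- dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j].
- ownership = MLX4_GUID_NONE_ASSIGN;
- /*mark each val as it was deleted,
- till the sysAdmin will give it valid val*/
- for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
- *(__be64 *)&dev->sriov.alias_guid.ports_guid[i].
- all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] =
- cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
- }
+ /* mark each val as it was deleted */
+ memset(dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs, 0xFF,
+ sizeof(dev->sriov.alias_guid.ports_guid[i].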
+ all_rec_per_port[j].all_recs)); } INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); /*prepare the records, set them to be allocated by sm*/ + if (mlx4_ib_sm_guid_assign) + for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++) + mlx4_set_admin_guid(dev->dev, 0, j, i + 1); for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) invalidate_guid_record(dev, i + 1, j); dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; dev->sriov.alias_guid.ports_guid[i].port = i; - if (mlx4_ib_sm_guid_assign) - set_all_slaves_guids(dev, i); snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); dev->sriov.alias_guid.ports_guid[i].wq = diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 59040265e361..9cd2b002d7ae 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1430,6 +1430,10 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->ring[i].addr, rx_buf_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) { + kfree(tun_qp->ring[i].addr); + goto err; + } } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { @@ -1442,6 +1446,11 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->tx_ring[i].buf.addr, tx_buf_size, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, + tun_qp->tx_ring[i].buf.map)) { + kfree(tun_qp->tx_ring[i].buf.addr); + goto tx_err; + } tun_qp->tx_ring[i].ah = NULL; } spin_lock_init(&tun_qp->tx_lock); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 976bea794b5f..57070c529dfb 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -66,9 +66,9 @@ MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); -int mlx4_ib_sm_guid_assign = 1; +int mlx4_ib_sm_guid_assign = 0; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); -MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)"); static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" @@ -2791,9 +2791,31 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 1); + } + } return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 0); + } + } /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index f829fd935b79..fce3934372a1 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -342,14 +342,9 @@ struct mlx4_ib_ah { enum mlx4_guid_alias_rec_status { MLX4_GUID_INFO_STATUS_IDLE, MLX4_GUID_INFO_STATUS_SET, - MLX4_GUID_INFO_STATUS_PENDING, }; -enum mlx4_guid_alias_rec_ownership { - MLX4_GUID_DRIVER_ASSIGN, - MLX4_GUID_SYSADMIN_ASSIGN, - MLX4_GUID_NONE_ASSIGN, /*init state of each record*/ -}; 
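The ownership tracking removed here is superseded by the retry machinery visible in aliasguid_query_handler() above: each declined GUID entry doubles its per-entry delay up to a 60 second cap, the record's rearm time is the minimum delay among its entries, and the record is rescheduled at an absolute nanosecond deadline (the guids_retry_schedule and time_to_run fields added just below). A minimal sketch of that backoff policy in isolation, with hypothetical names:

	#include <linux/kernel.h>
	#include <linux/ktime.h>

	#define RETRY_CAP_SEC 60U

	struct retry_entry {
		unsigned int delay_sec; /* 0 means no failure recorded yet */
	};

	/* Double the per-entry delay on failure (capped) and return the new
	 * absolute deadline in nanoseconds, mirroring rec->time_to_run. */
	static u64 retry_backoff(struct retry_entry *e)
	{
		e->delay_sec = e->delay_sec ? min(e->delay_sec * 2, RETRY_CAP_SEC) : 1;
		return ktime_get_real_ns() + (u64)e->delay_sec * NSEC_PER_SEC;
	}

On success the driver zeroes the per-entry delay (see the next_entry label above), so the next failure starts the backoff again from one second.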
+#define GUID_STATE_NEED_PORT_INIT 0x01 enum mlx4_guid_alias_rec_method { MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, @@ -360,8 +355,8 @@ struct mlx4_sriov_alias_guid_info_rec_det { u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ - u8 method; /*set or delete*/ - enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/ + unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC]; + u64 time_to_run; }; struct mlx4_sriov_alias_guid_port_rec_det { @@ -369,6 +364,7 @@ struct mlx4_sriov_alias_guid_port_rec_det { struct workqueue_struct *wq; struct delayed_work alias_guid_work; u8 port; + u32 state_flags; struct mlx4_sriov_alias_guid *parent; struct list_head cb_list; }; @@ -802,6 +798,8 @@ int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init); int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index ed2bd6701f9b..02fc91c68027 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -566,6 +566,10 @@ static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) { + kfree(qp->sqp_proxy_rcv[i].addr); + goto err; + } } return 0; @@ -2605,8 +2609,7 @@ static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); - *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | - wr->wr.ud.hlen); + *lso_hdr_sz = cpu_to_be32(wr->wr.ud.mss << 16 | wr->wr.ud.hlen); *lso_seg_len = halign; return 0; } diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index d10c2b8a5dad..6797108ce873 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -46,21 +46,17 @@ static ssize_t show_admin_alias_guid(struct device *dev, struct device_attribute *attr, char *buf) { - int record_num;/*0-15*/ - int guid_index_in_rec; /*0 - 7*/ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; struct mlx4_ib_dev *mdev = port->dev; + __be64 sysadmin_ag_val; - record_num = mlx4_ib_iov_dentry->entry_num / 8 ; - guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ; + sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev, + mlx4_ib_iov_dentry->entry_num, + port->num); - return sprintf(buf, "%llx\n", - be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid. - ports_guid[port->num - 1]. - all_rec_per_port[record_num]. - all_recs[8 * guid_index_in_rec])); + return sprintf(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val)); } /* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. 
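The mad.c and qp.c hunks above all add the same missing check: ib_dma_map_single() can fail, and the cookie it returns must be validated with ib_dma_mapping_error() before the buffer is posted. A minimal sketch of the map/check/unwind pattern under that API, with a hypothetical helper name:

	#include <linux/slab.h>
	#include <rdma/ib_verbs.h>

	/* Map a kmalloc'ed receive buffer; on failure free it and report -ENOMEM. */
	static int map_rx_buf(struct ib_device *ca, void *buf, size_t len, u64 *map)
	{
		*map = ib_dma_map_single(ca, buf, len, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ca, *map)) {
			kfree(buf);
			return -ENOMEM;
		}
		return 0;
	}

As in the patch, the failure path must also unwind any buffers mapped earlier in the same ring (the goto err / goto tx_err paths above), or those mappings leak.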
@@ -80,6 +76,7 @@ static ssize_t store_admin_alias_guid(struct device *dev, struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; struct mlx4_ib_dev *mdev = port->dev; u64 sysadmin_ag_val; + unsigned long flags; record_num = mlx4_ib_iov_dentry->entry_num / 8; guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; @@ -87,6 +84,7 @@ static ssize_t store_admin_alias_guid(struct device *dev, pr_err("GUID 0 block 0 is RO\n"); return count; } + spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags); sscanf(buf, "%llx", &sysadmin_ag_val); *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. all_rec_per_port[record_num]. @@ -96,33 +94,15 @@ static ssize_t store_admin_alias_guid(struct device *dev, /* Change the state to be pending for update */ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status = MLX4_GUID_INFO_STATUS_IDLE ; - - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method - = MLX4_GUID_INFO_RECORD_SET; - - switch (sysadmin_ag_val) { - case MLX4_GUID_FOR_DELETE_VAL: - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method - = MLX4_GUID_INFO_RECORD_DELETE; - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership - = MLX4_GUID_SYSADMIN_ASSIGN; - break; - /* The sysadmin requests the SM to re-assign */ - case MLX4_NOT_SET_GUID: - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership - = MLX4_GUID_DRIVER_ASSIGN; - break; - /* The sysadmin requests a specific value.*/ - default: - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership - = MLX4_GUID_SYSADMIN_ASSIGN; - break; - } + mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val), + mlx4_ib_iov_dentry->entry_num, + port->num); /* set the record index */ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes - = mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags); mlx4_ib_init_alias_guid_work(mdev, port->num - 1); return count; diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 650897a8591e..bdd5d3857203 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -89,14 +89,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) - error = qibfs_mknod(parent->d_inode, *dentry, + error = qibfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return error; } @@ -455,10 +455,10 @@ static int remove_file(struct dentry *parent, char *name) } spin_lock(&tmp->d_lock); - if (!d_unhashed(tmp) && tmp->d_inode) { + if (!d_unhashed(tmp) && d_really_is_positive(tmp)) { __d_drop(tmp); spin_unlock(&tmp->d_lock); - simple_unlink(parent->d_inode, tmp); + simple_unlink(d_inode(parent), tmp); } else { spin_unlock(&tmp->d_lock); } @@ -481,7 +481,7 @@ static int remove_device_files(struct super_block *sb, int ret, i; root = dget(sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); snprintf(unit, sizeof(unit), "%u", dd->unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ 
-491,7 +491,7 @@ static int remove_device_files(struct super_block *sb, goto bail; } - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); remove_file(dir, "counters"); remove_file(dir, "counter_names"); remove_file(dir, "portcounter_names"); @@ -506,13 +506,13 @@ static int remove_device_files(struct super_block *sb, } } remove_file(dir, "flash"); - mutex_unlock(&dir->d_inode->i_mutex); - ret = simple_rmdir(root->d_inode, dir); + mutex_unlock(&d_inode(dir)->i_mutex); + ret = simple_rmdir(d_inode(root), dir); d_delete(dir); dput(dir); bail: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); return ret; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562beb5423..bd94b0a6e9e5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -87,7 +87,6 @@ enum { IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, IPOIB_FLAG_SUBINTERFACE = 5, - IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, @@ -98,9 +97,15 @@ enum { IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, - IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + /* + * For IPOIB_MCAST_FLAG_BUSY + * When set, in flight join and mcast->mc is unreliable + * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or + * haven't started yet + * When clear and mcast->mc is valid pointer, join was successful + */ + IPOIB_MCAST_FLAG_BUSY = 2, IPOIB_MCAST_FLAG_ATTACHED = 3, - IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -148,6 +153,7 @@ struct ipoib_mcast { unsigned long created; unsigned long backoff; + unsigned long delay_until; unsigned long flags; unsigned char logcount; @@ -292,6 +298,11 @@ struct ipoib_neigh_table { struct completion deleted; }; +struct ipoib_qp_state_validate { + struct work_struct work; + struct ipoib_dev_priv *priv; +}; + /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. 
lock nests inside @@ -317,6 +328,7 @@ struct ipoib_dev_priv { struct list_head multicast_list; struct rb_root multicast_tree; + struct workqueue_struct *wq; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; @@ -426,11 +438,6 @@ struct ipoib_neigh { #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) #define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) -static inline int ipoib_ud_need_sg(unsigned int ib_mtu) -{ - return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; -} - void ipoib_neigh_dtor(struct ipoib_neigh *neigh); static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) { @@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); -int ipoib_ib_dev_open(struct net_device *dev, int flush); +int ipoib_ib_dev_open(struct net_device *dev); int ipoib_ib_dev_up(struct net_device *dev); -int ipoib_ib_dev_down(struct net_device *dev, int flush); -int ipoib_ib_dev_stop(struct net_device *dev, int flush); +int ipoib_ib_dev_down(struct net_device *dev); +int ipoib_ib_dev_stop(struct net_device *dev); void ipoib_pkey_dev_check_presence(struct net_device *dev); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); @@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); -int ipoib_mcast_stop_thread(struct net_device *dev, int flush); +int ipoib_mcast_stop_thread(struct net_device *dev); void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 933efcea0d03..56959adb6c7d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even } spin_lock_irq(&priv->lock); - queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); /* Add this entry to passive ids list head, but do not re-add it * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. 
*/ @@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); spin_unlock_irqrestore(&priv->lock, flags); } else ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", @@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_move(&p->list, &priv->cm.rx_reap_list); spin_unlock_irqrestore(&priv->lock, flags); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); } return; } @@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); @@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } spin_unlock_irqrestore(&priv->lock, flags); @@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path tx->dev = dev; list_add(&tx->list, &priv->cm.start_list); set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); - queue_work(ipoib_workqueue, &priv->cm.start_task); + queue_work(priv->wq, &priv->cm.start_task); return tx; } @@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { spin_lock_irqsave(&priv->lock, flags); list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", tx->neigh->daddr + 4); tx->neigh = NULL; @@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, skb_queue_tail(&priv->cm.skb_queue, skb); if (e) - queue_work(ipoib_workqueue, &priv->cm.skb_task); + queue_work(priv->wq, &priv->cm.skb_task); } static void ipoib_cm_rx_reap(struct work_struct *work) @@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work) } if (!list_empty(&priv->cm.passive_ids)) - queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 72626c348174..63b92cbb29ad 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -94,39 +94,9 @@ void ipoib_free_ah(struct kref *kref) static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, u64 mapping[IPOIB_UD_RX_SG]) { - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, - DMA_FROM_DEVICE); - ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, - DMA_FROM_DEVICE); - } else - ib_dma_unmap_single(priv->ca, mapping[0], - IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), - DMA_FROM_DEVICE); -} - -static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, - struct sk_buff *skb, - unsigned int length) -{ - if 
(ipoib_ud_need_sg(priv->max_ib_mtu)) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
- unsigned int size;
- /*
- * There is only two buffers needed for max_payload = 4K,
- * first buf size is IPOIB_UD_HEAD_SIZE
- */
- skb->tail += IPOIB_UD_HEAD_SIZE;
- skb->len += length;
-
- size = length - IPOIB_UD_HEAD_SIZE;
-
- skb_frag_size_set(frag, size);
- skb->data_len += size;
- skb->truesize += PAGE_SIZE;
- } else
- skb_put(skb, length);
-
+ ib_dma_unmap_single(priv->ca, mapping[0],
+ IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+ DMA_FROM_DEVICE);
 }
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
@@ -156,18 +126,11 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 struct ipoib_dev_priv *priv = netdev_priv(dev);
 struct sk_buff *skb;
 int buf_size;
- int tailroom;
 u64 *mapping;
- if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
- buf_size = IPOIB_UD_HEAD_SIZE;
- tailroom = 128; /* reserve some tailroom for IP/TCP headers */
- } else {
- buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
- tailroom = 0;
- }
+ buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
- skb = dev_alloc_skb(buf_size + tailroom + 4);
+ skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
 if (unlikely(!skb))
 return NULL;
@@ -184,23 +147,8 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
 goto error;
- if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
- struct page *page = alloc_page(GFP_ATOMIC);
- if (!page)
- goto partial_error;
- skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
- mapping[1] =
- ib_dma_map_page(priv->ca, page,
- 0, PAGE_SIZE, DMA_FROM_DEVICE);
- if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
- goto partial_error;
- }
-
 priv->rx_ring[id].skb = skb;
 return skb;
-
-partial_error:
- ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
 error:
 dev_kfree_skb_any(skb);
 return NULL;
@@ -278,7 +226,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 wc->byte_len, wc->slid);
 ipoib_ud_dma_unmap_rx(priv, mapping);
- ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
+
+ skb_put(skb, wc->byte_len);
 /* First byte of dgid signals multicast when 0xff */
 dgid = &((struct ib_grh *)skb->data)->dgid;
@@ -296,6 +245,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 skb_reset_mac_header(skb);
 skb_pull(skb, IPOIB_ENCAP_LEN);
+ skb->truesize = SKB_TRUESIZE(skb->len);
+
 ++dev->stats.rx_packets;
 dev->stats.rx_bytes += skb->len;
@@ -376,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
 }
 }
+/*
+ * As a result of a completion error the QP can be transitioned to the SQE
+ * state. The function checks if the (send) QP is in the SQE state and, if
+ * so, moves it back to the RTS state, in order to have it functional again.
+ */ +static void ipoib_qp_state_validate_work(struct work_struct *work) +{ + struct ipoib_qp_state_validate *qp_work = + container_of(work, struct ipoib_qp_state_validate, work); + + struct ipoib_dev_priv *priv = qp_work->priv; + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", + __func__, ret); + goto free_res; + } + pr_info("%s: QP: 0x%x is in state: %d\n", + __func__, priv->qp->qp_num, qp_attr.qp_state); + + /* currently support only in SQE->RTS transition*/ + if (qp_attr.qp_state == IB_QPS_SQE) { + qp_attr.qp_state = IB_QPS_RTS; + + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); + if (ret) { + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", + ret, priv->qp->qp_num); + goto free_res; + } + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", + __func__, priv->qp->qp_num); + } else { + pr_warn("QP (%d) will stay in state: %d\n", + priv->qp->qp_num, qp_attr.qp_state); + } + +free_res: + kfree(qp_work); +} + static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -407,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) netif_wake_queue(dev); if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_qp_state_validate *qp_work; ipoib_warn(priv, "failed send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); + if (!qp_work) { + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", + __func__, priv->qp->qp_num); + return; + } + + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); + qp_work->priv = priv; + queue_work(priv->wq, &qp_work->work); + } } static int poll_tx(struct ipoib_dev_priv *priv) @@ -655,16 +663,33 @@ void ipoib_reap_ah(struct work_struct *work) __ipoib_reap_ah(dev); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); } +static void ipoib_flush_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + cancel_delayed_work(&priv->ah_reap_task); + flush_workqueue(priv->wq); + ipoib_reap_ah(&priv->ah_reap_task.work); +} + +static void ipoib_stop_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + set_bit(IPOIB_STOP_REAPER, &priv->flags); + ipoib_flush_ah(dev); +} + static void ipoib_ib_tx_timer_func(unsigned long ctx) { drain_tx_cq((struct net_device *)ctx); } -int ipoib_ib_dev_open(struct net_device *dev, int flush) +int ipoib_ib_dev_open(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; @@ -696,7 +721,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) } clear_bit(IPOIB_STOP_REAPER, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) @@ -706,7 +731,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) dev_stop: if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) napi_enable(&priv->napi); - ipoib_ib_dev_stop(dev, flush); + ipoib_ib_dev_stop(dev); return -1; } @@ -738,7 +763,7 @@ int ipoib_ib_dev_up(struct 
net_device *dev) return ipoib_mcast_start_thread(dev); } -int ipoib_ib_dev_down(struct net_device *dev, int flush) +int ipoib_ib_dev_down(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -747,7 +772,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); netif_carrier_off(dev); - ipoib_mcast_stop_thread(dev, flush); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); ipoib_flush_paths(dev); @@ -807,7 +832,7 @@ void ipoib_drain_cq(struct net_device *dev) local_bh_enable(); } -int ipoib_ib_dev_stop(struct net_device *dev, int flush) +int ipoib_ib_dev_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; @@ -877,24 +902,7 @@ timeout: if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); - /* Wait for all AHs to be reaped */ - set_bit(IPOIB_STOP_REAPER, &priv->flags); - cancel_delayed_work(&priv->ah_reap_task); - if (flush) - flush_workqueue(ipoib_workqueue); - - begin = jiffies; - - while (!list_empty(&priv->dead_ahs)) { - __ipoib_reap_ah(dev); - - if (time_after(jiffies, begin + HZ)) { - ipoib_warn(priv, "timing out; will leak address handles\n"); - break; - } - - msleep(1); - } + ipoib_flush_ah(dev); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); @@ -918,7 +926,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) (unsigned long) dev); if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { ipoib_transport_dev_cleanup(dev); return -ENODEV; } @@ -1037,15 +1045,16 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (level == IPOIB_FLUSH_LIGHT) { ipoib_mark_paths_invalid(dev); ipoib_mcast_dev_flush(dev); + ipoib_flush_ah(dev); } if (level >= IPOIB_FLUSH_NORMAL) - ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_down(dev); if (level == IPOIB_FLUSH_HEAVY) { if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - ipoib_ib_dev_stop(dev, 0); - if (ipoib_ib_dev_open(dev, 0) != 0) + ipoib_ib_dev_stop(dev); + if (ipoib_ib_dev_open(dev) != 0) return; if (netif_queue_stopped(dev)) netif_start_queue(dev); @@ -1097,9 +1106,17 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) */ ipoib_flush_paths(dev); - ipoib_mcast_stop_thread(dev, 1); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); + /* + * All of our ah references aren't free until after + * ipoib_mcast_dev_flush(), ipoib_flush_paths, and + * the neighbor garbage collection is stopped and reaped. + * That should all be done now, so make a final ah flush. 
+ */ + ipoib_stop_ah(dev); + ipoib_transport_dev_cleanup(dev); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 915ad04a827e..9e1b203d756d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; @@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev) return 0; err_stop: - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(dev); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); @@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); - ipoib_ib_dev_down(dev, 1); - ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_down(dev); + ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; @@ -640,8 +640,10 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, if (!path->query && path_rec_start(dev, path)) goto err_path; - - __skb_queue_tail(&neigh->queue, skb); + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else + goto err_drop; } spin_unlock_irqrestore(&priv->lock, flags); @@ -676,7 +678,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, new_path = 1; } if (path) { - __skb_queue_tail(&path->queue, skb); + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } if (!path->query && path_rec_start(dev, path)) { spin_unlock_irqrestore(&priv->lock, flags); @@ -839,7 +846,7 @@ static void ipoib_set_mcast_list(struct net_device *dev) return; } - queue_work(ipoib_workqueue, &priv->restart_task); + queue_work(priv->wq, &priv->restart_task); } static int ipoib_get_iflink(const struct net_device *dev) @@ -966,7 +973,7 @@ static void ipoib_reap_neigh(struct work_struct *work) __ipoib_reap_neigh(priv); if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); } @@ -1145,7 +1152,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) /* start garbage collection */ clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); return 0; @@ -1274,15 +1281,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (ipoib_neigh_hash_init(priv) < 0) - goto out; /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); - goto out_neigh_hash_cleanup; + goto out; } priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); @@ -1297,16 +1302,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) if (ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; + /* + * Must be after ipoib_ib_dev_init so we can allocate a per + * device wq there and use it here + */ + if (ipoib_neigh_hash_init(priv) < 0) + goto out_dev_uninit; + 
return 0; +out_dev_uninit: + ipoib_ib_dev_cleanup(dev); + out_tx_ring_cleanup: vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); -out_neigh_hash_cleanup: - ipoib_neigh_hash_uninit(dev); out: return -ENOMEM; } @@ -1329,6 +1342,12 @@ void ipoib_dev_cleanup(struct net_device *dev) } unregister_netdevice_many(&head); + /* + * Must be before ipoib_ib_dev_cleanup or we delete an in use + * work queue + */ + ipoib_neigh_hash_uninit(dev); + ipoib_ib_dev_cleanup(dev); kfree(priv->rx_ring); @@ -1336,8 +1355,6 @@ void ipoib_dev_cleanup(struct net_device *dev) priv->rx_ring = NULL; priv->tx_ring = NULL; - - ipoib_neigh_hash_uninit(dev); } static const struct header_ops ipoib_header_ops = { @@ -1646,10 +1663,11 @@ sysfs_failed: register_failed: ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); /* Stop GC if started before flush */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); event_failed: ipoib_dev_cleanup(priv->dev); @@ -1712,6 +1730,7 @@ static void ipoib_remove_one(struct ib_device *device) list_for_each_entry_safe(priv, tmp, dev_list, list) { ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); @@ -1720,7 +1739,7 @@ static void ipoib_remove_one(struct ib_device *device) /* Stop GC */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); unregister_netdev(priv->dev); free_netdev(priv->dev); @@ -1755,14 +1774,16 @@ static int __init ipoib_init_module(void) return ret; /* - * We create our own workqueue mainly because we want to be - * able to flush it when devices are being removed. We can't - * use schedule_work()/flush_scheduled_work() because both - * unregister_netdev() and linkwatch_event take the rtnl lock, - * so flush_scheduled_work() can deadlock during device - * removal. + * We create a global workqueue here that is used for all flush + * operations. However, if you attempt to flush a workqueue + * from a task on that same workqueue, it deadlocks the system. + * We want to be able to flush the tasks associated with a + * specific net device, so we also create a workqueue for each + * netdevice. We queue up the tasks for that device only on + * its private workqueue, and we only queue up flush events + * on our global flush workqueue. This avoids the deadlocks. 
*/ - ipoib_workqueue = create_singlethread_workqueue("ipoib"); + ipoib_workqueue = create_singlethread_workqueue("ipoib_flush"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ffb83b5f7e80..0d23e0568deb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -55,8 +55,6 @@ MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0"); #endif -static DEFINE_MUTEX(mcast_mutex); - struct ipoib_mcast_iter { struct net_device *dev; union ib_gid mgid; @@ -66,6 +64,48 @@ struct ipoib_mcast_iter { unsigned int send_only; }; +/* + * This should be called with the priv->lock held + */ +static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast, + bool delay) +{ + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + + /* + * We will be scheduling *something*, so cancel whatever is + * currently scheduled first + */ + cancel_delayed_work(&priv->mcast_task); + if (mcast && delay) { + /* + * We had a failure and want to schedule a retry later + */ + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + mcast->delay_until = jiffies + (mcast->backoff * HZ); + /* + * Mark this mcast for its delay, but restart the + * task immediately. The join task will make sure to + * clear out all entries without delays, and then + * schedule itself to run again when the earliest + * delay expires + */ + queue_delayed_work(priv->wq, &priv->mcast_task, 0); + } else if (delay) { + /* + * Special case of retrying after a failure to + * allocate the broadcast multicast group, wait + * 1 second and try again + */ + queue_delayed_work(priv->wq, &priv->mcast_task, HZ); + } else + queue_delayed_work(priv->wq, &priv->mcast_task, 0); +} + static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; @@ -103,6 +143,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, mcast->dev = dev; mcast->created = jiffies; + mcast->delay_until = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); @@ -185,17 +226,27 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, spin_unlock_irq(&priv->lock); return -EAGAIN; } - priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + /*update priv member according to the new mcast*/ + priv->broadcast->mcmember.qkey = mcmember->qkey; + priv->broadcast->mcmember.mtu = mcmember->mtu; + priv->broadcast->mcmember.traffic_class = mcmember->traffic_class; + priv->broadcast->mcmember.rate = mcmember->rate; + priv->broadcast->mcmember.sl = mcmember->sl; + priv->broadcast->mcmember.flow_label = mcmember->flow_label; + priv->broadcast->mcmember.hop_limit = mcmember->hop_limit; + /* assume if the admin and the mcast are the same both can be changed */ + if (priv->mcast_mtu == priv->admin_mtu) + priv->admin_mtu = + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + else + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; set_qkey = 1; - - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, 
&mcast->flags)) { @@ -270,107 +321,35 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, return 0; } -static int -ipoib_mcast_sendonly_join_complete(int status, - struct ib_sa_multicast *multicast) -{ - struct ipoib_mcast *mcast = multicast->context; - struct net_device *dev = mcast->dev; - - /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; - - if (!status) - status = ipoib_mcast_join_finish(mcast, &multicast->rec); - - if (status) { - if (mcast->logcount++ < 20) - ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); - - /* Flush out any queued packets */ - netif_tx_lock_bh(dev); - while (!skb_queue_empty(&mcast->pkt_queue)) { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); - } - netif_tx_unlock_bh(dev); - - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, - &mcast->flags); - } - return status; -} - -static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) -{ - struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_sa_mcmember_rec rec = { -#if 0 /* Some SMs don't support send-only yet */ - .join_state = 4 -#else - .join_state = 1 -#endif - }; - int ret = 0; - - if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { - ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); - return -ENODEV; - } - - if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { - ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); - return -EBUSY; - } - - rec.mgid = mcast->mcmember.mgid; - rec.port_gid = priv->local_gid; - rec.pkey = cpu_to_be16(priv->pkey); - - mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, - priv->port, &rec, - IB_SA_MCMEMBER_REC_MGID | - IB_SA_MCMEMBER_REC_PORT_GID | - IB_SA_MCMEMBER_REC_PKEY | - IB_SA_MCMEMBER_REC_JOIN_STATE, - GFP_ATOMIC, - ipoib_mcast_sendonly_join_complete, - mcast); - if (IS_ERR(mcast->mc)) { - ret = PTR_ERR(mcast->mc); - clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", - ret); - } else { - ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", - mcast->mcmember.mgid.raw); - } - - return ret; -} - void ipoib_mcast_carrier_on_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; - /* - * Take rtnl_lock to avoid racing with ipoib_stop() and - * turning the carrier back on while a device is being - * removed. - */ if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } - rtnl_lock(); + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. However, ipoib_stop() will attempt to flush + * the workqueue while holding the rtnl lock, so loop + * on trylock until either we get the lock or we see + * FLAG_OPER_UP go away as that signals that we are bailing + * and can safely ignore the carrier on work. 
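The trylock-with-bailout pattern described above generalizes: spin on a trylock while re-checking a shutdown flag, so a lock holder that is waiting on this work item can never deadlock against it. A minimal sketch, assuming a hypothetical example_priv and EXAMPLE_STOP flag bit (rtnl_trylock, msleep and netif_carrier_on are the real kernel APIs):

        #include <linux/rtnetlink.h>
        #include <linux/delay.h>

        #define EXAMPLE_STOP 0                  /* hypothetical flag bit */

        struct example_priv {
                unsigned long flags;
                struct net_device *dev;
        };

        static void example_carrier_task(struct example_priv *priv)
        {
                /* The rtnl holder may be waiting to flush our workqueue,
                 * so never block on rtnl_lock(); poll and bail out on
                 * shutdown instead. */
                while (!rtnl_trylock()) {
                        if (test_bit(EXAMPLE_STOP, &priv->flags))
                                return;
                        msleep(20);
                }
                netif_carrier_on(priv->dev);
                rtnl_unlock();
        }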
+ */ + while (!rtnl_trylock()) { + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + else + msleep(20); + } + if (!ipoib_cm_admin_enabled(priv->dev)) + dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); netif_carrier_on(priv->dev); rtnl_unlock(); } @@ -382,7 +361,9 @@ static int ipoib_mcast_join_complete(int status, struct net_device *dev = mcast->dev; struct ipoib_dev_priv *priv = netdev_priv(dev); - ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", + ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? + "sendonly " : "", mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ @@ -396,49 +377,74 @@ static int ipoib_mcast_join_complete(int status, if (!status) { mcast->backoff = 1; - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, 0); - mutex_unlock(&mcast_mutex); + mcast->delay_until = jiffies; /* - * Defer carrier on work to ipoib_workqueue to avoid a - * deadlock on rtnl_lock here. + * Defer carrier on work to priv->wq to avoid a + * deadlock on rtnl_lock here. Requeue our multicast + * work too, which will end up happening right after + * our carrier on task work and will allow us to + * send out all of the non-broadcast joins */ - if (mcast == priv->broadcast) - queue_work(ipoib_workqueue, &priv->carrier_on_task); - - status = 0; - goto out; - } + if (mcast == priv->broadcast) { + spin_lock_irq(&priv->lock); + queue_work(priv->wq, &priv->carrier_on_task); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + goto out_locked; + } + } else { + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } + } - if (mcast->logcount++ < 20) { - if (status == -ETIMEDOUT || status == -EAGAIN) { - ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + mcast->backoff >= 2) { + /* + * We only retry sendonly joins once before we drop + * the packet and quit trying to deal with the + * group. However, we leave the group in the + * mcast list as an unjoined group. If we want to + * try joining again, we simply queue up a packet + * and restart the join thread. The empty queue + * is why the join thread ignores this group. 
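The retry handling here leans on the capped exponential backoff that __ipoib_mcast_schedule_join_thread sets up. Stripped of the IPoIB specifics, the computation amounts to the following sketch (example_mcast and the cap value are illustrative; the field names mirror the patch):

        #define EXAMPLE_MAX_BACKOFF_SECONDS 16  /* hypothetical cap */

        struct example_mcast {
                unsigned int backoff;           /* seconds, starts at 1 */
                unsigned long delay_until;      /* jiffies deadline */
        };

        static void example_schedule_retry(struct example_mcast *mcast)
        {
                mcast->backoff *= 2;            /* 1, 2, 4, 8, ... capped */
                if (mcast->backoff > EXAMPLE_MAX_BACKOFF_SECONDS)
                        mcast->backoff = EXAMPLE_MAX_BACKOFF_SECONDS;
                mcast->delay_until = jiffies + mcast->backoff * HZ;
                /* the join thread skips this entry until delay_until */
        }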
+ */ + mcast->backoff = 1; + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); } else { - ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); + goto out_locked; } } - - mcast->backoff *= 2; - if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) - mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - - mutex_lock(&mcast_mutex); +out: spin_lock_irq(&priv->lock); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, - mcast->backoff * HZ); +out_locked: + /* + * Make sure to set mcast->mc before we clear the busy flag to avoid + * racing with code that checks for BUSY before checking mcast->mc + */ + if (status) + mcast->mc = NULL; + else + mcast->mc = multicast; + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); spin_unlock_irq(&priv->lock); - mutex_unlock(&mcast_mutex); -out: complete(&mcast->done); + return status; } @@ -446,6 +452,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, int create) { struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_multicast *multicast; struct ib_sa_mcmember_rec rec = { .join_state = 1 }; @@ -487,29 +494,18 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, rec.hop_limit = priv->broadcast->mcmember.hop_limit; } - set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - init_completion(&mcast->done); - set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); - - mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); - if (IS_ERR(mcast->mc)) { + if (IS_ERR(multicast)) { + ret = PTR_ERR(multicast); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + spin_unlock_irq(&priv->lock); complete(&mcast->done); - ret = PTR_ERR(mcast->mc); - ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); - - mcast->backoff *= 2; - if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) - mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, - mcast->backoff * HZ); - mutex_unlock(&mcast_mutex); } } @@ -519,8 +515,11 @@ void ipoib_mcast_join_task(struct work_struct *work) container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; struct ib_port_attr port_attr; + unsigned long delay_until = 0; + struct ipoib_mcast *mcast = NULL; + int create = 1; - if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) return; if (ib_query_port(priv->ca, priv->port, &port_attr) || @@ -536,93 +535,118 @@ void ipoib_mcast_join_task(struct work_struct *work) else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + spin_lock_irq(&priv->lock); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + 
goto out; + if (!priv->broadcast) { struct ipoib_mcast *broadcast; - if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - return; - - broadcast = ipoib_mcast_alloc(dev, 1); + broadcast = ipoib_mcast_alloc(dev, 0); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, HZ); - mutex_unlock(&mcast_mutex); - return; + /* + * Restart us after a 1 second delay to retry + * creating our broadcast group and attaching to + * it. Until this succeeds, this ipoib dev is + * completely stalled (multicast wise). + */ + __ipoib_mcast_schedule_join_thread(priv, NULL, 1); + goto out; } - spin_lock_irq(&priv->lock); memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; __ipoib_mcast_add(dev, priv->broadcast); - spin_unlock_irq(&priv->lock); } if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { - if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) - ipoib_mcast_join(dev, priv->broadcast, 0); - return; - } - - while (1) { - struct ipoib_mcast *mcast = NULL; - - spin_lock_irq(&priv->lock); - list_for_each_entry(mcast, &priv->multicast_list, list) { - if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { - /* Found the next unjoined group */ - break; + if (IS_ERR_OR_NULL(priv->broadcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) { + mcast = priv->broadcast; + create = 0; + if (mcast->backoff > 1 && + time_before(jiffies, mcast->delay_until)) { + delay_until = mcast->delay_until; + mcast = NULL; } } - spin_unlock_irq(&priv->lock); + goto out; + } - if (&mcast->list == &priv->multicast_list) { - /* All done */ - break; + /* + * We'll never get here until the broadcast group is both allocated + * and attached + */ + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (IS_ERR_OR_NULL(mcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && + (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) || + !skb_queue_empty(&mcast->pkt_queue))) { + if (mcast->backoff == 1 || + time_after_eq(jiffies, mcast->delay_until)) { + /* Found the next unjoined group */ + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + create = 0; + else + create = 1; + spin_unlock_irq(&priv->lock); + ipoib_mcast_join(dev, mcast, create); + spin_lock_irq(&priv->lock); + } else if (!delay_until || + time_before(mcast->delay_until, delay_until)) + delay_until = mcast->delay_until; } - - ipoib_mcast_join(dev, mcast, 1); - return; } - ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); + mcast = NULL; + ipoib_dbg_mcast(priv, "successfully started all multicast joins\n"); - clear_bit(IPOIB_MCAST_RUN, &priv->flags); +out: + if (delay_until) { + cancel_delayed_work(&priv->mcast_task); + queue_delayed_work(priv->wq, &priv->mcast_task, + delay_until - jiffies); + } + if (mcast) { + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + } + spin_unlock_irq(&priv->lock); + if (mcast) + ipoib_mcast_join(dev, mcast, create); } int ipoib_mcast_start_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long flags; ipoib_dbg_mcast(priv, "starting multicast thread\n"); - mutex_lock(&mcast_mutex); - 
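The reworked join task above follows a common single-worker pattern: walk the list under the lock, start whatever is ready, remember the earliest deadline among the rest, and requeue one delayed work item for that deadline. In outline (the types and example_start_join are stand-ins; the jiffies arithmetic matches the patch):

        static void example_join_task(struct example_priv *priv)
        {
                unsigned long next_deadline = 0;
                struct example_mcast *m;

                spin_lock_irq(&priv->lock);
                list_for_each_entry(m, &priv->multicast_list, list) {
                        if (time_after_eq(jiffies, m->delay_until)) {
                                example_start_join(m);          /* ready now */
                        } else if (!next_deadline ||
                                   time_before(m->delay_until, next_deadline)) {
                                next_deadline = m->delay_until;
                        }
                }
                if (next_deadline)
                        queue_delayed_work(priv->wq, &priv->mcast_task,
                                           next_deadline - jiffies);
                spin_unlock_irq(&priv->lock);
        }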
if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); - mutex_unlock(&mcast_mutex); + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); return 0; } -int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +int ipoib_mcast_stop_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long flags; ipoib_dbg_mcast(priv, "stopping multicast thread\n"); - mutex_lock(&mcast_mutex); - clear_bit(IPOIB_MCAST_RUN, &priv->flags); + spin_lock_irqsave(&priv->lock, flags); cancel_delayed_work(&priv->mcast_task); - mutex_unlock(&mcast_mutex); + spin_unlock_irqrestore(&priv->lock, flags); - if (flush) - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); return 0; } @@ -633,6 +657,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); + + if (!IS_ERR_OR_NULL(mcast->mc)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { @@ -644,7 +671,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) be16_to_cpu(mcast->mcmember.mlid)); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); - } + } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_dbg(priv, "leaving with no mcmember but not a " + "SENDONLY join\n"); return 0; } @@ -667,49 +696,37 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) } mcast = __ipoib_mcast_find(dev, mgid); - if (!mcast) { - /* Let's create a new send only group now */ - ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", - mgid); - - mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast || !mcast->ah) { if (!mcast) { - ipoib_warn(priv, "unable to allocate memory for " - "multicast structure\n"); - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - goto out; - } - - set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); - memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); - __ipoib_mcast_add(dev, mcast); - list_add_tail(&mcast->list, &priv->multicast_list); - } + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); + + mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory " + "for multicast structure\n"); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } - if (!mcast->ah) { + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, + sizeof (union ib_gid)); + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) skb_queue_tail(&mcast->pkt_queue, skb); else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } - - if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) - ipoib_dbg_mcast(priv, "no address vector, " - "but multicast join already started\n"); - else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - ipoib_mcast_sendonly_join(mcast); - - /* - * If lookup completes between here and out:, don't - * want to send packet twice. 
- */ - mcast = NULL; - } - -out: - if (mcast && mcast->ah) { + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + } + } else { struct ipoib_neigh *neigh; spin_unlock_irqrestore(&priv->lock, flags); @@ -759,9 +776,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); - /* seperate between the wait to the leave*/ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { @@ -792,9 +812,14 @@ void ipoib_mcast_restart_task(struct work_struct *work) unsigned long flags; struct ib_sa_mcmember_rec rec; - ipoib_dbg_mcast(priv, "restarting multicast task\n"); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + /* + * shortcut...on shutdown flush is called next, just + * let it do all the work + */ + return; - ipoib_mcast_stop_thread(dev, 0); + ipoib_dbg_mcast(priv, "restarting multicast task\n"); local_irq_save(flags); netif_addr_lock(dev); @@ -880,14 +905,27 @@ void ipoib_mcast_restart_task(struct work_struct *work) netif_addr_unlock(dev); local_irq_restore(flags); - /* We have to cancel outside of the spinlock */ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(mcast->dev, mcast); ipoib_mcast_free(mcast); } - if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - ipoib_mcast_start_thread(dev); + /* + * Double check that we are still up + */ + if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); + } } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index c56d5d44c53b..e5cc43074196 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -157,6 +157,16 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) goto out_free_pd; } + /* + * the various IPoIB tasks assume they will never race against + * themselves, so always use a single thread workqueue + */ + priv->wq = create_singlethread_workqueue("ipoib_wq"); + if (!priv->wq) { + printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); + goto out_free_mr; + } + size = ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(dev); if (!ret) { @@ -165,12 +175,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ else size += ipoib_recvq_size * ipoib_max_conn_qp; - } + } else + goto out_free_wq; priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); - goto out_free_mr; + goto out_cm_dev_cleanup; } priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, @@ -216,15 +227,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->tx_wr.send_flags = 
IB_SEND_SIGNALED; priv->rx_sge[0].lkey = priv->mr->lkey; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; - priv->rx_sge[1].length = PAGE_SIZE; - priv->rx_sge[1].lkey = priv->mr->lkey; - priv->rx_wr.num_sge = IPOIB_UD_RX_SG; - } else { - priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - priv->rx_wr.num_sge = 1; - } + + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; @@ -236,12 +242,19 @@ out_free_send_cq: out_free_recv_cq: ib_destroy_cq(priv->recv_cq); +out_cm_dev_cleanup: + ipoib_cm_dev_cleanup(dev); + +out_free_wq: + destroy_workqueue(priv->wq); + priv->wq = NULL; + out_free_mr: ib_dereg_mr(priv->mr); - ipoib_cm_dev_cleanup(dev); out_free_pd: ib_dealloc_pd(priv->pd); + return -ENODEV; } @@ -265,11 +278,18 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) ipoib_cm_dev_cleanup(dev); + if (priv->wq) { + flush_workqueue(priv->wq); + destroy_workqueue(priv->wq); + priv->wq = NULL; + } + if (ib_dereg_mr(priv->mr)) ipoib_warn(priv, "ib_dereg_mr failed\n"); if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); + } void ipoib_event(struct ib_event_handler *handler, diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index b47aea1094b2..262ba1f8ee50 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -69,7 +69,7 @@ #define DRV_NAME "iser" #define PFX DRV_NAME ": " -#define DRV_VER "1.5" +#define DRV_VER "1.6" #define iser_dbg(fmt, arg...) \ do { \ @@ -218,22 +218,21 @@ enum iser_data_dir { /** * struct iser_data_buf - iSER data buffer * - * @buf: pointer to the sg list + * @sg: pointer to the sg list * @size: num entries of this sg * @data_len: total beffer byte len * @dma_nents: returned by dma_map_sg - * @copy_buf: allocated copy buf for SGs unaligned - * for rdma which are copied - * @sg_single: SG-ified clone of a non SG SC or - * unaligned SG + * @orig_sg: pointer to the original sg list (in case + * we used a copy) + * @orig_size: num entries of orig sg list */ struct iser_data_buf { - void *buf; + struct scatterlist *sg; unsigned int size; unsigned long data_len; unsigned int dma_nents; - char *copy_buf; - struct scatterlist sg_single; + struct scatterlist *orig_sg; + unsigned int orig_size; }; /* fwd declarations */ @@ -244,35 +243,14 @@ struct iscsi_endpoint; /** * struct iser_mem_reg - iSER memory registration info * - * @lkey: MR local key - * @rkey: MR remote key - * @va: MR start address (buffer va) - * @len: MR length + * @sge: memory region sg element + * @rkey: memory region remote key * @mem_h: pointer to registration context (FMR/Fastreg) */ struct iser_mem_reg { - u32 lkey; - u32 rkey; - u64 va; - u64 len; - void *mem_h; -}; - -/** - * struct iser_regd_buf - iSER buffer registration desc - * - * @reg: memory registration info - * @virt_addr: virtual address of buffer - * @device: reference to iser device - * @direction: dma direction (for dma_unmap) - * @data_size: data buffer size in bytes - */ -struct iser_regd_buf { - struct iser_mem_reg reg; - void *virt_addr; - struct iser_device *device; - enum dma_data_direction direction; - unsigned int data_size; + struct ib_sge sge; + u32 rkey; + void *mem_h; }; enum iser_desc_type { @@ -534,11 +512,9 @@ struct iser_conn { * @sc: link to scsi command * @command_sent: indicate if command was sent * @dir: iser data direction - * @rdma_regd: task rdma
registration desc + * @rdma_reg: task rdma registration desc * @data: iser data buffer desc - * @data_copy: iser data copy buffer desc (bounce buffer) * @prot: iser protection buffer desc - * @prot_copy: iser protection copy buffer desc (bounce buffer) */ struct iscsi_iser_task { struct iser_tx_desc desc; @@ -547,11 +523,9 @@ struct iscsi_iser_task { struct scsi_cmnd *sc; int command_sent; int dir[ISER_DIRS_NUM]; - struct iser_regd_buf rdma_regd[ISER_DIRS_NUM]; + struct iser_mem_reg rdma_reg[ISER_DIRS_NUM]; struct iser_data_buf data[ISER_DIRS_NUM]; - struct iser_data_buf data_copy[ISER_DIRS_NUM]; struct iser_data_buf prot[ISER_DIRS_NUM]; - struct iser_data_buf prot_copy[ISER_DIRS_NUM]; }; struct iser_page_vec { @@ -621,7 +595,6 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn); void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, - struct iser_data_buf *mem_copy, enum iser_data_dir cmd_dir); int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, @@ -634,10 +607,6 @@ int iser_connect(struct iser_conn *iser_conn, struct sockaddr *dst_addr, int non_blocking); -int iser_reg_page_vec(struct ib_conn *ib_conn, - struct iser_page_vec *page_vec, - struct iser_mem_reg *mem_reg); - void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, @@ -667,4 +636,9 @@ int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max); void iser_free_fastreg_pool(struct ib_conn *ib_conn); u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector); +struct fast_reg_descriptor * +iser_reg_desc_get(struct ib_conn *ib_conn); +void +iser_reg_desc_put(struct ib_conn *ib_conn, + struct fast_reg_descriptor *desc); #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 20e859a6f1a6..3e2118e8ed87 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -50,7 +50,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; struct iser_device *device = iser_task->iser_conn->ib_conn.device; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; @@ -78,15 +78,15 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) iser_err("Failed to set up Data-IN RDMA\n"); return err; } - regd_buf = &iser_task->rdma_regd[ISER_DIR_IN]; + mem_reg = &iser_task->rdma_reg[ISER_DIR_IN]; hdr->flags |= ISER_RSV; - hdr->read_stag = cpu_to_be32(regd_buf->reg.rkey); - hdr->read_va = cpu_to_be64(regd_buf->reg.va); + hdr->read_stag = cpu_to_be32(mem_reg->rkey); + hdr->read_va = cpu_to_be64(mem_reg->sge.addr); iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n", - task->itt, regd_buf->reg.rkey, - (unsigned long long)regd_buf->reg.va); + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr); return 0; } @@ -104,7 +104,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, { struct iscsi_iser_task *iser_task = task->dd_data; struct iser_device *device = iser_task->iser_conn->ib_conn.device; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; @@ -134,25 +134,25 @@ 
iser_prepare_write_cmd(struct iscsi_task *task, return err; } - regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; if (unsol_sz < edtl) { hdr->flags |= ISER_WSV; - hdr->write_stag = cpu_to_be32(regd_buf->reg.rkey); - hdr->write_va = cpu_to_be64(regd_buf->reg.va + unsol_sz); + hdr->write_stag = cpu_to_be32(mem_reg->rkey); + hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " "VA:%#llX + unsol:%d\n", - task->itt, regd_buf->reg.rkey, - (unsigned long long)regd_buf->reg.va, unsol_sz); + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr, unsol_sz); } if (imm_sz > 0) { iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", task->itt, imm_sz); - tx_dsg->addr = regd_buf->reg.va; + tx_dsg->addr = mem_reg->sge.addr; tx_dsg->length = imm_sz; - tx_dsg->lkey = regd_buf->reg.lkey; + tx_dsg->lkey = mem_reg->sge.lkey; iser_task->desc.num_sge = 2; } @@ -401,16 +401,16 @@ int iser_send_command(struct iscsi_conn *conn, } if (scsi_sg_count(sc)) { /* using a scatter list */ - data_buf->buf = scsi_sglist(sc); + data_buf->sg = scsi_sglist(sc); data_buf->size = scsi_sg_count(sc); } data_buf->data_len = scsi_bufflen(sc); if (scsi_prot_sg_count(sc)) { - prot_buf->buf = scsi_prot_sglist(sc); + prot_buf->sg = scsi_prot_sglist(sc); prot_buf->size = scsi_prot_sg_count(sc); - prot_buf->data_len = data_buf->data_len >> - ilog2(sc->device->sector_size) * 8; + prot_buf->data_len = (data_buf->data_len >> + ilog2(sc->device->sector_size)) * 8; } if (hdr->flags & ISCSI_FLAG_CMD_READ) { @@ -450,7 +450,7 @@ int iser_send_data_out(struct iscsi_conn *conn, struct iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *tx_desc = NULL; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; unsigned long buf_offset; unsigned long data_seg_len; uint32_t itt; @@ -477,11 +477,11 @@ int iser_send_data_out(struct iscsi_conn *conn, /* build the tx desc */ iser_initialize_task_headers(task, tx_desc); - regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; tx_dsg = &tx_desc->tx_sg[1]; - tx_dsg->addr = regd_buf->reg.va + buf_offset; - tx_dsg->length = data_seg_len; - tx_dsg->lkey = regd_buf->reg.lkey; + tx_dsg->addr = mem_reg->sge.addr + buf_offset; + tx_dsg->length = data_seg_len; + tx_dsg->lkey = mem_reg->sge.lkey; tx_desc->num_sge = 2; if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { @@ -658,10 +658,10 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) iser_task->prot[ISER_DIR_IN].data_len = 0; iser_task->prot[ISER_DIR_OUT].data_len = 0; - memset(&iser_task->rdma_regd[ISER_DIR_IN], 0, - sizeof(struct iser_regd_buf)); - memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0, - sizeof(struct iser_regd_buf)); + memset(&iser_task->rdma_reg[ISER_DIR_IN], 0, + sizeof(struct iser_mem_reg)); + memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0, + sizeof(struct iser_mem_reg)); } void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) @@ -674,35 +674,31 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) /* if we were reading, copy back to unaligned sglist, * anyway dma_unmap and free the copy */ - if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) { + if (iser_task->data[ISER_DIR_IN].orig_sg) { is_rdma_data_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->data[ISER_DIR_IN], - &iser_task->data_copy[ISER_DIR_IN], ISER_DIR_IN); } - if 
(iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) { + if (iser_task->data[ISER_DIR_OUT].orig_sg) { is_rdma_data_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->data[ISER_DIR_OUT], - &iser_task->data_copy[ISER_DIR_OUT], ISER_DIR_OUT); } - if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) { + if (iser_task->prot[ISER_DIR_IN].orig_sg) { is_rdma_prot_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->prot[ISER_DIR_IN], - &iser_task->prot_copy[ISER_DIR_IN], ISER_DIR_IN); } - if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) { + if (iser_task->prot[ISER_DIR_OUT].orig_sg) { is_rdma_prot_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->prot[ISER_DIR_OUT], - &iser_task->prot_copy[ISER_DIR_OUT], ISER_DIR_OUT); } diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 341040bf0984..f0cdc961eb11 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -39,68 +39,173 @@ #include "iscsi_iser.h" -#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ +static void +iser_free_bounce_sg(struct iser_data_buf *data) +{ + struct scatterlist *sg; + int count; -/** - * iser_start_rdma_unaligned_sg - */ -static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, - struct iser_data_buf *data, - struct iser_data_buf *data_copy, - enum iser_data_dir cmd_dir) + for_each_sg(data->sg, sg, data->size, count) + __free_page(sg_page(sg)); + + kfree(data->sg); + + data->sg = data->orig_sg; + data->size = data->orig_size; + data->orig_sg = NULL; + data->orig_size = 0; +} + +static int +iser_alloc_bounce_sg(struct iser_data_buf *data) { - struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; - struct scatterlist *sgl = (struct scatterlist *)data->buf; struct scatterlist *sg; - char *mem = NULL; - unsigned long cmd_data_len = 0; - int dma_nents, i; + struct page *page; + unsigned long length = data->data_len; + int i = 0, nents = DIV_ROUND_UP(length, PAGE_SIZE); - for_each_sg(sgl, sg, data->size, i) - cmd_data_len += ib_sg_dma_len(dev, sg); + sg = kcalloc(nents, sizeof(*sg), GFP_ATOMIC); + if (!sg) + goto err; - if (cmd_data_len > ISER_KMALLOC_THRESHOLD) - mem = (void *)__get_free_pages(GFP_ATOMIC, - ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); - else - mem = kmalloc(cmd_data_len, GFP_ATOMIC); + sg_init_table(sg, nents); + while (length) { + u32 page_len = min_t(u32, length, PAGE_SIZE); - if (mem == NULL) { - iser_err("Failed to allocate mem size %d %d for copying sglist\n", - data->size, (int)cmd_data_len); - return -ENOMEM; + page = alloc_page(GFP_ATOMIC); + if (!page) + goto err; + + sg_set_page(&sg[i], page, page_len, 0); + length -= page_len; + i++; } - if (cmd_dir == ISER_DIR_OUT) { - /* copy the unaligned sg the buffer which is used for RDMA */ - char *p, *from; - - sgl = (struct scatterlist *)data->buf; - p = mem; - for_each_sg(sgl, sg, data->size, i) { - from = kmap_atomic(sg_page(sg)); - memcpy(p, - from + sg->offset, - sg->length); - kunmap_atomic(from); - p += sg->length; + data->orig_sg = data->sg; + data->orig_size = data->size; + data->sg = sg; + data->size = nents; + + return 0; + +err: + for (; i > 0; i--) + __free_page(sg_page(&sg[i - 1])); + kfree(sg); + + return -ENOMEM; +} + +static void +iser_copy_bounce(struct iser_data_buf *data, bool to_buffer) +{ + struct scatterlist *osg, *bsg = data->sg; + void *oaddr, *baddr; + unsigned int left = data->data_len; + unsigned int bsg_off = 0; + 
int i; + + for_each_sg(data->orig_sg, osg, data->orig_size, i) { + unsigned int copy_len, osg_off = 0; + + oaddr = kmap_atomic(sg_page(osg)) + osg->offset; + copy_len = min(left, osg->length); + while (copy_len) { + unsigned int len = min(copy_len, bsg->length - bsg_off); + + baddr = kmap_atomic(sg_page(bsg)) + bsg->offset; + if (to_buffer) + memcpy(baddr + bsg_off, oaddr + osg_off, len); + else + memcpy(oaddr + osg_off, baddr + bsg_off, len); + + kunmap_atomic(baddr - bsg->offset); + osg_off += len; + bsg_off += len; + copy_len -= len; + + if (bsg_off >= bsg->length) { + bsg = sg_next(bsg); + bsg_off = 0; + } } + kunmap_atomic(oaddr - osg->offset); + left -= osg_off; } +} + +static inline void +iser_copy_from_bounce(struct iser_data_buf *data) +{ + iser_copy_bounce(data, false); +} + +static inline void +iser_copy_to_bounce(struct iser_data_buf *data) +{ + iser_copy_bounce(data, true); +} + +struct fast_reg_descriptor * +iser_reg_desc_get(struct ib_conn *ib_conn) +{ + struct fast_reg_descriptor *desc; + unsigned long flags; + + spin_lock_irqsave(&ib_conn->lock, flags); + desc = list_first_entry(&ib_conn->fastreg.pool, + struct fast_reg_descriptor, list); + list_del(&desc->list); + spin_unlock_irqrestore(&ib_conn->lock, flags); + + return desc; +} + +void +iser_reg_desc_put(struct ib_conn *ib_conn, + struct fast_reg_descriptor *desc) +{ + unsigned long flags; - sg_init_one(&data_copy->sg_single, mem, cmd_data_len); - data_copy->buf = &data_copy->sg_single; - data_copy->size = 1; - data_copy->copy_buf = mem; + spin_lock_irqsave(&ib_conn->lock, flags); + list_add(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_irqrestore(&ib_conn->lock, flags); +} - dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1, - (cmd_dir == ISER_DIR_OUT) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - BUG_ON(dma_nents == 0); +/** + * iser_start_rdma_unaligned_sg + */ +static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir cmd_dir) +{ + struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; + int rc; + + rc = iser_alloc_bounce_sg(data); + if (rc) { + iser_err("Failed to allocate bounce for data len %lu\n", + data->data_len); + return rc; + } + + if (cmd_dir == ISER_DIR_OUT) + iser_copy_to_bounce(data); - data_copy->dma_nents = dma_nents; - data_copy->data_len = cmd_data_len; + data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size, + (cmd_dir == ISER_DIR_OUT) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (!data->dma_nents) { + iser_err("Got dma_nents %d, something went wrong...\n", + data->dma_nents); + rc = -ENOMEM; + goto err; + } return 0; +err: + iser_free_bounce_sg(data); + return rc; } /** @@ -109,51 +214,18 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *data, - struct iser_data_buf *data_copy, enum iser_data_dir cmd_dir) { - struct ib_device *dev; - unsigned long cmd_data_len; - - dev = iser_task->iser_conn->ib_conn.device->ib_device; + struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; - ib_dma_unmap_sg(dev, &data_copy->sg_single, 1, + ib_dma_unmap_sg(dev, data->sg, data->size, (cmd_dir == ISER_DIR_OUT) ? 
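The iser_reg_desc_get/put helpers above implement a simple free-list object pool guarded by the connection spinlock; the same shape applies to any preallocated descriptor cache. A generic sketch (example_pool and example_desc are hypothetical; like the driver, it assumes the pool is sized so get is never called on an empty list):

        #include <linux/list.h>
        #include <linux/spinlock.h>

        struct example_desc {
                struct list_head list;
        };

        struct example_pool {
                spinlock_t lock;
                struct list_head free_list;     /* of example_desc.list */
        };

        static struct example_desc *example_desc_get(struct example_pool *p)
        {
                struct example_desc *d;
                unsigned long flags;

                spin_lock_irqsave(&p->lock, flags);
                d = list_first_entry(&p->free_list, struct example_desc, list);
                list_del(&d->list);
                spin_unlock_irqrestore(&p->lock, flags);
                return d;
        }

        static void example_desc_put(struct example_pool *p,
                                     struct example_desc *d)
        {
                unsigned long flags;

                spin_lock_irqsave(&p->lock, flags);
                list_add(&d->list, &p->free_list);
                spin_unlock_irqrestore(&p->lock, flags);
        }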
DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (cmd_dir == ISER_DIR_IN) { - char *mem; - struct scatterlist *sgl, *sg; - unsigned char *p, *to; - unsigned int sg_size; - int i; - - /* copy back read RDMA to unaligned sg */ - mem = data_copy->copy_buf; - - sgl = (struct scatterlist *)data->buf; - sg_size = data->size; - - p = mem; - for_each_sg(sgl, sg, sg_size, i) { - to = kmap_atomic(sg_page(sg)); - memcpy(to + sg->offset, - p, - sg->length); - kunmap_atomic(to); - p += sg->length; - } - } + if (cmd_dir == ISER_DIR_IN) + iser_copy_from_bounce(data); - cmd_data_len = data->data_len; - - if (cmd_data_len > ISER_KMALLOC_THRESHOLD) - free_pages((unsigned long)data_copy->copy_buf, - ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); - else - kfree(data_copy->copy_buf); - - data_copy->copy_buf = NULL; + iser_free_bounce_sg(data); } #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) @@ -175,7 +247,7 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, struct ib_device *ibdev, u64 *pages, int *offset, int *data_size) { - struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg, *sgl = data->sg; u64 start_addr, end_addr, page, chunk_start = 0; unsigned long total_sz = 0; unsigned int dma_len; @@ -227,14 +299,14 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, static int iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev) { - struct scatterlist *sgl, *sg, *next_sg = NULL; + struct scatterlist *sg, *sgl, *next_sg = NULL; u64 start_addr, end_addr; int i, ret_len, start_check = 0; if (data->dma_nents == 1) return 1; - sgl = (struct scatterlist *)data->buf; + sgl = data->sg; start_addr = ib_sg_dma_address(ibdev, sgl); for_each_sg(sgl, sg, data->dma_nents, i) { @@ -266,11 +338,10 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data, static void iser_data_buf_dump(struct iser_data_buf *data, struct ib_device *ibdev) { - struct scatterlist *sgl = (struct scatterlist *)data->buf; struct scatterlist *sg; int i; - for_each_sg(sgl, sg, data->dma_nents, i) + for_each_sg(data->sg, sg, data->dma_nents, i) iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " "off:0x%x sz:0x%x dma_len:0x%x\n", i, (unsigned long)ib_sg_dma_address(ibdev, sg), @@ -288,31 +359,6 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); } -static void iser_page_vec_build(struct iser_data_buf *data, - struct iser_page_vec *page_vec, - struct ib_device *ibdev) -{ - int page_vec_len = 0; - - page_vec->length = 0; - page_vec->offset = 0; - - iser_dbg("Translating sg sz: %d\n", data->dma_nents); - page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages, - &page_vec->offset, - &page_vec->data_size); - iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len); - - page_vec->length = page_vec_len; - - if (page_vec_len * SIZE_4K < page_vec->data_size) { - iser_err("page_vec too short to hold this SG\n"); - iser_data_buf_dump(data, ibdev); - iser_dump_page_vec(page_vec); - BUG(); - } -} - int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, struct iser_data_buf *data, enum iser_data_dir iser_dir, @@ -323,7 +369,7 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, iser_task->dir[iser_dir] = 1; dev = iser_task->iser_conn->ib_conn.device->ib_device; - data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); + data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size, dma_dir); if (data->dma_nents == 0) { iser_err("dma_map_sg 
failed!!!\n"); return -EINVAL; @@ -338,24 +384,41 @@ void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, struct ib_device *dev; dev = iser_task->iser_conn->ib_conn.device->ib_device; - ib_dma_unmap_sg(dev, data->buf, data->size, dir); + ib_dma_unmap_sg(dev, data->sg, data->size, dir); +} + +static int +iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, + struct iser_mem_reg *reg) +{ + struct scatterlist *sg = mem->sg; + + reg->sge.lkey = device->mr->lkey; + reg->rkey = device->mr->rkey; + reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); + reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx," + " length=0x%x\n", reg->sge.lkey, reg->rkey, + reg->sge.addr, reg->sge.length); + + return 0; } static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, - struct ib_device *ibdev, struct iser_data_buf *mem, - struct iser_data_buf *mem_copy, enum iser_data_dir cmd_dir, int aligned_len) { - struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; + struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; + struct iser_device *device = iser_task->iser_conn->ib_conn.device; iscsi_conn->fmr_unalign_cnt++; iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", aligned_len, mem->size); if (iser_debug_level > 0) - iser_data_buf_dump(mem, ibdev); + iser_data_buf_dump(mem, device->ib_device); /* unmap the command data before accessing it */ iser_dma_unmap_task_data(iser_task, mem, @@ -364,13 +427,95 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, /* allocate copy buf, if we are writing, copy the */ /* unaligned scatterlist, dma map the copy */ - if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0) + if (iser_start_rdma_unaligned_sg(iser_task, mem, cmd_dir) != 0) return -ENOMEM; return 0; } /** + * iser_reg_page_vec - Register physical memory + * + * returns: 0 on success, errno code on failure + */ +static +int iser_reg_page_vec(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg) +{ + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_pool_fmr *fmr; + int ret, plen; + + plen = iser_sg_to_page_vec(mem, device->ib_device, + page_vec->pages, + &page_vec->offset, + &page_vec->data_size); + page_vec->length = plen; + if (plen * SIZE_4K < page_vec->data_size) { + iser_err("page vec too short to hold this SG\n"); + iser_data_buf_dump(mem, device->ib_device); + iser_dump_page_vec(page_vec); + return -EINVAL; + } + + fmr = ib_fmr_pool_map_phys(ib_conn->fmr.pool, + page_vec->pages, + page_vec->length, + page_vec->pages[0]); + if (IS_ERR(fmr)) { + ret = PTR_ERR(fmr); + iser_err("ib_fmr_pool_map_phys failed: %d\n", ret); + return ret; + } + + mem_reg->sge.lkey = fmr->fmr->lkey; + mem_reg->rkey = fmr->fmr->rkey; + mem_reg->sge.addr = page_vec->pages[0] + page_vec->offset; + mem_reg->sge.length = page_vec->data_size; + mem_reg->mem_h = fmr; + + return 0; +} + +/** + * Unregister (previously registered using FMR) memory. + * If memory is non-FMR does nothing.
+ */ +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + int ret; + + if (!reg->mem_h) + return; + + iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h); + + ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); + if (ret) + iser_err("ib_fmr_pool_unmap failed %d\n", ret); + + reg->mem_h = NULL; +} + +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + + if (!reg->mem_h) + return; + + iser_reg_desc_put(&iser_task->iser_conn->ib_conn, + reg->mem_h); + reg->mem_h = NULL; +} + +/** * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA, * using FMR (if possible) obtaining rkey and va * @@ -383,45 +528,29 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; int aligned_len; int err; int i; - struct scatterlist *sg; - regd_buf = &iser_task->rdma_regd[cmd_dir]; + mem_reg = &iser_task->rdma_reg[cmd_dir]; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, mem, - &iser_task->data_copy[cmd_dir], + err = fall_to_bounce_buf(iser_task, mem, cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); return err; } - mem = &iser_task->data_copy[cmd_dir]; } /* if there a single dma entry, FMR is not needed */ if (mem->dma_nents == 1) { - sg = (struct scatterlist *)mem->buf; - - regd_buf->reg.lkey = device->mr->lkey; - regd_buf->reg.rkey = device->mr->rkey; - regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); - regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); - - iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " - "va: 0x%08lX sz: %ld]\n", - (unsigned int)regd_buf->reg.lkey, - (unsigned int)regd_buf->reg.rkey, - (unsigned long)regd_buf->reg.va, - (unsigned long)regd_buf->reg.len); + return iser_reg_dma(device, mem, mem_reg); } else { /* use FMR for multiple dma entries */ - iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev); - err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec, - ®d_buf->reg); + err = iser_reg_page_vec(iser_task, mem, ib_conn->fmr.page_vec, + mem_reg); if (err && err != -EAGAIN) { iser_data_buf_dump(mem, ibdev); iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", @@ -519,8 +648,10 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) static int iser_reg_sig_mr(struct iscsi_iser_task *iser_task, - struct fast_reg_descriptor *desc, struct ib_sge *data_sge, - struct ib_sge *prot_sge, struct ib_sge *sig_sge) + struct fast_reg_descriptor *desc, + struct iser_mem_reg *data_reg, + struct iser_mem_reg *prot_reg, + struct iser_mem_reg *sig_reg) { struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_pi_context *pi_ctx = desc->pi_ctx; @@ -544,12 +675,12 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, memset(&sig_wr, 0, sizeof(sig_wr)); sig_wr.opcode = IB_WR_REG_SIG_MR; sig_wr.wr_id = ISER_FASTREG_LI_WRID; - sig_wr.sg_list = data_sge; + sig_wr.sg_list = &data_reg->sge; sig_wr.num_sge = 1; sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; if (scsi_prot_sg_count(iser_task->sc)) - sig_wr.wr.sig_handover.prot = prot_sge; + sig_wr.wr.sig_handover.prot = 
&prot_reg->sge; sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; @@ -566,27 +697,26 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, } desc->reg_indicators &= ~ISER_SIG_KEY_VALID; - sig_sge->lkey = pi_ctx->sig_mr->lkey; - sig_sge->addr = 0; - sig_sge->length = scsi_transfer_length(iser_task->sc); + sig_reg->sge.lkey = pi_ctx->sig_mr->lkey; + sig_reg->rkey = pi_ctx->sig_mr->rkey; + sig_reg->sge.addr = 0; + sig_reg->sge.length = scsi_transfer_length(iser_task->sc); - iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", - sig_sge->addr, sig_sge->length, - sig_sge->lkey); + iser_dbg("sig_sge: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n", + sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, + sig_reg->sge.length); err: return ret; } static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, - struct iser_regd_buf *regd_buf, struct iser_data_buf *mem, + struct fast_reg_descriptor *desc, enum iser_reg_indicator ind, - struct ib_sge *sge) + struct iser_mem_reg *reg) { - struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - struct ib_device *ibdev = device->ib_device; struct ib_mr *mr; struct ib_fast_reg_page_list *frpl; struct ib_send_wr fastreg_wr, inv_wr; @@ -594,17 +724,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, int ret, offset, size, plen; /* if there a single dma entry, dma mr suffices */ - if (mem->dma_nents == 1) { - struct scatterlist *sg = (struct scatterlist *)mem->buf; - - sge->lkey = device->mr->lkey; - sge->addr = ib_sg_dma_address(ibdev, &sg[0]); - sge->length = ib_sg_dma_len(ibdev, &sg[0]); - - iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n", - sge->lkey, sge->addr, sge->length); - return 0; - } + if (mem->dma_nents == 1) + return iser_reg_dma(device, mem, reg); if (ind == ISER_DATA_KEY_VALID) { mr = desc->data_mr; @@ -652,9 +773,10 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, } desc->reg_indicators &= ~ind; - sge->lkey = mr->lkey; - sge->addr = frpl->page_list[0] + offset; - sge->length = size; + reg->sge.lkey = mr->lkey; + reg->rkey = mr->rkey; + reg->sge.addr = frpl->page_list[0] + offset; + reg->sge.length = size; return ret; } @@ -672,93 +794,66 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; - struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; + struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir]; struct fast_reg_descriptor *desc = NULL; - struct ib_sge data_sge; int err, aligned_len; - unsigned long flags; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, mem, - &iser_task->data_copy[cmd_dir], + err = fall_to_bounce_buf(iser_task, mem, cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); return err; } - mem = &iser_task->data_copy[cmd_dir]; } if (mem->dma_nents != 1 || scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { - spin_lock_irqsave(&ib_conn->lock, flags); - desc = list_first_entry(&ib_conn->fastreg.pool, - struct fast_reg_descriptor, list); - list_del(&desc->list); - spin_unlock_irqrestore(&ib_conn->lock, flags); - regd_buf->reg.mem_h = desc; + desc = iser_reg_desc_get(ib_conn); + mem_reg->mem_h = desc; } - err = 
iser_fast_reg_mr(iser_task, regd_buf, mem, - ISER_DATA_KEY_VALID, &data_sge); + err = iser_fast_reg_mr(iser_task, mem, desc, + ISER_DATA_KEY_VALID, mem_reg); if (err) goto err_reg; if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { - struct ib_sge prot_sge, sig_sge; + struct iser_mem_reg prot_reg; - memset(&prot_sge, 0, sizeof(prot_sge)); + memset(&prot_reg, 0, sizeof(prot_reg)); if (scsi_prot_sg_count(iser_task->sc)) { mem = &iser_task->prot[cmd_dir]; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, mem, - &iser_task->prot_copy[cmd_dir], + err = fall_to_bounce_buf(iser_task, mem, cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); return err; } - mem = &iser_task->prot_copy[cmd_dir]; } - err = iser_fast_reg_mr(iser_task, regd_buf, mem, - ISER_PROT_KEY_VALID, &prot_sge); + err = iser_fast_reg_mr(iser_task, mem, desc, + ISER_PROT_KEY_VALID, &prot_reg); if (err) goto err_reg; } - err = iser_reg_sig_mr(iser_task, desc, &data_sge, - &prot_sge, &sig_sge); + err = iser_reg_sig_mr(iser_task, desc, mem_reg, + &prot_reg, mem_reg); if (err) { iser_err("Failed to register signature mr\n"); return err; } desc->reg_indicators |= ISER_FASTREG_PROTECTED; - - regd_buf->reg.lkey = sig_sge.lkey; - regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; - regd_buf->reg.va = sig_sge.addr; - regd_buf->reg.len = sig_sge.length; - } else { - if (desc) - regd_buf->reg.rkey = desc->data_mr->rkey; - else - regd_buf->reg.rkey = device->mr->rkey; - - regd_buf->reg.lkey = data_sge.lkey; - regd_buf->reg.va = data_sge.addr; - regd_buf->reg.len = data_sge.length; } return 0; err_reg: - if (desc) { - spin_lock_irqsave(&ib_conn->lock, flags); - list_add_tail(&desc->list, &ib_conn->fastreg.pool); - spin_unlock_irqrestore(&ib_conn->lock, flags); - } + if (desc) + iser_reg_desc_put(ib_conn, desc); return err; } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 4065abe28829..cc2dd35ffbc0 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -274,6 +274,65 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn) } static int +iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd, + struct fast_reg_descriptor *desc) +{ + struct iser_pi_context *pi_ctx = NULL; + struct ib_mr_init_attr mr_init_attr = {.max_reg_descriptors = 2, + .flags = IB_MR_SIGNATURE_EN}; + int ret = 0; + + desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); + if (!desc->pi_ctx) + return -ENOMEM; + + pi_ctx = desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + ret = PTR_ERR(pi_ctx->prot_frpl); + goto prot_frpl_failure; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(pi_ctx->prot_mr)) { + ret = PTR_ERR(pi_ctx->prot_mr); + goto prot_mr_failure; + } + desc->reg_indicators |= ISER_PROT_KEY_VALID; + + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + ret = PTR_ERR(pi_ctx->sig_mr); + goto sig_mr_failure; + } + desc->reg_indicators |= ISER_SIG_KEY_VALID; + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + + return 0; + +sig_mr_failure: + ib_dereg_mr(desc->pi_ctx->prot_mr); +prot_mr_failure: + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +prot_frpl_failure: + kfree(desc->pi_ctx); + + return ret; +} + +static void +iser_free_pi_ctx(struct iser_pi_context *pi_ctx) 
+{ + ib_free_fast_reg_page_list(pi_ctx->prot_frpl); + ib_dereg_mr(pi_ctx->prot_mr); + ib_destroy_mr(pi_ctx->sig_mr); + kfree(pi_ctx); +} + +static int iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, bool pi_enable, struct fast_reg_descriptor *desc) { @@ -297,59 +356,12 @@ iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, desc->reg_indicators |= ISER_DATA_KEY_VALID; if (pi_enable) { - struct ib_mr_init_attr mr_init_attr = {0}; - struct iser_pi_context *pi_ctx = NULL; - - desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); - if (!desc->pi_ctx) { - iser_err("Failed to allocate pi context\n"); - ret = -ENOMEM; + ret = iser_alloc_pi_ctx(ib_device, pd, desc); + if (ret) goto pi_ctx_alloc_failure; - } - pi_ctx = desc->pi_ctx; - - pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(pi_ctx->prot_frpl)) { - ret = PTR_ERR(pi_ctx->prot_frpl); - iser_err("Failed to allocate prot frpl ret=%d\n", - ret); - goto prot_frpl_failure; - } - - pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, - ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(pi_ctx->prot_mr)) { - ret = PTR_ERR(pi_ctx->prot_mr); - iser_err("Failed to allocate prot frmr ret=%d\n", - ret); - goto prot_mr_failure; - } - desc->reg_indicators |= ISER_PROT_KEY_VALID; - - mr_init_attr.max_reg_descriptors = 2; - mr_init_attr.flags |= IB_MR_SIGNATURE_EN; - pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); - if (IS_ERR(pi_ctx->sig_mr)) { - ret = PTR_ERR(pi_ctx->sig_mr); - iser_err("Failed to allocate signature enabled mr err=%d\n", - ret); - goto sig_mr_failure; - } - desc->reg_indicators |= ISER_SIG_KEY_VALID; } - desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; - - iser_dbg("Create fr_desc %p page_list %p\n", - desc, desc->data_frpl->page_list); return 0; -sig_mr_failure: - ib_dereg_mr(desc->pi_ctx->prot_mr); -prot_mr_failure: - ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); -prot_frpl_failure: - kfree(desc->pi_ctx); pi_ctx_alloc_failure: ib_dereg_mr(desc->data_mr); fast_reg_mr_failure: @@ -416,12 +428,8 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) list_del(&desc->list); ib_free_fast_reg_page_list(desc->data_frpl); ib_dereg_mr(desc->data_mr); - if (desc->pi_ctx) { - ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); - ib_dereg_mr(desc->pi_ctx->prot_mr); - ib_destroy_mr(desc->pi_ctx->sig_mr); - kfree(desc->pi_ctx); - } + if (desc->pi_ctx) + iser_free_pi_ctx(desc->pi_ctx); kfree(desc); ++i; } @@ -721,7 +729,7 @@ static void iser_connect_error(struct rdma_cm_id *cma_id) struct iser_conn *iser_conn; iser_conn = (struct iser_conn *)cma_id->context; - iser_conn->state = ISER_CONN_DOWN; + iser_conn->state = ISER_CONN_TERMINATING; } /** @@ -992,93 +1000,6 @@ connect_failure: return err; } -/** - * iser_reg_page_vec - Register physical memory - * - * returns: 0 on success, errno code on failure - */ -int iser_reg_page_vec(struct ib_conn *ib_conn, - struct iser_page_vec *page_vec, - struct iser_mem_reg *mem_reg) -{ - struct ib_pool_fmr *mem; - u64 io_addr; - u64 *page_list; - int status; - - page_list = page_vec->pages; - io_addr = page_list[0]; - - mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool, - page_list, - page_vec->length, - io_addr); - - if (IS_ERR(mem)) { - status = (int)PTR_ERR(mem); - iser_err("ib_fmr_pool_map_phys failed: %d\n", status); - return status; - } - - mem_reg->lkey = mem->fmr->lkey; - mem_reg->rkey = mem->fmr->rkey; - mem_reg->len = page_vec->length * SIZE_4K; - mem_reg->va = io_addr; - mem_reg->mem_h = (void *)mem; - - mem_reg->va += 
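The factored-out iser_alloc_pi_ctx above uses the kernel's usual goto-unwind ladder: each successful allocation gets a matching label, and a failure jumps to the label that releases everything allocated so far, in reverse order. A stripped-down sketch of the idiom (example_ctx and the alloc_/release_ helpers are placeholders):

        static int example_setup(struct example_ctx *ctx)
        {
                int ret = -ENOMEM;

                ctx->a = alloc_a();
                if (!ctx->a)
                        return -ENOMEM;

                ctx->b = alloc_b();
                if (!ctx->b)
                        goto err_a;

                ctx->c = alloc_c();
                if (!ctx->c)
                        goto err_b;

                return 0;

        err_b:
                release_b(ctx->b);      /* unwind in reverse allocation order */
        err_a:
                release_a(ctx->a);
                return ret;
        }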
page_vec->offset; - mem_reg->len = page_vec->data_size; - - iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, " - "entry[0]: (0x%08lx,%ld)] -> " - "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n", - page_vec, page_vec->length, - (unsigned long)page_vec->pages[0], - (unsigned long)page_vec->data_size, - (unsigned int)mem_reg->lkey, mem_reg->mem_h, - (unsigned long)mem_reg->va, (unsigned long)mem_reg->len); - return 0; -} - -/** - * Unregister (previosuly registered using FMR) memory. - * If memory is non-FMR does nothing. - */ -void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) -{ - struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; - int ret; - - if (!reg->mem_h) - return; - - iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); - - ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); - if (ret) - iser_err("ib_fmr_pool_unmap failed %d\n", ret); - - reg->mem_h = NULL; -} - -void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) -{ - struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; - struct iser_conn *iser_conn = iser_task->iser_conn; - struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct fast_reg_descriptor *desc = reg->mem_h; - - if (!desc) - return; - - reg->mem_h = NULL; - spin_lock_bh(&ib_conn->lock); - list_add_tail(&desc->list, &ib_conn->fastreg.pool); - spin_unlock_bh(&ib_conn->lock); -} - int iser_post_recvl(struct iser_conn *iser_conn) { struct ib_recv_wr rx_wr, *rx_wr_failed; @@ -1210,6 +1131,9 @@ iser_handle_comp_error(struct ib_conn *ib_conn, iscsi_conn_failure(iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); + if (wc->wr_id == ISER_FASTREG_LI_WRID) + return; + if (is_iser_tx_desc(iser_conn, wr_id)) { struct iser_tx_desc *desc = wr_id; @@ -1254,13 +1178,11 @@ static void iser_handle_wc(struct ib_wc *wc) else iser_dbg("flush error: wr id %llx\n", wc->wr_id); - if (wc->wr_id != ISER_FASTREG_LI_WRID && - wc->wr_id != ISER_BEACON_WRID) - iser_handle_comp_error(ib_conn, wc); - - /* complete in case all flush errors were consumed */ if (wc->wr_id == ISER_BEACON_WRID) + /* all flush errors were consumed */ complete(&ib_conn->flush_comp); + else + iser_handle_comp_error(ib_conn, wc); } } @@ -1306,7 +1228,7 @@ static void iser_cq_callback(struct ib_cq *cq, void *cq_context) u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector) { - struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; struct fast_reg_descriptor *desc = reg->mem_h; unsigned long sector_size = iser_task->sc->device->sector_size; struct ib_mr_status mr_status; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 075b19cc78e8..327529ee85eb 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -76,12 +76,12 @@ isert_prot_cmd(struct isert_conn *conn, struct se_cmd *cmd) static void isert_qp_event_callback(struct ib_event *e, void *context) { - struct isert_conn *isert_conn = (struct isert_conn *)context; + struct isert_conn *isert_conn = context; isert_err("conn %p event: %d\n", isert_conn, e->event); switch (e->event) { case IB_EVENT_COMM_EST: - rdma_notify(isert_conn->conn_cm_id, IB_EVENT_COMM_EST); + rdma_notify(isert_conn->cm_id, IB_EVENT_COMM_EST); break; case IB_EVENT_QP_LAST_WQE_REACHED: isert_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED\n"); @@ -107,13 +107,12 @@ 
isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) return 0; } -static int -isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id) +static struct isert_comp * +isert_comp_get(struct isert_conn *isert_conn) { - struct isert_device *device = isert_conn->conn_device; - struct ib_qp_init_attr attr; + struct isert_device *device = isert_conn->device; struct isert_comp *comp; - int ret, i, min = 0; + int i, min = 0; mutex_lock(&device_list_mutex); for (i = 0; i < device->comps_used; i++) @@ -122,9 +121,30 @@ isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id) min = i; comp = &device->comps[min]; comp->active_qps++; + mutex_unlock(&device_list_mutex); + isert_info("conn %p, using comp %p min_index: %d\n", isert_conn, comp, min); + + return comp; +} + +static void +isert_comp_put(struct isert_comp *comp) +{ + mutex_lock(&device_list_mutex); + comp->active_qps--; mutex_unlock(&device_list_mutex); +} + +static struct ib_qp * +isert_create_qp(struct isert_conn *isert_conn, + struct isert_comp *comp, + struct rdma_cm_id *cma_id) +{ + struct isert_device *device = isert_conn->device; + struct ib_qp_init_attr attr; + int ret; memset(&attr, 0, sizeof(struct ib_qp_init_attr)); attr.event_handler = isert_qp_event_callback; @@ -149,19 +169,31 @@ isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id) if (device->pi_capable) attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; - ret = rdma_create_qp(cma_id, isert_conn->conn_pd, &attr); + ret = rdma_create_qp(cma_id, device->pd, &attr); if (ret) { isert_err("rdma_create_qp failed for cma_id %d\n", ret); + return ERR_PTR(ret); + } + + return cma_id->qp; +} + +static int +isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id) +{ + struct isert_comp *comp; + int ret; + + comp = isert_comp_get(isert_conn); + isert_conn->qp = isert_create_qp(isert_conn, comp, cma_id); + if (IS_ERR(isert_conn->qp)) { + ret = PTR_ERR(isert_conn->qp); goto err; } - isert_conn->conn_qp = cma_id->qp; return 0; err: - mutex_lock(&device_list_mutex); - comp->active_qps--; - mutex_unlock(&device_list_mutex); - + isert_comp_put(comp); return ret; } @@ -174,18 +206,19 @@ isert_cq_event_callback(struct ib_event *e, void *context) static int isert_alloc_rx_descriptors(struct isert_conn *isert_conn) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; struct iser_rx_desc *rx_desc; struct ib_sge *rx_sg; u64 dma_addr; int i, j; - isert_conn->conn_rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS * + isert_conn->rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS * sizeof(struct iser_rx_desc), GFP_KERNEL); - if (!isert_conn->conn_rx_descs) + if (!isert_conn->rx_descs) goto fail; - rx_desc = isert_conn->conn_rx_descs; + rx_desc = isert_conn->rx_descs; for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc, @@ -198,21 +231,21 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn) rx_sg = &rx_desc->rx_sg; rx_sg->addr = rx_desc->dma_addr; rx_sg->length = ISER_RX_PAYLOAD_SIZE; - rx_sg->lkey = isert_conn->conn_mr->lkey; + rx_sg->lkey = device->mr->lkey; } - isert_conn->conn_rx_desc_head = 0; + isert_conn->rx_desc_head = 0; return 0; dma_map_fail: - rx_desc = isert_conn->conn_rx_descs; + rx_desc = isert_conn->rx_descs; for (j = 0; j < i; j++, rx_desc++) { ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, ISER_RX_PAYLOAD_SIZE, 
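
/*
 * A minimal standalone sketch of the dma_map_fail unwind in
 * isert_alloc_rx_descriptors() nearby: when mapping entry i fails,
 * exactly the successfully mapped prefix 0..i-1 is unmapped before the
 * array is freed. Names are hypothetical; malloc()/free() stand in for
 * ib_dma_map_single()/ib_dma_unmap_single().
 */
#include <stdlib.h>

static int map_all_demo(void **tbl, int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		tbl[i] = malloc(16);	/* "map" one descriptor */
		if (!tbl[i])
			goto map_fail;
	}
	return 0;

map_fail:
	for (j = 0; j < i; j++)		/* only the successful prefix */
		free(tbl[j]);		/* "unmap" it again */
	return -1;
}
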
DMA_FROM_DEVICE); } - kfree(isert_conn->conn_rx_descs); - isert_conn->conn_rx_descs = NULL; + kfree(isert_conn->rx_descs); + isert_conn->rx_descs = NULL; fail: isert_err("conn %p failed to allocate rx descriptors\n", isert_conn); @@ -222,59 +255,51 @@ fail: static void isert_free_rx_descriptors(struct isert_conn *isert_conn) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->device->ib_device; struct iser_rx_desc *rx_desc; int i; - if (!isert_conn->conn_rx_descs) + if (!isert_conn->rx_descs) return; - rx_desc = isert_conn->conn_rx_descs; + rx_desc = isert_conn->rx_descs; for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) { ib_dma_unmap_single(ib_dev, rx_desc->dma_addr, ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); } - kfree(isert_conn->conn_rx_descs); - isert_conn->conn_rx_descs = NULL; + kfree(isert_conn->rx_descs); + isert_conn->rx_descs = NULL; } static void isert_cq_work(struct work_struct *); static void isert_cq_callback(struct ib_cq *, void *); -static int -isert_create_device_ib_res(struct isert_device *device) +static void +isert_free_comps(struct isert_device *device) { - struct ib_device *ib_dev = device->ib_device; - struct ib_device_attr *dev_attr; - int ret = 0, i; - int max_cqe; - - dev_attr = &device->dev_attr; - ret = isert_query_device(ib_dev, dev_attr); - if (ret) - return ret; + int i; - max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe); + for (i = 0; i < device->comps_used; i++) { + struct isert_comp *comp = &device->comps[i]; - /* asign function handlers */ - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && - dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { - device->use_fastreg = 1; - device->reg_rdma_mem = isert_reg_rdma; - device->unreg_rdma_mem = isert_unreg_rdma; - } else { - device->use_fastreg = 0; - device->reg_rdma_mem = isert_map_rdma; - device->unreg_rdma_mem = isert_unmap_cmd; + if (comp->cq) { + cancel_work_sync(&comp->work); + ib_destroy_cq(comp->cq); + } } + kfree(device->comps); +} - /* Check signature cap */ - device->pi_capable = dev_attr->device_cap_flags & - IB_DEVICE_SIGNATURE_HANDOVER ? 
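
/*
 * A minimal standalone sketch of the sizing policy in
 * isert_alloc_comps() below: the completion-context count is bounded by
 * the driver maximum, the online CPU count and the device's completion
 * vectors, so no CQ is created that no CPU or vector could drive.
 * MAX_CQ_DEMO and the input are hypothetical stand-ins.
 */
#include <unistd.h>

#define MAX_CQ_DEMO 8

static int comps_to_use_demo(int num_comp_vectors)
{
	long cpus = sysconf(_SC_NPROCESSORS_ONLN);
	long n = num_comp_vectors < cpus ? num_comp_vectors : cpus;

	return n < MAX_CQ_DEMO ? (int)n : MAX_CQ_DEMO;
}
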
true : false; +static int +isert_alloc_comps(struct isert_device *device, + struct ib_device_attr *attr) +{ + int i, max_cqe, ret = 0; device->comps_used = min(ISERT_MAX_CQ, min_t(int, num_online_cpus(), - device->ib_device->num_comp_vectors)); + device->ib_device->num_comp_vectors)); + isert_info("Using %d CQs, %s supports %d vectors support " "Fast registration %d pi_capable %d\n", device->comps_used, device->ib_device->name, @@ -288,6 +313,8 @@ isert_create_device_ib_res(struct isert_device *device) return -ENOMEM; } + max_cqe = min(ISER_MAX_CQ_LEN, attr->max_cqe); + for (i = 0; i < device->comps_used; i++) { struct isert_comp *comp = &device->comps[i]; @@ -299,6 +326,7 @@ isert_create_device_ib_res(struct isert_device *device) (void *)comp, max_cqe, i); if (IS_ERR(comp->cq)) { + isert_err("Unable to allocate cq\n"); ret = PTR_ERR(comp->cq); comp->cq = NULL; goto out_cq; @@ -310,40 +338,79 @@ isert_create_device_ib_res(struct isert_device *device) } return 0; - out_cq: - for (i = 0; i < device->comps_used; i++) { - struct isert_comp *comp = &device->comps[i]; + isert_free_comps(device); + return ret; +} - if (comp->cq) { - cancel_work_sync(&comp->work); - ib_destroy_cq(comp->cq); - } +static int +isert_create_device_ib_res(struct isert_device *device) +{ + struct ib_device_attr *dev_attr; + int ret; + + dev_attr = &device->dev_attr; + ret = isert_query_device(device->ib_device, dev_attr); + if (ret) + return ret; + + /* asign function handlers */ + if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && + dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { + device->use_fastreg = 1; + device->reg_rdma_mem = isert_reg_rdma; + device->unreg_rdma_mem = isert_unreg_rdma; + } else { + device->use_fastreg = 0; + device->reg_rdma_mem = isert_map_rdma; + device->unreg_rdma_mem = isert_unmap_cmd; } - kfree(device->comps); + ret = isert_alloc_comps(device, dev_attr); + if (ret) + return ret; + + device->pd = ib_alloc_pd(device->ib_device); + if (IS_ERR(device->pd)) { + ret = PTR_ERR(device->pd); + isert_err("failed to allocate pd, device %p, ret=%d\n", + device, ret); + goto out_cq; + } + + device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(device->mr)) { + ret = PTR_ERR(device->mr); + isert_err("failed to create dma mr, device %p, ret=%d\n", + device, ret); + goto out_mr; + } + + /* Check signature cap */ + device->pi_capable = dev_attr->device_cap_flags & + IB_DEVICE_SIGNATURE_HANDOVER ? 
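
/*
 * A minimal standalone sketch of the capability snapshot done in
 * isert_create_device_ib_res() around here: device capability bits are
 * tested once at setup and latched into driver booleans, with fast
 * registration enabled only when both memory-management extensions and
 * signature handover are advertised. The flag values below are
 * hypothetical stand-ins for the IB_DEVICE_* bits.
 */
#include <stdbool.h>
#include <stdint.h>

#define CAP_MEM_MGT_EXT_DEMO  (1u << 0)
#define CAP_SIG_HANDOVER_DEMO (1u << 1)

struct caps_demo {
	bool use_fastreg;
	bool pi_capable;
};

static void snapshot_caps_demo(uint32_t cap_flags, struct caps_demo *c)
{
	c->use_fastreg = (cap_flags & CAP_MEM_MGT_EXT_DEMO) &&
			 (cap_flags & CAP_SIG_HANDOVER_DEMO);
	c->pi_capable  = cap_flags & CAP_SIG_HANDOVER_DEMO;
}
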
true : false; + + return 0; + +out_mr: + ib_dealloc_pd(device->pd); +out_cq: + isert_free_comps(device); return ret; } static void isert_free_device_ib_res(struct isert_device *device) { - int i; - isert_info("device %p\n", device); - for (i = 0; i < device->comps_used; i++) { - struct isert_comp *comp = &device->comps[i]; - - cancel_work_sync(&comp->work); - ib_destroy_cq(comp->cq); - comp->cq = NULL; - } - kfree(device->comps); + ib_dereg_mr(device->mr); + ib_dealloc_pd(device->pd); + isert_free_comps(device); } static void -isert_device_try_release(struct isert_device *device) +isert_device_put(struct isert_device *device) { mutex_lock(&device_list_mutex); device->refcount--; @@ -357,7 +424,7 @@ isert_device_try_release(struct isert_device *device) } static struct isert_device * -isert_device_find_by_ib_dev(struct rdma_cm_id *cma_id) +isert_device_get(struct rdma_cm_id *cma_id) { struct isert_device *device; int ret; @@ -404,13 +471,13 @@ isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) struct fast_reg_descriptor *fr_desc, *tmp; int i = 0; - if (list_empty(&isert_conn->conn_fr_pool)) + if (list_empty(&isert_conn->fr_pool)) return; isert_info("Freeing conn %p fastreg pool", isert_conn); list_for_each_entry_safe(fr_desc, tmp, - &isert_conn->conn_fr_pool, list) { + &isert_conn->fr_pool, list) { list_del(&fr_desc->list); ib_free_fast_reg_page_list(fr_desc->data_frpl); ib_dereg_mr(fr_desc->data_mr); @@ -424,9 +491,9 @@ isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) ++i; } - if (i < isert_conn->conn_fr_pool_size) + if (i < isert_conn->fr_pool_size) isert_warn("Pool still has %d regions registered\n", - isert_conn->conn_fr_pool_size - i); + isert_conn->fr_pool_size - i); } static int @@ -526,7 +593,7 @@ static int isert_conn_create_fastreg_pool(struct isert_conn *isert_conn) { struct fast_reg_descriptor *fr_desc; - struct isert_device *device = isert_conn->conn_device; + struct isert_device *device = isert_conn->device; struct se_session *se_sess = isert_conn->conn->sess->se_sess; struct se_node_acl *se_nacl = se_sess->se_node_acl; int i, ret, tag_num; @@ -537,7 +604,7 @@ isert_conn_create_fastreg_pool(struct isert_conn *isert_conn) tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; - isert_conn->conn_fr_pool_size = 0; + isert_conn->fr_pool_size = 0; for (i = 0; i < tag_num; i++) { fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); if (!fr_desc) { @@ -547,7 +614,7 @@ isert_conn_create_fastreg_pool(struct isert_conn *isert_conn) } ret = isert_create_fr_desc(device->ib_device, - isert_conn->conn_pd, fr_desc); + device->pd, fr_desc); if (ret) { isert_err("Failed to create fastreg descriptor err=%d\n", ret); @@ -555,12 +622,12 @@ isert_conn_create_fastreg_pool(struct isert_conn *isert_conn) goto err; } - list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); - isert_conn->conn_fr_pool_size++; + list_add_tail(&fr_desc->list, &isert_conn->fr_pool); + isert_conn->fr_pool_size++; } isert_dbg("Creating conn %p fastreg pool size=%d", - isert_conn, isert_conn->conn_fr_pool_size); + isert_conn, isert_conn->fr_pool_size); return 0; @@ -569,55 +636,50 @@ err: return ret; } -static int -isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +static void +isert_init_conn(struct isert_conn *isert_conn) { - struct isert_np *isert_np = cma_id->context; - struct iscsi_np *np = isert_np->np; - struct isert_conn *isert_conn; - struct isert_device *device; - struct ib_device *ib_dev = cma_id->device; - int ret = 0; 
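
/*
 * A minimal standalone sketch of the get/put discipline behind
 * isert_device_get()/isert_device_put() above, reduced to one cached
 * device instead of the driver's device list. A global lock guards the
 * refcount and the final put releases the object. All names here
 * (demo_device, demo_lock) are hypothetical.
 */
#include <pthread.h>
#include <stdlib.h>

struct demo_device {
	int refcount;
};

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

static struct demo_device *demo_device_get(struct demo_device **cache)
{
	struct demo_device *dev;

	pthread_mutex_lock(&demo_lock);
	if (!*cache)
		*cache = calloc(1, sizeof(**cache));
	dev = *cache;
	if (dev)
		dev->refcount++;	/* found or freshly created */
	pthread_mutex_unlock(&demo_lock);
	return dev;
}

static void demo_device_put(struct demo_device **cache)
{
	pthread_mutex_lock(&demo_lock);
	if (--(*cache)->refcount == 0) {	/* last user frees */
		free(*cache);
		*cache = NULL;
	}
	pthread_mutex_unlock(&demo_lock);
}
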
- - spin_lock_bh(&np->np_thread_lock); - if (!np->enabled) { - spin_unlock_bh(&np->np_thread_lock); - isert_dbg("iscsi_np is not enabled, reject connect request\n"); - return rdma_reject(cma_id, NULL, 0); - } - spin_unlock_bh(&np->np_thread_lock); - - isert_dbg("cma_id: %p, portal: %p\n", - cma_id, cma_id->context); - - isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); - if (!isert_conn) { - isert_err("Unable to allocate isert_conn\n"); - return -ENOMEM; - } isert_conn->state = ISER_CONN_INIT; - INIT_LIST_HEAD(&isert_conn->conn_accept_node); - init_completion(&isert_conn->conn_login_comp); + INIT_LIST_HEAD(&isert_conn->accept_node); + init_completion(&isert_conn->login_comp); init_completion(&isert_conn->login_req_comp); - init_completion(&isert_conn->conn_wait); - kref_init(&isert_conn->conn_kref); - mutex_init(&isert_conn->conn_mutex); - spin_lock_init(&isert_conn->conn_lock); - INIT_LIST_HEAD(&isert_conn->conn_fr_pool); + init_completion(&isert_conn->wait); + kref_init(&isert_conn->kref); + mutex_init(&isert_conn->mutex); + spin_lock_init(&isert_conn->pool_lock); + INIT_LIST_HEAD(&isert_conn->fr_pool); +} + +static void +isert_free_login_buf(struct isert_conn *isert_conn) +{ + struct ib_device *ib_dev = isert_conn->device->ib_device; - isert_conn->conn_cm_id = cma_id; + ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_FROM_DEVICE); + kfree(isert_conn->login_buf); +} + +static int +isert_alloc_login_buf(struct isert_conn *isert_conn, + struct ib_device *ib_dev) +{ + int ret; isert_conn->login_buf = kzalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + ISER_RX_LOGIN_SIZE, GFP_KERNEL); if (!isert_conn->login_buf) { isert_err("Unable to allocate isert_conn->login_buf\n"); - ret = -ENOMEM; - goto out; + return -ENOMEM; } isert_conn->login_req_buf = isert_conn->login_buf; isert_conn->login_rsp_buf = isert_conn->login_buf + ISCSI_DEF_MAX_RECV_SEG_LEN; + isert_dbg("Set login_buf: %p login_req_buf: %p login_rsp_buf: %p\n", isert_conn->login_buf, isert_conn->login_req_buf, isert_conn->login_rsp_buf); @@ -628,8 +690,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma); if (ret) { - isert_err("ib_dma_mapping_error failed for login_req_dma: %d\n", - ret); + isert_err("login_req_dma mapping error: %d\n", ret); isert_conn->login_req_dma = 0; goto out_login_buf; } @@ -640,17 +701,58 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma); if (ret) { - isert_err("ib_dma_mapping_error failed for login_rsp_dma: %d\n", - ret); + isert_err("login_rsp_dma mapping error: %d\n", ret); isert_conn->login_rsp_dma = 0; goto out_req_dma_map; } - device = isert_device_find_by_ib_dev(cma_id); + return 0; + +out_req_dma_map: + ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); +out_login_buf: + kfree(isert_conn->login_buf); + return ret; +} + +static int +isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + struct isert_np *isert_np = cma_id->context; + struct iscsi_np *np = isert_np->np; + struct isert_conn *isert_conn; + struct isert_device *device; + int ret = 0; + + spin_lock_bh(&np->np_thread_lock); + if (!np->enabled) { + spin_unlock_bh(&np->np_thread_lock); + isert_dbg("iscsi_np is not enabled, reject connect request\n"); + 
return rdma_reject(cma_id, NULL, 0); + } + spin_unlock_bh(&np->np_thread_lock); + + isert_dbg("cma_id: %p, portal: %p\n", + cma_id, cma_id->context); + + isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL); + if (!isert_conn) + return -ENOMEM; + + isert_init_conn(isert_conn); + isert_conn->cm_id = cma_id; + + ret = isert_alloc_login_buf(isert_conn, cma_id->device); + if (ret) + goto out; + + device = isert_device_get(cma_id); if (IS_ERR(device)) { ret = PTR_ERR(device); goto out_rsp_dma_map; } + isert_conn->device = device; /* Set max inflight RDMA READ requests */ isert_conn->initiator_depth = min_t(u8, @@ -658,24 +760,6 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) device->dev_attr.max_qp_init_rd_atom); isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth); - isert_conn->conn_device = device; - isert_conn->conn_pd = ib_alloc_pd(isert_conn->conn_device->ib_device); - if (IS_ERR(isert_conn->conn_pd)) { - ret = PTR_ERR(isert_conn->conn_pd); - isert_err("ib_alloc_pd failed for conn %p: ret=%d\n", - isert_conn, ret); - goto out_pd; - } - - isert_conn->conn_mr = ib_get_dma_mr(isert_conn->conn_pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(isert_conn->conn_mr)) { - ret = PTR_ERR(isert_conn->conn_mr); - isert_err("ib_get_dma_mr failed for conn %p: ret=%d\n", - isert_conn, ret); - goto out_mr; - } - ret = isert_conn_setup_qp(isert_conn, cma_id); if (ret) goto out_conn_dev; @@ -689,7 +773,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) goto out_conn_dev; mutex_lock(&isert_np->np_accept_mutex); - list_add_tail(&isert_conn->conn_accept_node, &isert_np->np_accept_list); + list_add_tail(&isert_conn->accept_node, &isert_np->np_accept_list); mutex_unlock(&isert_np->np_accept_mutex); isert_info("np %p: Allow accept_np to continue\n", np); @@ -697,19 +781,9 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) return 0; out_conn_dev: - ib_dereg_mr(isert_conn->conn_mr); -out_mr: - ib_dealloc_pd(isert_conn->conn_pd); -out_pd: - isert_device_try_release(device); + isert_device_put(device); out_rsp_dma_map: - ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, - ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); -out_req_dma_map: - ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, - ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE); -out_login_buf: - kfree(isert_conn->login_buf); + isert_free_login_buf(isert_conn); out: kfree(isert_conn); rdma_reject(cma_id, NULL, 0); @@ -719,43 +793,32 @@ out: static void isert_connect_release(struct isert_conn *isert_conn) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; - struct isert_device *device = isert_conn->conn_device; + struct isert_device *device = isert_conn->device; isert_dbg("conn %p\n", isert_conn); - if (device && device->use_fastreg) + BUG_ON(!device); + + if (device->use_fastreg) isert_conn_free_fastreg_pool(isert_conn); isert_free_rx_descriptors(isert_conn); - rdma_destroy_id(isert_conn->conn_cm_id); + if (isert_conn->cm_id) + rdma_destroy_id(isert_conn->cm_id); - if (isert_conn->conn_qp) { - struct isert_comp *comp = isert_conn->conn_qp->recv_cq->cq_context; + if (isert_conn->qp) { + struct isert_comp *comp = isert_conn->qp->recv_cq->cq_context; - isert_dbg("dec completion context %p active_qps\n", comp); - mutex_lock(&device_list_mutex); - comp->active_qps--; - mutex_unlock(&device_list_mutex); - - ib_destroy_qp(isert_conn->conn_qp); + isert_comp_put(comp); + ib_destroy_qp(isert_conn->qp); } - ib_dereg_mr(isert_conn->conn_mr); - 
ib_dealloc_pd(isert_conn->conn_pd); + if (isert_conn->login_buf) + isert_free_login_buf(isert_conn); - if (isert_conn->login_buf) { - ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma, - ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE); - ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma, - ISCSI_DEF_MAX_RECV_SEG_LEN, - DMA_FROM_DEVICE); - kfree(isert_conn->login_buf); - } - kfree(isert_conn); + isert_device_put(device); - if (device) - isert_device_try_release(device); + kfree(isert_conn); } static void @@ -765,22 +828,22 @@ isert_connected_handler(struct rdma_cm_id *cma_id) isert_info("conn %p\n", isert_conn); - if (!kref_get_unless_zero(&isert_conn->conn_kref)) { + if (!kref_get_unless_zero(&isert_conn->kref)) { isert_warn("conn %p connect_release is running\n", isert_conn); return; } - mutex_lock(&isert_conn->conn_mutex); + mutex_lock(&isert_conn->mutex); if (isert_conn->state != ISER_CONN_FULL_FEATURE) isert_conn->state = ISER_CONN_UP; - mutex_unlock(&isert_conn->conn_mutex); + mutex_unlock(&isert_conn->mutex); } static void -isert_release_conn_kref(struct kref *kref) +isert_release_kref(struct kref *kref) { struct isert_conn *isert_conn = container_of(kref, - struct isert_conn, conn_kref); + struct isert_conn, kref); isert_info("conn %p final kref %s/%d\n", isert_conn, current->comm, current->pid); @@ -791,7 +854,7 @@ isert_release_conn_kref(struct kref *kref) static void isert_put_conn(struct isert_conn *isert_conn) { - kref_put(&isert_conn->conn_kref, isert_release_conn_kref); + kref_put(&isert_conn->kref, isert_release_kref); } /** @@ -803,7 +866,7 @@ isert_put_conn(struct isert_conn *isert_conn) * to TEMINATING and start teardown sequence (rdma_disconnect). * In case the connection state is UP, complete flush as well. * - * This routine must be called with conn_mutex held. Thus it is + * This routine must be called with mutex held. Thus it is * safe to call multiple times. 
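* (Hedged note: the idempotency rests on the state test at the top of
* the routine; once the state has moved to TERMINATING, a repeated call
* observes that state and returns without issuing rdma_disconnect()
* again.)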
*/ static void @@ -819,7 +882,7 @@ isert_conn_terminate(struct isert_conn *isert_conn) isert_info("Terminating conn %p state %d\n", isert_conn, isert_conn->state); isert_conn->state = ISER_CONN_TERMINATING; - err = rdma_disconnect(isert_conn->conn_cm_id); + err = rdma_disconnect(isert_conn->cm_id); if (err) isert_warn("Failed rdma_disconnect isert_conn %p\n", isert_conn); @@ -868,22 +931,25 @@ isert_disconnected_handler(struct rdma_cm_id *cma_id, isert_conn = cma_id->qp->qp_context; - mutex_lock(&isert_conn->conn_mutex); + mutex_lock(&isert_conn->mutex); isert_conn_terminate(isert_conn); - mutex_unlock(&isert_conn->conn_mutex); + mutex_unlock(&isert_conn->mutex); - isert_info("conn %p completing conn_wait\n", isert_conn); - complete(&isert_conn->conn_wait); + isert_info("conn %p completing wait\n", isert_conn); + complete(&isert_conn->wait); return 0; } -static void +static int isert_connect_error(struct rdma_cm_id *cma_id) { struct isert_conn *isert_conn = cma_id->qp->qp_context; + isert_conn->cm_id = NULL; isert_put_conn(isert_conn); + + return -1; } static int @@ -912,7 +978,7 @@ isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) case RDMA_CM_EVENT_REJECTED: /* FALLTHRU */ case RDMA_CM_EVENT_UNREACHABLE: /* FALLTHRU */ case RDMA_CM_EVENT_CONNECT_ERROR: - isert_connect_error(cma_id); + ret = isert_connect_error(cma_id); break; default: isert_err("Unhandled RDMA CMA event: %d\n", event->event); @@ -927,11 +993,11 @@ isert_post_recv(struct isert_conn *isert_conn, u32 count) { struct ib_recv_wr *rx_wr, *rx_wr_failed; int i, ret; - unsigned int rx_head = isert_conn->conn_rx_desc_head; + unsigned int rx_head = isert_conn->rx_desc_head; struct iser_rx_desc *rx_desc; - for (rx_wr = isert_conn->conn_rx_wr, i = 0; i < count; i++, rx_wr++) { - rx_desc = &isert_conn->conn_rx_descs[rx_head]; + for (rx_wr = isert_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { + rx_desc = &isert_conn->rx_descs[rx_head]; rx_wr->wr_id = (uintptr_t)rx_desc; rx_wr->sg_list = &rx_desc->rx_sg; rx_wr->num_sge = 1; @@ -943,14 +1009,14 @@ isert_post_recv(struct isert_conn *isert_conn, u32 count) rx_wr->next = NULL; /* mark end of work requests list */ isert_conn->post_recv_buf_count += count; - ret = ib_post_recv(isert_conn->conn_qp, isert_conn->conn_rx_wr, + ret = ib_post_recv(isert_conn->qp, isert_conn->rx_wr, &rx_wr_failed); if (ret) { isert_err("ib_post_recv() failed with ret: %d\n", ret); isert_conn->post_recv_buf_count -= count; } else { isert_dbg("Posted %d RX buffers\n", count); - isert_conn->conn_rx_desc_head = rx_head; + isert_conn->rx_desc_head = rx_head; } return ret; } @@ -958,7 +1024,7 @@ isert_post_recv(struct isert_conn *isert_conn, u32 count) static int isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->cm_id->device; struct ib_send_wr send_wr, *send_wr_failed; int ret; @@ -972,7 +1038,7 @@ isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - ret = ib_post_send(isert_conn->conn_qp, &send_wr, &send_wr_failed); + ret = ib_post_send(isert_conn->qp, &send_wr, &send_wr_failed); if (ret) isert_err("ib_post_send() failed, ret: %d\n", ret); @@ -984,7 +1050,8 @@ isert_create_send_desc(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, struct iser_tx_desc *tx_desc) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = 
isert_conn->device; + struct ib_device *ib_dev = device->ib_device; ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); @@ -995,8 +1062,8 @@ isert_create_send_desc(struct isert_conn *isert_conn, tx_desc->num_sge = 1; tx_desc->isert_cmd = isert_cmd; - if (tx_desc->tx_sg[0].lkey != isert_conn->conn_mr->lkey) { - tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { + tx_desc->tx_sg[0].lkey = device->mr->lkey; isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc); } } @@ -1005,7 +1072,8 @@ static int isert_init_tx_hdrs(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; u64 dma_addr; dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc, @@ -1018,7 +1086,7 @@ isert_init_tx_hdrs(struct isert_conn *isert_conn, tx_desc->dma_addr = dma_addr; tx_desc->tx_sg[0].addr = tx_desc->dma_addr; tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; - tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey; + tx_desc->tx_sg[0].lkey = device->mr->lkey; isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n", tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length, @@ -1051,7 +1119,7 @@ isert_rdma_post_recvl(struct isert_conn *isert_conn) memset(&sge, 0, sizeof(struct ib_sge)); sge.addr = isert_conn->login_req_dma; sge.length = ISER_RX_LOGIN_SIZE; - sge.lkey = isert_conn->conn_mr->lkey; + sge.lkey = isert_conn->device->mr->lkey; isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n", sge.addr, sge.length, sge.lkey); @@ -1062,7 +1130,7 @@ isert_rdma_post_recvl(struct isert_conn *isert_conn) rx_wr.num_sge = 1; isert_conn->post_recv_buf_count++; - ret = ib_post_recv(isert_conn->conn_qp, &rx_wr, &rx_wr_fail); + ret = ib_post_recv(isert_conn->qp, &rx_wr, &rx_wr_fail); if (ret) { isert_err("ib_post_recv() failed: %d\n", ret); isert_conn->post_recv_buf_count--; @@ -1076,8 +1144,9 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, u32 length) { struct isert_conn *isert_conn = conn->context; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; - struct iser_tx_desc *tx_desc = &isert_conn->conn_login_tx_desc; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + struct iser_tx_desc *tx_desc = &isert_conn->login_tx_desc; int ret; isert_create_send_desc(isert_conn, NULL, tx_desc); @@ -1100,13 +1169,13 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, tx_dsg->addr = isert_conn->login_rsp_dma; tx_dsg->length = length; - tx_dsg->lkey = isert_conn->conn_mr->lkey; + tx_dsg->lkey = isert_conn->device->mr->lkey; tx_desc->num_sge = 2; } if (!login->login_failed) { if (login->login_complete) { if (!conn->sess->sess_ops->SessionType && - isert_conn->conn_device->use_fastreg) { + isert_conn->device->use_fastreg) { ret = isert_conn_create_fastreg_pool(isert_conn); if (ret) { isert_err("Conn: %p failed to create" @@ -1124,9 +1193,9 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, return ret; /* Now we are in FULL_FEATURE phase */ - mutex_lock(&isert_conn->conn_mutex); + mutex_lock(&isert_conn->mutex); isert_conn->state = ISER_CONN_FULL_FEATURE; - mutex_unlock(&isert_conn->conn_mutex); + mutex_unlock(&isert_conn->mutex); goto post_send; } @@ -1185,7 +1254,7 @@ isert_rx_login_req(struct isert_conn *isert_conn) memcpy(login->req_buf, &rx_desc->data[0], size); if 
(login->first_request) { - complete(&isert_conn->conn_login_comp); + complete(&isert_conn->login_comp); return; } schedule_delayed_work(&conn->login_work, 0); @@ -1194,7 +1263,7 @@ isert_rx_login_req(struct isert_conn *isert_conn) static struct iscsi_cmd *isert_allocate_cmd(struct iscsi_conn *conn) { - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct isert_cmd *isert_cmd; struct iscsi_cmd *cmd; @@ -1379,13 +1448,12 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, { struct iscsi_hdr *hdr = &rx_desc->iscsi_header; struct iscsi_conn *conn = isert_conn->conn; - struct iscsi_session *sess = conn->sess; struct iscsi_cmd *cmd; struct isert_cmd *isert_cmd; int ret = -EINVAL; u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK); - if (sess->sess_ops->SessionType && + if (conn->sess->sess_ops->SessionType && (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) { isert_err("Got illegal opcode: 0x%02x in SessionType=Discovery," " ignoring\n", opcode); @@ -1497,10 +1565,11 @@ isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) } static void -isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, - u32 xfer_len) +isert_rcv_completion(struct iser_rx_desc *desc, + struct isert_conn *isert_conn, + u32 xfer_len) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->cm_id->device; struct iscsi_hdr *hdr; u64 rx_dma; int rx_buflen, outstanding; @@ -1532,9 +1601,9 @@ isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, if (login && !login->first_request) isert_rx_login_req(isert_conn); } - mutex_lock(&isert_conn->conn_mutex); + mutex_lock(&isert_conn->mutex); complete(&isert_conn->login_req_comp); - mutex_unlock(&isert_conn->conn_mutex); + mutex_unlock(&isert_conn->mutex); } else { isert_rx_do_work(desc, isert_conn); } @@ -1566,7 +1635,7 @@ isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, struct scatterlist *sg, u32 nents, u32 length, u32 offset, enum iser_ib_op_code op, struct isert_data_buf *data) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->cm_id->device; data->dma_dir = op == ISER_IB_RDMA_WRITE ? 
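
/*
 * A minimal standalone sketch of the SessionType screen in
 * isert_rx_opcode() above, which intends that a discovery session only
 * ever carries text and logout PDUs. Restated as a direct check; the
 * numeric values mirror the iSCSI opcodes but are rewritten here as
 * hypothetical demo constants.
 */
#include <stdbool.h>
#include <stdint.h>

#define OPCODE_MASK_DEMO 0x3f
#define OP_TEXT_DEMO     0x04
#define OP_LOGOUT_DEMO   0x06

static bool discovery_opcode_ok_demo(uint8_t hdr_opcode)
{
	uint8_t opcode = hdr_opcode & OPCODE_MASK_DEMO;

	return opcode == OP_TEXT_DEMO || opcode == OP_LOGOUT_DEMO;
}
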
DMA_TO_DEVICE : DMA_FROM_DEVICE; @@ -1597,7 +1666,7 @@ isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, static void isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->cm_id->device; ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); memset(data, 0, sizeof(*data)); @@ -1634,7 +1703,6 @@ static void isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) { struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; - LIST_HEAD(unmap_list); isert_dbg("Cmd %p\n", isert_cmd); @@ -1644,9 +1712,9 @@ isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) isert_unmap_data_buf(isert_conn, &wr->prot); wr->fr_desc->ind &= ~ISERT_PROTECTED; } - spin_lock_bh(&isert_conn->conn_lock); - list_add_tail(&wr->fr_desc->list, &isert_conn->conn_fr_pool); - spin_unlock_bh(&isert_conn->conn_lock); + spin_lock_bh(&isert_conn->pool_lock); + list_add_tail(&wr->fr_desc->list, &isert_conn->fr_pool); + spin_unlock_bh(&isert_conn->pool_lock); wr->fr_desc = NULL; } @@ -1665,7 +1733,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct isert_conn *isert_conn = isert_cmd->conn; struct iscsi_conn *conn = isert_conn->conn; - struct isert_device *device = isert_conn->conn_device; + struct isert_device *device = isert_conn->device; struct iscsi_text_rsp *hdr; isert_dbg("Cmd %p\n", isert_cmd); @@ -1815,7 +1883,7 @@ isert_completion_rdma_write(struct iser_tx_desc *tx_desc, struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_conn *isert_conn = isert_cmd->conn; - struct isert_device *device = isert_conn->conn_device; + struct isert_device *device = isert_conn->device; int ret = 0; if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { @@ -1841,7 +1909,7 @@ isert_completion_rdma_read(struct iser_tx_desc *tx_desc, struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_conn *isert_conn = isert_cmd->conn; - struct isert_device *device = isert_conn->conn_device; + struct isert_device *device = isert_conn->device; int ret = 0; if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) { @@ -1861,11 +1929,13 @@ isert_completion_rdma_read(struct iser_tx_desc *tx_desc, cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT; spin_unlock_bh(&cmd->istate_lock); - if (ret) + if (ret) { + target_put_sess_cmd(se_cmd->se_sess, se_cmd); transport_send_check_condition_and_sense(se_cmd, se_cmd->pi_err, 0); - else + } else { target_execute_cmd(se_cmd); + } } static void @@ -1874,7 +1944,7 @@ isert_do_control_comp(struct work_struct *work) struct isert_cmd *isert_cmd = container_of(work, struct isert_cmd, comp_work); struct isert_conn *isert_conn = isert_cmd->conn; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->cm_id->device; struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; isert_dbg("Cmd %p i_state %d\n", isert_cmd, cmd->i_state); @@ -1922,10 +1992,10 @@ isert_response_completion(struct iser_tx_desc *tx_desc, } static void -isert_send_completion(struct iser_tx_desc *tx_desc, +isert_snd_completion(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->cm_id->device; struct isert_cmd *isert_cmd = tx_desc->isert_cmd; struct isert_rdma_wr *wr; @@ -1938,10 
+2008,6 @@ isert_send_completion(struct iser_tx_desc *tx_desc, isert_dbg("Cmd %p iser_ib_op %d\n", isert_cmd, wr->iser_ib_op); switch (wr->iser_ib_op) { - case ISER_IB_RECV: - isert_err("Got ISER_IB_RECV\n"); - dump_stack(); - break; case ISER_IB_SEND: isert_response_completion(tx_desc, isert_cmd, isert_conn, ib_dev); @@ -1973,8 +2039,8 @@ isert_send_completion(struct iser_tx_desc *tx_desc, static inline bool is_isert_tx_desc(struct isert_conn *isert_conn, void *wr_id) { - void *start = isert_conn->conn_rx_descs; - int len = ISERT_QP_MAX_RECV_DTOS * sizeof(*isert_conn->conn_rx_descs); + void *start = isert_conn->rx_descs; + int len = ISERT_QP_MAX_RECV_DTOS * sizeof(*isert_conn->rx_descs); if (wr_id >= start && wr_id < start + len) return false; @@ -1986,11 +2052,11 @@ static void isert_cq_comp_err(struct isert_conn *isert_conn, struct ib_wc *wc) { if (wc->wr_id == ISER_BEACON_WRID) { - isert_info("conn %p completing conn_wait_comp_err\n", + isert_info("conn %p completing wait_comp_err\n", isert_conn); - complete(&isert_conn->conn_wait_comp_err); + complete(&isert_conn->wait_comp_err); } else if (is_isert_tx_desc(isert_conn, (void *)(uintptr_t)wc->wr_id)) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct ib_device *ib_dev = isert_conn->cm_id->device; struct isert_cmd *isert_cmd; struct iser_tx_desc *desc; @@ -2018,10 +2084,10 @@ isert_handle_wc(struct ib_wc *wc) if (likely(wc->status == IB_WC_SUCCESS)) { if (wc->opcode == IB_WC_RECV) { rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id; - isert_rx_completion(rx_desc, isert_conn, wc->byte_len); + isert_rcv_completion(rx_desc, isert_conn, wc->byte_len); } else { tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id; - isert_send_completion(tx_desc, isert_conn); + isert_snd_completion(tx_desc, isert_conn); } } else { if (wc->status != IB_WC_WR_FLUSH_ERR) @@ -2070,7 +2136,7 @@ isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) struct ib_send_wr *wr_failed; int ret; - ret = ib_post_send(isert_conn->conn_qp, &isert_cmd->tx_desc.send_wr, + ret = ib_post_send(isert_conn->qp, &isert_cmd->tx_desc.send_wr, &wr_failed); if (ret) { isert_err("ib_post_send failed with %d\n", ret); @@ -2083,7 +2149,7 @@ static int isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *) &isert_cmd->tx_desc.iscsi_header; @@ -2097,7 +2163,8 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) if (cmd->se_cmd.sense_buffer && ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; u32 padding, pdu_len; @@ -2116,7 +2183,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_cmd->pdu_buf_len = pdu_len; tx_dsg->addr = isert_cmd->pdu_buf_dma; tx_dsg->length = pdu_len; - tx_dsg->lkey = isert_conn->conn_mr->lkey; + tx_dsg->lkey = device->mr->lkey; isert_cmd->tx_desc.num_sge = 2; } @@ -2131,8 +2198,8 @@ static void isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) { struct isert_cmd *isert_cmd = 
iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; - struct isert_device *device = isert_conn->conn_device; + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; spin_lock_bh(&conn->cmd_lock); if (!list_empty(&cmd->i_conn_node)) @@ -2148,8 +2215,8 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) static enum target_prot_op isert_get_sup_prot_ops(struct iscsi_conn *conn) { - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; - struct isert_device *device = isert_conn->conn_device; + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; if (conn->tpg->tpg_attrib.t10_pi) { if (device->pi_capable) { @@ -2170,7 +2237,7 @@ isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn, bool nopout_response) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); @@ -2189,7 +2256,7 @@ static int isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); @@ -2207,7 +2274,7 @@ static int isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc); @@ -2225,9 +2292,10 @@ static int isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; struct iscsi_reject *hdr = (struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header; @@ -2243,7 +2311,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn) isert_cmd->pdu_buf_len = ISCSI_HDR_LEN; tx_dsg->addr = isert_cmd->pdu_buf_dma; tx_dsg->length = ISCSI_HDR_LEN; - tx_dsg->lkey = isert_conn->conn_mr->lkey; + tx_dsg->lkey = device->mr->lkey; isert_cmd->tx_desc.num_sge = 2; isert_init_send_wr(isert_conn, isert_cmd, send_wr); @@ -2257,7 +2325,7 @@ static int isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr; struct iscsi_text_rsp *hdr = (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header; @@ -2273,7 +2341,8 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); if (txt_rsp_len) { - 
struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1]; void *txt_rsp_buf = cmd->buf_ptr; @@ -2283,7 +2352,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) isert_cmd->pdu_buf_len = txt_rsp_len; tx_dsg->addr = isert_cmd->pdu_buf_dma; tx_dsg->length = txt_rsp_len; - tx_dsg->lkey = isert_conn->conn_mr->lkey; + tx_dsg->lkey = device->mr->lkey; isert_cmd->tx_desc.num_sge = 2; } isert_init_send_wr(isert_conn, isert_cmd, send_wr); @@ -2300,7 +2369,8 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, { struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct scatterlist *sg_start, *tmp_sg; - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; u32 sg_off, page_off; int i = 0, sg_nents; @@ -2324,7 +2394,7 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; ib_sge->length = min_t(u32, data_left, ib_sg_dma_len(ib_dev, tmp_sg) - page_off); - ib_sge->lkey = isert_conn->conn_mr->lkey; + ib_sge->lkey = device->mr->lkey; isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n", ib_sge->addr, ib_sge->length, ib_sge->lkey); @@ -2346,7 +2416,7 @@ isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, { struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; struct isert_data_buf *data = &wr->data; struct ib_send_wr *send_wr; struct ib_sge *ib_sge; @@ -2485,7 +2555,8 @@ isert_fast_reg_mr(struct isert_conn *isert_conn, enum isert_indicator ind, struct ib_sge *sge) { - struct ib_device *ib_dev = isert_conn->conn_cm_id->device; + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; struct ib_mr *mr; struct ib_fast_reg_page_list *frpl; struct ib_send_wr fr_wr, inv_wr; @@ -2494,7 +2565,7 @@ isert_fast_reg_mr(struct isert_conn *isert_conn, u32 page_off; if (mem->dma_nents == 1) { - sge->lkey = isert_conn->conn_mr->lkey; + sge->lkey = device->mr->lkey; sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", @@ -2542,7 +2613,7 @@ isert_fast_reg_mr(struct isert_conn *isert_conn, else wr->next = &fr_wr; - ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + ret = ib_post_send(isert_conn->qp, wr, &bad_wr); if (ret) { isert_err("fast registration failed, ret:%d\n", ret); return ret; @@ -2655,7 +2726,7 @@ isert_reg_sig_mr(struct isert_conn *isert_conn, else wr->next = &sig_wr; - ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr); + ret = ib_post_send(isert_conn->qp, wr, &bad_wr); if (ret) { isert_err("fast registration failed, ret:%d\n", ret); goto err; @@ -2685,14 +2756,14 @@ isert_handle_prot_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, struct isert_rdma_wr *wr) { - struct isert_device *device = isert_conn->conn_device; + struct isert_device *device = isert_conn->device; struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd; int ret; if (!wr->fr_desc->pi_ctx) { ret = isert_create_pi_ctx(wr->fr_desc, device->ib_device, - isert_conn->conn_pd); + device->pd); if (ret) { isert_err("conn %p failed to 
allocate pi_ctx\n", isert_conn); @@ -2763,11 +2834,11 @@ isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, return ret; if (wr->data.dma_nents != 1 || isert_prot_cmd(isert_conn, se_cmd)) { - spin_lock_irqsave(&isert_conn->conn_lock, flags); - fr_desc = list_first_entry(&isert_conn->conn_fr_pool, + spin_lock_irqsave(&isert_conn->pool_lock, flags); + fr_desc = list_first_entry(&isert_conn->fr_pool, struct fast_reg_descriptor, list); list_del(&fr_desc->list); - spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + spin_unlock_irqrestore(&isert_conn->pool_lock, flags); wr->fr_desc = fr_desc; } @@ -2814,9 +2885,9 @@ isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd, unmap_cmd: if (fr_desc) { - spin_lock_irqsave(&isert_conn->conn_lock, flags); - list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool); - spin_unlock_irqrestore(&isert_conn->conn_lock, flags); + spin_lock_irqsave(&isert_conn->pool_lock, flags); + list_add_tail(&fr_desc->list, &isert_conn->fr_pool); + spin_unlock_irqrestore(&isert_conn->pool_lock, flags); } isert_unmap_data_buf(isert_conn, &wr->data); @@ -2829,8 +2900,8 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; - struct isert_device *device = isert_conn->conn_device; + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; struct ib_send_wr *wr_failed; int rc; @@ -2859,7 +2930,7 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) wr->send_wr_num += 1; } - rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + rc = ib_post_send(isert_conn->qp, wr->send_wr, &wr_failed); if (rc) isert_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); @@ -2879,8 +2950,8 @@ isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_rdma_wr *wr = &isert_cmd->rdma_wr; - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; - struct isert_device *device = isert_conn->conn_device; + struct isert_conn *isert_conn = conn->context; + struct isert_device *device = isert_conn->device; struct ib_send_wr *wr_failed; int rc; @@ -2893,7 +2964,7 @@ isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) return rc; } - rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed); + rc = ib_post_send(isert_conn->qp, wr->send_wr, &wr_failed); if (rc) isert_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); @@ -2987,7 +3058,7 @@ isert_setup_id(struct isert_np *isert_np) goto out_id; } - ret = rdma_listen(id, ISERT_RDMA_LISTEN_BACKLOG); + ret = rdma_listen(id, 0); if (ret) { isert_err("rdma_listen() failed: %d\n", ret); goto out_id; @@ -3046,7 +3117,7 @@ out: static int isert_rdma_accept(struct isert_conn *isert_conn) { - struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_cm_id *cm_id = isert_conn->cm_id; struct rdma_conn_param cp; int ret; @@ -3067,7 +3138,7 @@ isert_rdma_accept(struct isert_conn *isert_conn) static int isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) { - struct isert_conn *isert_conn = (struct isert_conn *)conn->context; + struct isert_conn *isert_conn = conn->context; int ret; isert_info("before login_req comp conn: %p\n", isert_conn); @@ -3090,8 +3161,8 @@ 
isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login) isert_rx_login_req(isert_conn); - isert_info("before conn_login_comp conn: %p\n", conn); - ret = wait_for_completion_interruptible(&isert_conn->conn_login_comp); + isert_info("before login_comp conn: %p\n", conn); + ret = wait_for_completion_interruptible(&isert_conn->login_comp); if (ret) return ret; @@ -3104,7 +3175,7 @@ static void isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, struct isert_conn *isert_conn) { - struct rdma_cm_id *cm_id = isert_conn->conn_cm_id; + struct rdma_cm_id *cm_id = isert_conn->cm_id; struct rdma_route *cm_route = &cm_id->route; struct sockaddr_in *sock_in; struct sockaddr_in6 *sock_in6; @@ -3137,13 +3208,13 @@ isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn, static int isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) { - struct isert_np *isert_np = (struct isert_np *)np->np_context; + struct isert_np *isert_np = np->np_context; struct isert_conn *isert_conn; - int max_accept = 0, ret; + int ret; accept_wait: ret = down_interruptible(&isert_np->np_sem); - if (ret || max_accept > 5) + if (ret) return -ENODEV; spin_lock_bh(&np->np_thread_lock); @@ -3162,17 +3233,15 @@ accept_wait: mutex_lock(&isert_np->np_accept_mutex); if (list_empty(&isert_np->np_accept_list)) { mutex_unlock(&isert_np->np_accept_mutex); - max_accept++; goto accept_wait; } isert_conn = list_first_entry(&isert_np->np_accept_list, - struct isert_conn, conn_accept_node); - list_del_init(&isert_conn->conn_accept_node); + struct isert_conn, accept_node); + list_del_init(&isert_conn->accept_node); mutex_unlock(&isert_np->np_accept_mutex); conn->context = isert_conn; isert_conn->conn = conn; - max_accept = 0; isert_set_conn_info(np, conn, isert_conn); @@ -3184,7 +3253,7 @@ accept_wait: static void isert_free_np(struct iscsi_np *np) { - struct isert_np *isert_np = (struct isert_np *)np->np_context; + struct isert_np *isert_np = np->np_context; struct isert_conn *isert_conn, *n; if (isert_np->np_cm_id) @@ -3202,7 +3271,7 @@ isert_free_np(struct iscsi_np *np) isert_info("Still have isert connections, cleaning up...\n"); list_for_each_entry_safe(isert_conn, n, &isert_np->np_accept_list, - conn_accept_node) { + accept_node) { isert_info("cleaning isert_conn %p state (%d)\n", isert_conn, isert_conn->state); isert_connect_release(isert_conn); @@ -3222,11 +3291,11 @@ static void isert_release_work(struct work_struct *work) isert_info("Starting release conn %p\n", isert_conn); - wait_for_completion(&isert_conn->conn_wait); + wait_for_completion(&isert_conn->wait); - mutex_lock(&isert_conn->conn_mutex); + mutex_lock(&isert_conn->mutex); isert_conn->state = ISER_CONN_DOWN; - mutex_unlock(&isert_conn->conn_mutex); + mutex_unlock(&isert_conn->mutex); isert_info("Destroying conn %p\n", isert_conn); isert_put_conn(isert_conn); @@ -3264,15 +3333,15 @@ isert_wait4flush(struct isert_conn *isert_conn) isert_info("conn %p\n", isert_conn); - init_completion(&isert_conn->conn_wait_comp_err); + init_completion(&isert_conn->wait_comp_err); isert_conn->beacon.wr_id = ISER_BEACON_WRID; /* post an indication that all flush errors were consumed */ - if (ib_post_recv(isert_conn->conn_qp, &isert_conn->beacon, &bad_wr)) { + if (ib_post_recv(isert_conn->qp, &isert_conn->beacon, &bad_wr)) { isert_err("conn %p failed to post beacon", isert_conn); return; } - wait_for_completion(&isert_conn->conn_wait_comp_err); + wait_for_completion(&isert_conn->wait_comp_err); } static void isert_wait_conn(struct iscsi_conn *conn) @@ 
-3281,17 +3350,17 @@ static void isert_wait_conn(struct iscsi_conn *conn) isert_info("Starting conn %p\n", isert_conn); - mutex_lock(&isert_conn->conn_mutex); + mutex_lock(&isert_conn->mutex); /* - * Only wait for conn_wait_comp_err if the isert_conn made it + * Only wait for wait_comp_err if the isert_conn made it * into full feature phase.. */ if (isert_conn->state == ISER_CONN_INIT) { - mutex_unlock(&isert_conn->conn_mutex); + mutex_unlock(&isert_conn->mutex); return; } isert_conn_terminate(isert_conn); - mutex_unlock(&isert_conn->conn_mutex); + mutex_unlock(&isert_conn->mutex); isert_wait4cmds(conn); isert_wait4flush(isert_conn); @@ -3370,7 +3439,7 @@ static void __exit isert_exit(void) } MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); -MODULE_VERSION("0.1"); +MODULE_VERSION("1.0"); MODULE_AUTHOR("nab@Linux-iSCSI.org"); MODULE_LICENSE("GPL"); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 8dc8415d152d..9ec23a786c02 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -31,7 +31,6 @@ #define isert_err(fmt, arg...) \ pr_err(PFX "%s: " fmt, __func__ , ## arg) -#define ISERT_RDMA_LISTEN_BACKLOG 10 #define ISCSI_ISER_SG_TABLESIZE 256 #define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL #define ISER_BEACON_WRID 0xfffffffffffffffeULL @@ -160,27 +159,25 @@ struct isert_conn { u64 login_req_dma; int login_req_len; u64 login_rsp_dma; - unsigned int conn_rx_desc_head; - struct iser_rx_desc *conn_rx_descs; - struct ib_recv_wr conn_rx_wr[ISERT_MIN_POSTED_RX]; + unsigned int rx_desc_head; + struct iser_rx_desc *rx_descs; + struct ib_recv_wr rx_wr[ISERT_MIN_POSTED_RX]; struct iscsi_conn *conn; - struct list_head conn_accept_node; - struct completion conn_login_comp; + struct list_head accept_node; + struct completion login_comp; struct completion login_req_comp; - struct iser_tx_desc conn_login_tx_desc; - struct rdma_cm_id *conn_cm_id; - struct ib_pd *conn_pd; - struct ib_mr *conn_mr; - struct ib_qp *conn_qp; - struct isert_device *conn_device; - struct mutex conn_mutex; - struct completion conn_wait; - struct completion conn_wait_comp_err; - struct kref conn_kref; - struct list_head conn_fr_pool; - int conn_fr_pool_size; + struct iser_tx_desc login_tx_desc; + struct rdma_cm_id *cm_id; + struct ib_qp *qp; + struct isert_device *device; + struct mutex mutex; + struct completion wait; + struct completion wait_comp_err; + struct kref kref; + struct list_head fr_pool; + int fr_pool_size; /* lock to protect fastreg pool */ - spinlock_t conn_lock; + spinlock_t pool_lock; struct work_struct release_work; struct ib_recv_wr beacon; bool logout_posted; @@ -211,6 +208,8 @@ struct isert_device { bool pi_capable; int refcount; struct ib_device *ib_device; + struct ib_pd *pd; + struct ib_mr *mr; struct isert_comp *comps; int comps_used; struct list_head dev_node; diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 0747c0595a9d..918814cd0f80 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -40,6 +40,7 @@ #include <linux/parser.h> #include <linux/random.h> #include <linux/jiffies.h> +#include <rdma/ib_cache.h> #include <linux/atomic.h> @@ -265,10 +266,10 @@ static int srp_init_qp(struct srp_target_port *target, if (!attr) return -ENOMEM; - ret = ib_find_pkey(target->srp_host->srp_dev->dev, - target->srp_host->port, - be16_to_cpu(target->pkey), - &attr->pkey_index); + ret = 
ib_find_cached_pkey(target->srp_host->srp_dev->dev, + target->srp_host->port, + be16_to_cpu(target->pkey), + &attr->pkey_index); if (ret) goto out; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 6e0a477681e9..9b84b4c0a000 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -93,7 +93,7 @@ MODULE_PARM_DESC(srpt_service_guid, " instead of using the node_guid of the first HCA."); static struct ib_client srpt_client; -static struct target_fabric_configfs *srpt_target; +static const struct target_core_fabric_ops srpt_template; static void srpt_release_channel(struct srpt_rdma_ch *ch); static int srpt_queue_status(struct se_cmd *cmd); @@ -207,7 +207,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, } break; default: - printk(KERN_ERR "received unrecognized IB event %d\n", + pr_err("received unrecognized IB event %d\n", event->event); break; } @@ -218,7 +218,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, */ static void srpt_srq_event(struct ib_event *event, void *ctx) { - printk(KERN_INFO "SRQ event %d\n", event->event); + pr_info("SRQ event %d\n", event->event); } /** @@ -242,8 +242,7 @@ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) ch->sess_name, srpt_get_ch_state(ch)); break; default: - printk(KERN_ERR "received unrecognized IB QP event %d\n", - event->event); + pr_err("received unrecognized IB QP event %d\n", event->event); break; } } @@ -602,7 +601,7 @@ static void srpt_unregister_mad_agent(struct srpt_device *sdev) sport = &sdev->port[i - 1]; WARN_ON(sport->port != i); if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0) - printk(KERN_ERR "disabling MAD processing failed.\n"); + pr_err("disabling MAD processing failed.\n"); if (sport->mad_agent) { ib_unregister_mad_agent(sport->mad_agent); sport->mad_agent = NULL; @@ -810,7 +809,7 @@ static int srpt_post_send(struct srpt_rdma_ch *ch, ret = -ENOMEM; if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { - printk(KERN_WARNING "IB send queue full (needed 1)\n"); + pr_warn("IB send queue full (needed 1)\n"); goto out; } @@ -912,7 +911,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, if (ioctx->n_rbuf > (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { - printk(KERN_ERR "received unsupported SRP_CMD request" + pr_err("received unsupported SRP_CMD request" " type (%u out + %u in != %u / %zu)\n", srp_cmd->data_out_desc_cnt, srp_cmd->data_in_desc_cnt, @@ -1432,7 +1431,7 @@ static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, srpt_unmap_sg_to_ib_sge(ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); } else { - printk(KERN_ERR "IB completion has been received too late for" + pr_err("IB completion has been received too late for" " wr_id = %u.\n", ioctx->ioctx.index); } } @@ -1457,7 +1456,7 @@ static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch, SRPT_STATE_DATA_IN)) target_execute_cmd(&ioctx->cmd); else - printk(KERN_ERR "%s[%d]: wrong state = %d\n", __func__, + pr_err("%s[%d]: wrong state = %d\n", __func__, __LINE__, srpt_get_cmd_state(ioctx)); } else if (opcode == SRPT_RDMA_ABORT) { ioctx->rdma_aborted = true; @@ -1481,7 +1480,7 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, switch (opcode) { case SRPT_RDMA_READ_LAST: if (ioctx->n_rdma <= 0) { - printk(KERN_ERR "Received invalid RDMA read" + pr_err("Received invalid RDMA read" " error completion with idx %d\n", ioctx->ioctx.index); break; @@ -1490,14 +1489,13 @@ static 
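/*
 * Aside: the printk(KERN_ERR ...) -> pr_err() conversions running through
 * this file are mechanical, but they pay off when combined with a pr_fmt()
 * definition, which must precede the first include so that every pr_*()
 * call picks up a common prefix. A minimal sketch (prefix illustrative):
 */
#define pr_fmt(fmt) "ib_srpt: " fmt

#include <linux/printk.h>

static void example_log(int status)
{
	pr_err("request failed: %d\n", status);
	/* emits "ib_srpt: request failed: -22" for status == -EINVAL */
}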
void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, if (state == SRPT_STATE_NEED_DATA) srpt_abort_cmd(ioctx); else - printk(KERN_ERR "%s[%d]: wrong state = %d\n", + pr_err("%s[%d]: wrong state = %d\n", __func__, __LINE__, state); break; case SRPT_RDMA_WRITE_LAST: break; default: - printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, - __LINE__, opcode); + pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode); break; } } @@ -1549,8 +1547,8 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp)); max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp); if (sense_data_len > max_sense_len) { - printk(KERN_WARNING "truncated sense data from %d to %d" - " bytes\n", sense_data_len, max_sense_len); + pr_warn("truncated sense data from %d to %d" + " bytes\n", sense_data_len, max_sense_len); sense_data_len = max_sense_len; } @@ -1628,8 +1626,8 @@ static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) int addressing_method; if (unlikely(len < 2)) { - printk(KERN_ERR "Illegal LUN length %d, expected 2 bytes or " - "more", len); + pr_err("Illegal LUN length %d, expected 2 bytes or more\n", + len); goto out; } @@ -1663,7 +1661,7 @@ static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN: default: - printk(KERN_ERR "Unimplemented LUN addressing method %u", + pr_err("Unimplemented LUN addressing method %u\n", addressing_method); break; } @@ -1672,8 +1670,7 @@ out: return res; out_err: - printk(KERN_ERR "Support for multi-level LUNs has not yet been" - " implemented"); + pr_err("Support for multi-level LUNs has not yet been implemented\n"); goto out; } @@ -1723,7 +1720,7 @@ static int srpt_handle_cmd(struct srpt_rdma_ch *ch, } if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { - printk(KERN_ERR "0x%llx: parsing SRP descriptor table failed.\n", + pr_err("0x%llx: parsing SRP descriptor table failed.\n", srp_cmd->tag); ret = TCM_INVALID_CDB_FIELD; goto send_sense; @@ -1912,7 +1909,7 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx); break; case SRP_I_LOGOUT: - printk(KERN_ERR "Not yet implemented: SRP_I_LOGOUT\n"); + pr_err("Not yet implemented: SRP_I_LOGOUT\n"); break; case SRP_CRED_RSP: pr_debug("received SRP_CRED_RSP\n"); @@ -1921,10 +1918,10 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, pr_debug("received SRP_AER_RSP\n"); break; case SRP_RSP: - printk(KERN_ERR "Received SRP_RSP\n"); + pr_err("Received SRP_RSP\n"); break; default: - printk(KERN_ERR "received IU with unknown opcode 0x%x\n", + pr_err("received IU with unknown opcode 0x%x\n", srp_cmd->opcode); break; } @@ -1948,12 +1945,12 @@ static void srpt_process_rcv_completion(struct ib_cq *cq, req_lim = atomic_dec_return(&ch->req_lim); if (unlikely(req_lim < 0)) - printk(KERN_ERR "req_lim = %d < 0\n", req_lim); + pr_err("req_lim = %d < 0\n", req_lim); ioctx = sdev->ioctx_ring[index]; srpt_handle_new_iu(ch, ioctx, NULL); } else { - printk(KERN_INFO "receiving failed for idx %u with status %d\n", - index, wc->status); + pr_info("receiving failed for idx %u with status %d\n", + index, wc->status); } } @@ -1993,12 +1990,12 @@ static void srpt_process_send_completion(struct ib_cq *cq, } } else { if (opcode == SRPT_SEND) { - printk(KERN_INFO "sending response for idx %u failed" - " with status %d\n", index, wc->status); + pr_info("sending response for idx %u failed" + " with status %d\n", index, wc->status); srpt_handle_send_err_comp(ch, wc->wr_id); } else if (opcode != 
SRPT_RDMA_MID) { - printk(KERN_INFO "RDMA t %d for idx %u failed with" - " status %d", opcode, index, wc->status); + pr_info("RDMA t %d for idx %u failed with" + " status %d\n", opcode, index, wc->status); srpt_handle_rdma_err_comp(ch, send_ioctx, opcode); } } @@ -2062,15 +2059,15 @@ static int srpt_compl_thread(void *arg) ch = arg; BUG_ON(!ch); - printk(KERN_INFO "Session %s: kernel thread %s (PID %d) started\n", - ch->sess_name, ch->thread->comm, current->pid); + pr_info("Session %s: kernel thread %s (PID %d) started\n", + ch->sess_name, ch->thread->comm, current->pid); while (!kthread_should_stop()) { wait_event_interruptible(ch->wait_queue, (srpt_process_completion(ch->cq, ch), kthread_should_stop())); } - printk(KERN_INFO "Session %s: kernel thread %s (PID %d) stopped\n", - ch->sess_name, ch->thread->comm, current->pid); + pr_info("Session %s: kernel thread %s (PID %d) stopped\n", + ch->sess_name, ch->thread->comm, current->pid); return 0; } @@ -2097,7 +2094,7 @@ retry: ch->rq_size + srp_sq_size, 0); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); - printk(KERN_ERR "failed to create CQ cqe= %d ret= %d\n", + pr_err("failed to create CQ cqe= %d ret= %d\n", ch->rq_size + srp_sq_size, ret); goto out; } @@ -2123,7 +2120,7 @@ retry: goto retry; } } - printk(KERN_ERR "failed to create_qp ret= %d\n", ret); + pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } @@ -2143,7 +2140,7 @@ retry: ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl"); if (IS_ERR(ch->thread)) { - printk(KERN_ERR "failed to create kernel thread %ld\n", + pr_err("failed to create kernel thread %ld\n", PTR_ERR(ch->thread)); ch->thread = NULL; goto err_destroy_qp; @@ -2204,7 +2201,7 @@ static void __srpt_close_ch(struct srpt_rdma_ch *ch) /* fall through */ case CH_LIVE: if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0) - printk(KERN_ERR "sending CM DREQ failed.\n"); + pr_err("sending CM DREQ failed.\n"); break; case CH_DISCONNECTING: break; @@ -2291,7 +2288,7 @@ static void srpt_drain_channel(struct ib_cm_id *cm_id) ret = srpt_ch_qp_err(ch); if (ret < 0) - printk(KERN_ERR "Setting queue pair in error state" + pr_err("Setting queue pair in error state" " failed: %d\n", ret); } } @@ -2435,17 +2432,17 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, it_iu_len = be32_to_cpu(req->req_it_iu_len); - printk(KERN_INFO "Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," - " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" - " (guid=0x%llx:0x%llx)\n", - be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), - be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), - be64_to_cpu(*(__be64 *)&req->target_port_id[0]), - be64_to_cpu(*(__be64 *)&req->target_port_id[8]), - it_iu_len, - param->port, - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); + pr_info("Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," + " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" + " (guid=0x%llx:0x%llx)\n", + be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), + be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), + be64_to_cpu(*(__be64 *)&req->target_port_id[0]), + be64_to_cpu(*(__be64 *)&req->target_port_id[8]), + it_iu_len, + param->port, + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); rsp = kzalloc(sizeof *rsp, GFP_KERNEL); rej = kzalloc(sizeof *rej, GFP_KERNEL); @@ -2460,7 +2457,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rej->reason = 
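/*
 * Aside: __constant_cpu_to_be32() in the reject paths below predates the
 * realization that plain cpu_to_be32() constant-folds equally well on
 * literal arguments; later treewide cleanups drop the __constant_
 * variants, but this series leaves them as-is.
 */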
__constant_cpu_to_be32( SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); ret = -EINVAL; - printk(KERN_ERR "rejected SRP_LOGIN_REQ because its" + pr_err("rejected SRP_LOGIN_REQ because its" " length (%d bytes) is out of range (%d .. %d)\n", it_iu_len, 64, srp_max_req_size); goto reject; @@ -2470,7 +2467,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); ret = -EINVAL; - printk(KERN_ERR "rejected SRP_LOGIN_REQ because the target port" + pr_err("rejected SRP_LOGIN_REQ because the target port" " has not yet been enabled\n"); goto reject; } @@ -2516,7 +2513,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); ret = -ENOMEM; - printk(KERN_ERR "rejected SRP_LOGIN_REQ because it" + pr_err("rejected SRP_LOGIN_REQ because it" " has an invalid target port identifier.\n"); goto reject; } @@ -2525,7 +2522,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (!ch) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - printk(KERN_ERR "rejected SRP_LOGIN_REQ because no memory.\n"); + pr_err("rejected SRP_LOGIN_REQ because no memory.\n"); ret = -ENOMEM; goto reject; } @@ -2562,7 +2559,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (ret) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - printk(KERN_ERR "rejected SRP_LOGIN_REQ because creating" + pr_err("rejected SRP_LOGIN_REQ because creating" " a new RDMA channel failed.\n"); goto free_ring; } @@ -2571,7 +2568,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (ret) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - printk(KERN_ERR "rejected SRP_LOGIN_REQ because enabling" + pr_err("rejected SRP_LOGIN_REQ because enabling" " RTR failed (error code = %d)\n", ret); goto destroy_ib; } @@ -2586,8 +2583,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, nacl = srpt_lookup_acl(sport, ch->i_port_id); if (!nacl) { - printk(KERN_INFO "Rejected login because no ACL has been" - " configured yet for initiator %s.\n", ch->sess_name); + pr_info("Rejected login because no ACL has been" + " configured yet for initiator %s.\n", ch->sess_name); rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); goto destroy_ib; @@ -2631,7 +2628,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, ret = ib_send_cm_rep(cm_id, rep_param); if (ret) { - printk(KERN_ERR "sending SRP_LOGIN_REQ response failed" + pr_err("sending SRP_LOGIN_REQ response failed" " (error code = %d)\n", ret); goto release_channel; } @@ -2679,7 +2676,7 @@ out: static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received IB REJ for cm_id %p.\n", cm_id); + pr_info("Received IB REJ for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } @@ -2714,13 +2711,13 @@ static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id) static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received IB TimeWait exit for cm_id %p.\n", cm_id); + pr_info("Received IB TimeWait exit for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } static void srpt_cm_rep_error(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received IB REP error for cm_id %p.\n", cm_id); + pr_info("Received IB REP error for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } @@ -2755,9 +2752,9 @@ static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) if (send_drep) { if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0) - printk(KERN_ERR 
"Sending IB DREP failed.\n"); - printk(KERN_INFO "Received DREQ and sent DREP for session %s.\n", - ch->sess_name); + pr_err("Sending IB DREP failed.\n"); + pr_info("Received DREQ and sent DREP for session %s.\n", + ch->sess_name); } } @@ -2766,8 +2763,7 @@ static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) */ static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received InfiniBand DREP message for cm_id %p.\n", - cm_id); + pr_info("Received InfiniBand DREP message for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } @@ -2811,14 +2807,13 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) srpt_cm_rep_error(cm_id); break; case IB_CM_DREQ_ERROR: - printk(KERN_INFO "Received IB DREQ ERROR event.\n"); + pr_info("Received IB DREQ ERROR event.\n"); break; case IB_CM_MRA_RECEIVED: - printk(KERN_INFO "Received IB MRA event\n"); + pr_info("Received IB MRA event\n"); break; default: - printk(KERN_ERR "received unrecognized IB CM event %d\n", - event->event); + pr_err("received unrecognized IB CM event %d\n", event->event); break; } @@ -2848,8 +2843,8 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, ret = -ENOMEM; sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail); if (sq_wr_avail < 0) { - printk(KERN_WARNING "IB send queue full (needed %d)\n", - n_rdma); + pr_warn("IB send queue full (needed %d)\n", + n_rdma); goto out; } } @@ -2889,7 +2884,7 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, } if (ret) - printk(KERN_ERR "%s[%d]: ib_post_send() returned %d for %d/%d", + pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n", __func__, __LINE__, ret, i, n_rdma); if (ret && i > 0) { wr.num_sge = 0; @@ -2897,12 +2892,12 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, wr.send_flags = IB_SEND_SIGNALED; while (ch->state == CH_LIVE && ib_post_send(ch->qp, &wr, &bad_wr) != 0) { - printk(KERN_INFO "Trying to abort failed RDMA transfer [%d]", + pr_info("Trying to abort failed RDMA transfer [%d]\n", ioctx->ioctx.index); msleep(1000); } while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) { - printk(KERN_INFO "Waiting until RDMA abort finished [%d]", + pr_info("Waiting until RDMA abort finished [%d]\n", ioctx->ioctx.index); msleep(1000); } @@ -2923,17 +2918,17 @@ static int srpt_xfer_data(struct srpt_rdma_ch *ch, ret = srpt_map_sg_to_ib_sge(ch, ioctx); if (ret) { - printk(KERN_ERR "%s[%d] ret=%d\n", __func__, __LINE__, ret); + pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret); goto out; } ret = srpt_perform_rdmas(ch, ioctx); if (ret) { if (ret == -EAGAIN || ret == -ENOMEM) - printk(KERN_INFO "%s[%d] queue full -- ret=%d\n", - __func__, __LINE__, ret); + pr_info("%s[%d] queue full -- ret=%d\n", + __func__, __LINE__, ret); else - printk(KERN_ERR "%s[%d] fatal error -- ret=%d\n", + pr_err("%s[%d] fatal error -- ret=%d\n", __func__, __LINE__, ret); goto out_unmap; } @@ -3058,7 +3053,7 @@ static void srpt_queue_response(struct se_cmd *cmd) !ioctx->queue_status_only) { ret = srpt_xfer_data(ch, ioctx); if (ret) { - printk(KERN_ERR "xfer_data failed for tag %llu\n", + pr_err("xfer_data failed for tag %llu\n", ioctx->tag); return; } @@ -3075,7 +3070,7 @@ static void srpt_queue_response(struct se_cmd *cmd) } ret = srpt_post_send(ch, ioctx, resp_len); if (ret) { - printk(KERN_ERR "sending cmd response failed for tag %llu\n", + pr_err("sending cmd response failed for tag %llu\n", ioctx->tag); srpt_unmap_sg_to_ib_sge(ch, ioctx); srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); @@ -3154,7 +3149,7 @@ static int srpt_release_sdev(struct 
srpt_device *sdev) res = wait_event_interruptible(sdev->ch_releaseQ, srpt_ch_list_empty(sdev)); if (res) - printk(KERN_ERR "%s: interrupted.\n", __func__); + pr_err("%s: interrupted.\n", __func__); return 0; } @@ -3293,7 +3288,7 @@ static void srpt_add_one(struct ib_device *device) spin_lock_init(&sport->port_acl_lock); if (srpt_refresh_port(sport)) { - printk(KERN_ERR "MAD registration failed for %s-%d.\n", + pr_err("MAD registration failed for %s-%d.\n", srpt_sdev_name(sdev), i); goto err_ring; } @@ -3330,7 +3325,7 @@ free_dev: kfree(sdev); err: sdev = NULL; - printk(KERN_INFO "%s(%s) failed.\n", __func__, device->name); + pr_info("%s(%s) failed.\n", __func__, device->name); goto out; } @@ -3344,8 +3339,7 @@ static void srpt_remove_one(struct ib_device *device) sdev = ib_get_client_data(device, &srpt_client); if (!sdev) { - printk(KERN_INFO "%s(%s): nothing to do.\n", __func__, - device->name); + pr_info("%s(%s): nothing to do.\n", __func__, device->name); return; } @@ -3464,7 +3458,7 @@ static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg) nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL); if (!nacl) { - printk(KERN_ERR "Unable to allocate struct srpt_node_acl\n"); + pr_err("Unable to allocate struct srpt_node_acl\n"); return NULL; } @@ -3615,7 +3609,7 @@ static struct se_node_acl *srpt_make_nodeacl(struct se_portal_group *tpg, u8 i_port_id[16]; if (srpt_parse_i_port_id(i_port_id, name) < 0) { - printk(KERN_ERR "invalid initiator port ID %s\n", name); + pr_err("invalid initiator port ID %s\n", name); ret = -EINVAL; goto err; } @@ -3816,12 +3810,12 @@ static ssize_t srpt_tpg_store_enable( ret = kstrtoul(page, 0, &tmp); if (ret < 0) { - printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n"); + pr_err("Unable to extract srpt_tpg_store_enable\n"); return -EINVAL; } if ((tmp != 0) && (tmp != 1)) { - printk(KERN_ERR "Illegal value for srpt_tpg_store_enable: %lu\n", tmp); + pr_err("Illegal value for srpt_tpg_store_enable: %lu\n", tmp); return -EINVAL; } if (tmp == 1) @@ -3851,7 +3845,7 @@ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, int res; /* Initialize sport->port_wwn and sport->port_tpg_1 */ - res = core_tpg_register(&srpt_target->tf_ops, &sport->port_wwn, + res = core_tpg_register(&srpt_template, &sport->port_wwn, &sport->port_tpg_1, sport, TRANSPORT_TPG_TYPE_NORMAL); if (res) return ERR_PTR(res); @@ -3919,7 +3913,9 @@ static struct configfs_attribute *srpt_wwn_attrs[] = { NULL, }; -static struct target_core_fabric_ops srpt_template = { +static const struct target_core_fabric_ops srpt_template = { + .module = THIS_MODULE, + .name = "srpt", .get_fabric_name = srpt_get_fabric_name, .get_fabric_proto_ident = srpt_get_fabric_proto_ident, .tpg_get_wwn = srpt_get_fabric_wwn, @@ -3964,6 +3960,10 @@ static struct target_core_fabric_ops srpt_template = { .fabric_drop_np = NULL, .fabric_make_nodeacl = srpt_make_nodeacl, .fabric_drop_nodeacl = srpt_drop_nodeacl, + + .tfc_wwn_attrs = srpt_wwn_attrs, + .tfc_tpg_base_attrs = srpt_tpg_attrs, + .tfc_tpg_attrib_attrs = srpt_tpg_attrib_attrs, }; /** @@ -3980,7 +3980,7 @@ static int __init srpt_init_module(void) ret = -EINVAL; if (srp_max_req_size < MIN_MAX_REQ_SIZE) { - printk(KERN_ERR "invalid value %d for kernel module parameter" + pr_err("invalid value %d for kernel module parameter" " srp_max_req_size -- must be at least %d.\n", srp_max_req_size, MIN_MAX_REQ_SIZE); goto out; @@ -3988,54 +3988,26 @@ static int __init srpt_init_module(void) if (srpt_srq_size < MIN_SRPT_SRQ_SIZE || srpt_srq_size 
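/*
 * Aside: the srpt_template changes here follow the 4.1-era target core
 * conversion -- a fabric driver now declares a single const ops table
 * (with .module, .name and the tfc_*_attrs attribute lists folded in) and
 * registers it, replacing the hand-built configfs template torn out of
 * srpt_init_module() below. Sketch with illustrative names:
 */
static const struct target_core_fabric_ops example_ops = {
	.module	= THIS_MODULE,
	.name	= "example",
	/* mandatory callbacks and tfc_*_attrs lists elided */
};

static int __init example_init(void)
{
	return target_register_template(&example_ops);
}

static void __exit example_exit(void)
{
	target_unregister_template(&example_ops);
}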
> MAX_SRPT_SRQ_SIZE) { - printk(KERN_ERR "invalid value %d for kernel module parameter" + pr_err("invalid value %d for kernel module parameter" " srpt_srq_size -- must be in the range [%d..%d].\n", srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE); goto out; } - srpt_target = target_fabric_configfs_init(THIS_MODULE, "srpt"); - if (IS_ERR(srpt_target)) { - printk(KERN_ERR "couldn't register\n"); - ret = PTR_ERR(srpt_target); + ret = target_register_template(&srpt_template); + if (ret) goto out; - } - - srpt_target->tf_ops = srpt_template; - - /* - * Set up default attribute lists. - */ - srpt_target->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = srpt_wwn_attrs; - srpt_target->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = srpt_tpg_attrs; - srpt_target->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = srpt_tpg_attrib_attrs; - srpt_target->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - srpt_target->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - srpt_target->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; - srpt_target->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - srpt_target->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; - srpt_target->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - - ret = target_fabric_configfs_register(srpt_target); - if (ret < 0) { - printk(KERN_ERR "couldn't register\n"); - goto out_free_target; - } ret = ib_register_client(&srpt_client); if (ret) { - printk(KERN_ERR "couldn't register IB client\n"); + pr_err("couldn't register IB client\n"); goto out_unregister_target; } return 0; out_unregister_target: - target_fabric_configfs_deregister(srpt_target); - srpt_target = NULL; -out_free_target: - if (srpt_target) - target_fabric_configfs_free(srpt_target); + target_unregister_template(&srpt_template); out: return ret; } @@ -4043,8 +4015,7 @@ out: static void __exit srpt_cleanup_module(void) { ib_unregister_client(&srpt_client); - target_fabric_configfs_deregister(srpt_target); - srpt_target = NULL; + target_unregister_template(&srpt_template); } module_init(srpt_init_module); diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c index 64b9b59ad4cb..b50c5b8b8a4d 100644 --- a/drivers/input/keyboard/cros_ec_keyb.c +++ b/drivers/input/keyboard/cros_ec_keyb.c @@ -148,16 +148,19 @@ static void cros_ec_keyb_process(struct cros_ec_keyb *ckdev, static int cros_ec_keyb_get_state(struct cros_ec_keyb *ckdev, uint8_t *kb_state) { + int ret; struct cros_ec_command msg = { - .version = 0, .command = EC_CMD_MKBP_STATE, - .outdata = NULL, - .outsize = 0, - .indata = kb_state, .insize = ckdev->cols, }; - return cros_ec_cmd_xfer(ckdev->ec, &msg); + ret = cros_ec_cmd_xfer(ckdev->ec, &msg); + if (ret < 0) + return ret; + + memcpy(kb_state, msg.indata, ckdev->cols); + + return 0; } static irqreturn_t cros_ec_keyb_irq(int irq, void *data) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index a35927cd42e5..68d43beccb7e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -50,6 +50,7 @@ #define CONTEXT_SIZE VTD_PAGE_SIZE #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) +#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) @@ -184,32 +185,11 @@ static int force_on = 0; * 64-127: Reserved */ struct root_entry { - u64 val; - u64 rsvd1; + u64 lo; + u64 hi; }; #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct 
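/*
 * Aside: the cros_ec_keyb hunk above reflects the reworked
 * cros_ec_cmd_xfer() contract -- msg.indata is now a fixed-size buffer
 * embedded in the command structure rather than a caller-supplied pointer,
 * so the driver checks for a negative return and memcpy()s the reply out
 * itself.
 */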
root_entry)) -static inline bool root_present(struct root_entry *root) -{ - return (root->val & 1); -} -static inline void set_root_present(struct root_entry *root) -{ - root->val |= 1; -} -static inline void set_root_value(struct root_entry *root, unsigned long value) -{ - root->val &= ~VTD_PAGE_MASK; - root->val |= value & VTD_PAGE_MASK; -} -static inline struct context_entry * -get_context_addr_from_root(struct root_entry *root) -{ - return (struct context_entry *) - (root_present(root)?phys_to_virt( - root->val & VTD_PAGE_MASK) : - NULL); -} /* * low 64 bits: @@ -682,6 +662,40 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) domain->iommu_superpage = domain_update_iommu_superpage(NULL); } +static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu, + u8 bus, u8 devfn, int alloc) +{ + struct root_entry *root = &iommu->root_entry[bus]; + struct context_entry *context; + u64 *entry; + + if (ecap_ecs(iommu->ecap)) { + if (devfn >= 0x80) { + devfn -= 0x80; + entry = &root->hi; + } + devfn *= 2; + } + entry = &root->lo; + if (*entry & 1) + context = phys_to_virt(*entry & VTD_PAGE_MASK); + else { + unsigned long phy_addr; + if (!alloc) + return NULL; + + context = alloc_pgtable_page(iommu->node); + if (!context) + return NULL; + + __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); + phy_addr = virt_to_phys((void *)context); + *entry = phy_addr | 1; + __iommu_flush_cache(iommu, entry, sizeof(*entry)); + } + return &context[devfn]; +} + static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn) { struct dmar_drhd_unit *drhd = NULL; @@ -741,75 +755,36 @@ static void domain_flush_cache(struct dmar_domain *domain, clflush_cache_range(addr, size); } -/* Gets context entry for a given bus and devfn */ -static struct context_entry * device_to_context_entry(struct intel_iommu *iommu, - u8 bus, u8 devfn) -{ - struct root_entry *root; - struct context_entry *context; - unsigned long phy_addr; - unsigned long flags; - - spin_lock_irqsave(&iommu->lock, flags); - root = &iommu->root_entry[bus]; - context = get_context_addr_from_root(root); - if (!context) { - context = (struct context_entry *) - alloc_pgtable_page(iommu->node); - if (!context) { - spin_unlock_irqrestore(&iommu->lock, flags); - return NULL; - } - __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE); - phy_addr = virt_to_phys((void *)context); - set_root_value(root, phy_addr); - set_root_present(root); - __iommu_flush_cache(iommu, root, sizeof(*root)); - } - spin_unlock_irqrestore(&iommu->lock, flags); - return &context[devfn]; -} - static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn) { - struct root_entry *root; struct context_entry *context; - int ret; + int ret = 0; unsigned long flags; spin_lock_irqsave(&iommu->lock, flags); - root = &iommu->root_entry[bus]; - context = get_context_addr_from_root(root); - if (!context) { - ret = 0; - goto out; - } - ret = context_present(&context[devfn]); -out: + context = iommu_context_addr(iommu, bus, devfn, 0); + if (context) + ret = context_present(context); spin_unlock_irqrestore(&iommu->lock, flags); return ret; } static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn) { - struct root_entry *root; struct context_entry *context; unsigned long flags; spin_lock_irqsave(&iommu->lock, flags); - root = &iommu->root_entry[bus]; - context = get_context_addr_from_root(root); + context = iommu_context_addr(iommu, bus, devfn, 0); if (context) { - context_clear_entry(&context[devfn]); - 
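/*
 * Aside: with the extended-context (ECS) layout introduced above, the
 * 128-bit root entry for each bus is split in two -- the low 64 bits
 * reference context entries for devfn 0x00-0x7f and the high 64 bits those
 * for 0x80-0xff -- and extended context entries are twice the legacy
 * width, which is what the "devfn -= 0x80; ... devfn *= 2" folding in
 * iommu_context_addr() encodes.
 */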
__iommu_flush_cache(iommu, &context[devfn], \ - sizeof(*context)); + context_clear_entry(context); + __iommu_flush_cache(iommu, context, sizeof(*context)); } spin_unlock_irqrestore(&iommu->lock, flags); } static void free_context_table(struct intel_iommu *iommu) { - struct root_entry *root; int i; unsigned long flags; struct context_entry *context; @@ -819,10 +794,17 @@ static void free_context_table(struct intel_iommu *iommu) goto out; } for (i = 0; i < ROOT_ENTRY_NR; i++) { - root = &iommu->root_entry[i]; - context = get_context_addr_from_root(root); + context = iommu_context_addr(iommu, i, 0, 0); + if (context) + free_pgtable_page(context); + + if (!ecap_ecs(iommu->ecap)) + continue; + + context = iommu_context_addr(iommu, i, 0x80, 0); if (context) free_pgtable_page(context); + } free_pgtable_page(iommu->root_entry); iommu->root_entry = NULL; @@ -1146,14 +1128,16 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) static void iommu_set_root_entry(struct intel_iommu *iommu) { - void *addr; + u64 addr; u32 sts; unsigned long flag; - addr = iommu->root_entry; + addr = virt_to_phys(iommu->root_entry); + if (ecap_ecs(iommu->ecap)) + addr |= DMA_RTADDR_RTT; raw_spin_lock_irqsave(&iommu->register_lock, flag); - dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr)); + dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); @@ -1800,7 +1784,9 @@ static int domain_context_mapping_one(struct dmar_domain *domain, BUG_ON(translation != CONTEXT_TT_PASS_THROUGH && translation != CONTEXT_TT_MULTI_LEVEL); - context = device_to_context_entry(iommu, bus, devfn); + spin_lock_irqsave(&iommu->lock, flags); + context = iommu_context_addr(iommu, bus, devfn, 1); + spin_unlock_irqrestore(&iommu->lock, flags); if (!context) return -ENOMEM; spin_lock_irqsave(&iommu->lock, flags); @@ -2564,6 +2550,10 @@ static bool device_has_rmrr(struct device *dev) * In both cases we assume that PCI USB devices with RMRRs have them largely * for historical reasons and that the RMRR space is not actively used post * boot. This exclusion may change if vendors begin to abuse it. + * + * The same exception is made for graphics devices, with the requirement that + * any use of the RMRR regions will be torn down before assigning the device + * to a guest. */ static bool device_is_rmrr_locked(struct device *dev) { @@ -2573,7 +2563,7 @@ static bool device_is_rmrr_locked(struct device *dev) if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); - if ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB) + if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev)) return false; } diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 6c25b3c5b729..5709ae9c3e77 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -637,10 +637,7 @@ static int __init intel_enable_irq_remapping(void) if (x2apic_supported()) { eim = !dmar_x2apic_optout(); if (!eim) - printk(KERN_WARNING - "Your BIOS is broken and requested that x2apic be disabled.\n" - "This will slightly decrease performance.\n" - "Use 'intremap=no_x2apic_optout' to override BIOS request.\n"); + pr_info("x2apic is disabled because BIOS sets x2apic opt out bit. 
You can use 'intremap=no_x2apic_optout' to override the BIOS setting.\n"); } for_each_iommu(iommu, drhd) { diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index a6ce3476834e..7b315e385ba3 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -33,12 +33,14 @@ #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_irq.h> +#include <linux/acpi.h> #include <linux/irqdomain.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/irqchip/chained_irq.h> #include <linux/irqchip/arm-gic.h> +#include <linux/irqchip/arm-gic-acpi.h> #include <asm/cputype.h> #include <asm/irq.h> @@ -1107,3 +1109,105 @@ IRQCHIP_DECLARE(msm_8660_qgic, "qcom,msm-8660-qgic", gic_of_init); IRQCHIP_DECLARE(msm_qgic2, "qcom,msm-qgic2", gic_of_init); #endif + +#ifdef CONFIG_ACPI +static phys_addr_t dist_phy_base, cpu_phy_base __initdata; + +static int __init +gic_acpi_parse_madt_cpu(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *processor; + phys_addr_t gic_cpu_base; + static int cpu_base_assigned; + + processor = (struct acpi_madt_generic_interrupt *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + /* + * There is no support for non-banked GICv1/2 register in ACPI spec. + * All CPU interface addresses have to be the same. + */ + gic_cpu_base = processor->base_address; + if (cpu_base_assigned && gic_cpu_base != cpu_phy_base) + return -EINVAL; + + cpu_phy_base = gic_cpu_base; + cpu_base_assigned = 1; + return 0; +} + +static int __init +gic_acpi_parse_madt_distributor(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_distributor *dist; + + dist = (struct acpi_madt_generic_distributor *)header; + + if (BAD_MADT_ENTRY(dist, end)) + return -EINVAL; + + dist_phy_base = dist->base_address; + return 0; +} + +int __init +gic_v2_acpi_init(struct acpi_table_header *table) +{ + void __iomem *cpu_base, *dist_base; + int count; + + /* Collect CPU base addresses */ + count = acpi_parse_entries(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + gic_acpi_parse_madt_cpu, table, + ACPI_MADT_TYPE_GENERIC_INTERRUPT, 0); + if (count <= 0) { + pr_err("No valid GICC entries exist\n"); + return -EINVAL; + } + + /* + * Find distributor base address. We expect one distributor entry since + * ACPI 5.1 spec neither support multi-GIC instances nor GIC cascade. + */ + count = acpi_parse_entries(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + gic_acpi_parse_madt_distributor, table, + ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, 0); + if (count <= 0) { + pr_err("No valid GICD entries exist\n"); + return -EINVAL; + } else if (count > 1) { + pr_err("More than one GICD entry detected\n"); + return -EINVAL; + } + + cpu_base = ioremap(cpu_phy_base, ACPI_GIC_CPU_IF_MEM_SIZE); + if (!cpu_base) { + pr_err("Unable to map GICC registers\n"); + return -ENOMEM; + } + + dist_base = ioremap(dist_phy_base, ACPI_GICV2_DIST_MEM_SIZE); + if (!dist_base) { + pr_err("Unable to map GICD registers\n"); + iounmap(cpu_base); + return -ENOMEM; + } + + /* + * Initialize zero GIC instance (no multi-GIC support). Also, set GIC + * as default IRQ domain to allow for GSI registration and GSI to IRQ + * number translation (see acpi_register_gsi() and acpi_gsi_to_irq()). 
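+ * Aside: acpi_parse_entries() walks the MADT once per call, invoking the
+ * callback for every subtable of the requested type and returning the
+ * match count; an error from the callback aborts the walk, which is how
+ * the GICC handler above rejects mismatched CPU interface addresses on
+ * hardware that banks those registers at a single base.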
+ */ + gic_init_bases(0, -1, dist_base, cpu_base, 0, NULL); + irq_set_default_host(gic_data[0].domain); + + acpi_irq_model = ACPI_IRQ_MODEL_GIC; + return 0; +} +#endif diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c index 0fe2f718d81c..afd1af3dfe5a 100644 --- a/drivers/irqchip/irqchip.c +++ b/drivers/irqchip/irqchip.c @@ -8,6 +8,7 @@ * warranty of any kind, whether express or implied. */ +#include <linux/acpi_irq.h> #include <linux/init.h> #include <linux/of_irq.h> #include <linux/irqchip.h> @@ -26,4 +27,6 @@ extern struct of_device_id __irqchip_of_table[]; void __init irqchip_init(void) { of_irq_init(__irqchip_of_table); + + acpi_irq_init(); } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 1219af493c0f..19a32280731d 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -211,10 +211,9 @@ static void initialize(struct lg_cpu *cpu) /* * The Guest tells us where we're not to deliver interrupts by putting - * the range of addresses into "struct lguest_data". + * the instruction address into "struct lguest_data". */ - if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) - || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) + if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 70dfcdc29f1f..5e7559be222a 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -56,21 +56,16 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) } /*H:210 - * The set_guest_interrupt() routine actually delivers the interrupt or - * trap. The mechanics of delivering traps and interrupts to the Guest are the - * same, except some traps have an "error code" which gets pushed onto the - * stack as well: the caller tells us if this is one. - * - * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this - * interrupt or trap. It's split into two parts for traditional reasons: gcc - * on i386 used to be frightened by 64 bit numbers. + * The push_guest_interrupt_stack() routine saves Guest state on the stack for + * an interrupt or trap. The mechanics of delivering traps and interrupts to + * the Guest are the same, except some traps have an "error code" which gets + * pushed onto the stack as well: the caller tells us if this is one. * * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo * it). */ -static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, - bool has_err) +static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err) { unsigned long gstack, origstack; u32 eflags, ss, irq_enable; @@ -130,12 +125,28 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, if (has_err) push_guest_stack(cpu, &gstack, cpu->regs->errcode); - /* - * Now we've pushed all the old state, we change the stack, the code - * segment and the address to execute. - */ + /* Adjust the stack pointer and stack segment. */ cpu->regs->ss = ss; cpu->regs->esp = virtstack + (gstack - origstack); +} + +/* + * This actually makes the Guest start executing the given interrupt/trap + * handler. + * + * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this + * interrupt or trap. 
It's split into two parts for traditional reasons: gcc + * on i386 used to be frightened by 64 bit numbers. + */ +static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi) +{ + /* If we're already in the kernel, we don't change stacks. */ + if ((cpu->regs->ss&0x3) != GUEST_PL) + cpu->regs->ss = cpu->esp1; + + /* + * Set the code segment and the address to execute. + */ cpu->regs->cs = (__KERNEL_CS|GUEST_PL); cpu->regs->eip = idt_address(lo, hi); @@ -158,6 +169,24 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, kill_guest(cpu, "Disabling interrupts"); } +/* This restores the eflags word which was pushed on the stack by a trap */ +static void restore_eflags(struct lg_cpu *cpu) +{ + /* This is the physical address of the stack. */ + unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp); + + /* + * Stack looks like this: + * Address Contents + * esp EIP + * esp + 4 CS + * esp + 8 EFLAGS + */ + cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32); + cpu->regs->eflags &= + ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); +} + /*H:205 * Virtual Interrupts. * @@ -200,14 +229,6 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) BUG_ON(irq >= LGUEST_IRQS); - /* - * They may be in the middle of an iret, where they asked us never to - * deliver interrupts. - */ - if (cpu->regs->eip >= cpu->lg->noirq_start && - (cpu->regs->eip < cpu->lg->noirq_end)) - return; - /* If they're halted, interrupts restart them. */ if (cpu->halted) { /* Re-enable interrupts. */ @@ -237,12 +258,34 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ clear_bit(irq, cpu->irqs_pending); + /* - * set_guest_interrupt() takes the interrupt descriptor and a - * flag to say whether this interrupt pushes an error code onto - * the stack as well: virtual interrupts never do. + * They may be about to iret, where they asked us never to + * deliver interrupts. In this case, we can emulate that iret + * then immediately deliver the interrupt. This is basically + * a noop: the iret would pop the interrupt frame and restore + * eflags, and then we'd set it up again. So just restore the + * eflags word and jump straight to the handler in this case. + * + * Denys Vlasenko points out that this isn't quite right: if + * the iret was returning to userspace, then that interrupt + * would reset the stack pointer (which the Guest told us + * about via LHCALL_SET_STACK). But unless the Guest is being + * *really* weird, that will be the same as the current stack + * anyway. */ - set_guest_interrupt(cpu, idt->a, idt->b, false); + if (cpu->regs->eip == cpu->lg->noirq_iret) { + restore_eflags(cpu); + } else { + /* + * set_guest_interrupt() takes a flag to say whether + * this interrupt pushes an error code onto the stack + * as well: virtual interrupts never do. + */ + push_guest_interrupt_stack(cpu, false); + } + /* Actually make Guest cpu jump to handler. 
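+ * (i.e. load %cs:%eip from the IDT entry; any stack frame required was
+ * built beforehand by push_guest_interrupt_stack())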
*/ + guest_run_interrupt(cpu, idt->a, idt->b); } /* @@ -353,8 +396,9 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) */ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return false; - set_guest_interrupt(cpu, cpu->arch.idt[num].a, - cpu->arch.idt[num].b, has_err(num)); + push_guest_interrupt_stack(cpu, has_err(num)); + guest_run_interrupt(cpu, cpu->arch.idt[num].a, + cpu->arch.idt[num].b); return true; } @@ -395,8 +439,9 @@ static bool direct_trap(unsigned int num) * The Guest has the ability to turn its interrupt gates into trap gates, * if it is careful. The Host will let trap gates can go directly to the * Guest, but the Guest needs the interrupts atomically disabled for an - * interrupt gate. It can do this by pointing the trap gate at instructions - * within noirq_start and noirq_end, where it can safely disable interrupts. + * interrupt gate. The Host could provide a mechanism to register more + * "no-interrupt" regions, and the Guest could point the trap gate at + * instructions within that region, where it can safely disable interrupts. */ /*M:006 diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 307e8b39e7d1..ac8ad0461e80 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -102,7 +102,7 @@ struct lguest { struct pgdir pgdirs[4]; - unsigned long noirq_start, noirq_end; + unsigned long noirq_iret; unsigned int stack_pages; u32 tsc_khz; diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index c4c6113eb9a6..30c60687d277 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -339,6 +339,13 @@ static ssize_t write(struct file *file, const char __user *in, } } +static int open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + + return 0; +} + /*L:060 * The final piece of interface code is the close() routine. It reverses * everything done in initialize(). This is usually called because the @@ -409,6 +416,7 @@ static int close(struct inode *inode, struct file *file) */ static const struct file_operations lguest_fops = { .owner = THIS_MODULE, + .open = open, .release = close, .write = write, .read = read, diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 6ddc983417d5..edcf4ab66e00 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -175,6 +175,22 @@ config MD_FAULTY In unsure, say N. + +config MD_CLUSTER + tristate "Cluster Support for MD (EXPERIMENTAL)" + depends on BLK_DEV_MD + depends on DLM + default n + ---help--- + Clustering support for MD devices. This enables locking and + synchronization across multiple systems on the cluster, so all + nodes in the cluster can access the MD devices simultaneously. + + This brings the redundancy (and uptime) of RAID levels across the + nodes of the cluster. + + If unsure, say N. 
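Aside: the tristate symbol above pairs with the Makefile hunk just below;
obj-$(CONFIG_FOO) expands to obj-y, obj-m or nothing at all, so one rule
covers built-in, modular and disabled builds. Illustrative fragment:

	obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
	# =y -> linked into vmlinux; =m -> builds md-cluster.ko; unset -> skipped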
+ source "drivers/md/bcache/Kconfig" config BLK_DEV_DM_BUILTIN diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 1863feaa5846..dba4db5985fb 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o +obj-$(CONFIG_MD_CLUSTER) += md-cluster.o obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 3a5767968ba0..2bc56e2a3526 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; + int node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; @@ -433,6 +437,7 @@ void bitmap_update_sb(struct bitmap *bitmap) /* This might have been changed by a reshape */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); + sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); kunmap_atomic(sb); @@ -544,6 +549,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; unsigned long long events; + int nodes = 0; unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; @@ -562,6 +568,22 @@ static int bitmap_read_sb(struct bitmap *bitmap) return -ENOMEM; bitmap->storage.sb_page = sb_page; +re_read: + /* If cluster_slot is set, the cluster is setup */ + if (bitmap->cluster_slot >= 0) { + sector_t bm_blocks = bitmap->mddev->resync_max_sectors; + + sector_div(bm_blocks, + bitmap->mddev->bitmap_info.chunksize >> 9); + /* bits to bytes */ + bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); + /* to 4k blocks */ + bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); + bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3); + pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, + bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset); + } + if (bitmap->storage.file) { loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? 
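/*
 * Aside: the re_read label above sets up a two-pass superblock load for
 * clustered arrays -- pass one only learns the node count, then
 * md_setup_cluster() is brought up to discover this node's slot, the
 * slot-dependent offset is folded into bitmap_info.offset, and the
 * superblock is read again from the right window (see the "goto re_read"
 * below; chunksize is assigned first because the offset math divides by
 * it).
 */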
PAGE_SIZE : isize; @@ -577,12 +599,15 @@ static int bitmap_read_sb(struct bitmap *bitmap) if (err) return err; + err = -EINVAL; sb = kmap_atomic(sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); + nodes = le32_to_cpu(sb->nodes); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -619,7 +644,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) goto out; } events = le64_to_cpu(sb->events); - if (events < bitmap->mddev->events) { + if (!nodes && (events < bitmap->mddev->events)) { printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " "-- forcing full recovery\n", @@ -634,20 +659,40 @@ static int bitmap_read_sb(struct bitmap *bitmap) if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); bitmap->events_cleared = le64_to_cpu(sb->events_cleared); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); err = 0; + out: kunmap_atomic(sb); + /* Assiging chunksize is required for "re_read" */ + bitmap->mddev->bitmap_info.chunksize = chunksize; + if (nodes && (bitmap->cluster_slot < 0)) { + err = md_setup_cluster(bitmap->mddev, nodes); + if (err) { + pr_err("%s: Could not setup cluster service (%d)\n", + bmname(bitmap), err); + goto out_no_sb; + } + bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); + goto re_read; + } + + out_no_sb: if (test_bit(BITMAP_STALE, &bitmap->flags)) bitmap->events_cleared = bitmap->mddev->events; bitmap->mddev->bitmap_info.chunksize = chunksize; bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->mddev->bitmap_info.nodes = nodes; if (bitmap->mddev->bitmap_info.space == 0 || bitmap->mddev->bitmap_info.space > sectors_reserved) bitmap->mddev->bitmap_info.space = sectors_reserved; - if (err) + if (err) { bitmap_print_sb(bitmap); + if (bitmap->cluster_slot < 0) + md_cluster_stop(bitmap->mddev); + } return err; } @@ -692,9 +737,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store, } static int bitmap_storage_alloc(struct bitmap_storage *store, - unsigned long chunks, int with_super) + unsigned long chunks, int with_super, + int slot_number) { - int pnum; + int pnum, offset = 0; unsigned long num_pages; unsigned long bytes; @@ -703,6 +749,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, bytes += sizeof(bitmap_super_t); num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); + offset = slot_number * (num_pages - 1); store->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); @@ -713,20 +760,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store, store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (store->sb_page == NULL) return -ENOMEM; - store->sb_page->index = 0; } + pnum = 0; if (store->sb_page) { store->filemap[0] = store->sb_page; pnum = 1; + store->sb_page->index = offset; } + for ( ; pnum < num_pages; pnum++) { store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!store->filemap[pnum]) { store->file_pages = pnum; return -ENOMEM; } - store->filemap[pnum]->index = pnum; + store->filemap[pnum]->index = pnum + offset; } store->file_pages = pnum; @@ -885,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) } } +static int 
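/*
 * Aside: the slot_number threaded into bitmap_storage_alloc() above gives
 * every cluster node its own window of pages in the shared bitmap area --
 * page indices are biased by slot_number * (num_pages - 1), and
 * write_sb_page() applies the matching cluster_slot * file_pages offset at
 * write-out, so the per-node bitmaps land one after another on disk.
 */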
bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + int set = 0; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return -EINVAL; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + set = test_bit(bit, paddr); + else + set = test_bit_le(bit, paddr); + kunmap_atomic(paddr); + return set; +} + + /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ @@ -935,7 +1006,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n */ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { - unsigned long i, chunks, index, oldindex, bit; + unsigned long i, chunks, index, oldindex, bit, node_offset = 0; struct page *page = NULL; unsigned long bit_cnt = 0; struct file *file; @@ -981,6 +1052,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) if (!bitmap->mddev->bitmap_info.external) offset = sizeof(bitmap_super_t); + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); + for (i = 0; i < chunks; i++) { int b; index = file_page_index(&bitmap->storage, i); @@ -1001,7 +1075,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) bitmap->mddev, bitmap->mddev->bitmap_info.offset, page, - index, count); + index + node_offset, count); if (ret) goto err; @@ -1207,7 +1281,6 @@ void bitmap_daemon_work(struct mddev *mddev) j < bitmap->storage.file_pages && !test_bit(BITMAP_STALE, &bitmap->flags); j++) { - if (test_page_attr(bitmap, j, BITMAP_PAGE_DIRTY)) /* bitmap_unplug will handle the rest */ @@ -1530,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n return; } if (!*bmc) { - *bmc = 2 | (needed ? NEEDED_MASK : 0); + *bmc = 2; bitmap_count_page(&bitmap->counts, offset, 1); bitmap_set_pending(&bitmap->counts, offset); bitmap->allclean = 0; } + if (needed) + *bmc |= NEEDED_MASK; spin_unlock_irq(&bitmap->counts.lock); } @@ -1591,6 +1666,10 @@ static void bitmap_free(struct bitmap *bitmap) if (!bitmap) /* there was no bitmap */ return; + if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && + bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) + md_cluster_stop(bitmap->mddev); + /* Shouldn't be needed - but just in case.... 
*/ wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes) == 0); @@ -1636,7 +1715,7 @@ void bitmap_destroy(struct mddev *mddev) * initialize the bitmap structure * if this returns an error, bitmap_destroy must be called to do clean up */ -int bitmap_create(struct mddev *mddev) +struct bitmap *bitmap_create(struct mddev *mddev, int slot) { struct bitmap *bitmap; sector_t blocks = mddev->resync_max_sectors; @@ -1650,7 +1729,7 @@ int bitmap_create(struct mddev *mddev) bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock_init(&bitmap->counts.lock); atomic_set(&bitmap->pending_writes, 0); @@ -1659,6 +1738,7 @@ int bitmap_create(struct mddev *mddev) init_waitqueue_head(&bitmap->behind_wait); bitmap->mddev = mddev; + bitmap->cluster_slot = slot; if (mddev->kobj.sd) bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); @@ -1706,12 +1786,14 @@ int bitmap_create(struct mddev *mddev) printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", bitmap->counts.pages, bmname(bitmap)); - mddev->bitmap = bitmap; - return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; + err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; + if (err) + goto error; + return bitmap; error: bitmap_free(bitmap); - return err; + return ERR_PTR(err); } int bitmap_load(struct mddev *mddev) @@ -1765,6 +1847,60 @@ out: } EXPORT_SYMBOL_GPL(bitmap_load); +/* Loads the bitmap associated with slot and copies the resync information + * to our bitmap + */ +int bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *low, sector_t *high, bool clear_bits) +{ + int rv = 0, i, j; + sector_t block, lo = 0, hi = 0; + struct bitmap_counts *counts; + struct bitmap *bitmap = bitmap_create(mddev, slot); + + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + rv = bitmap_read_sb(bitmap); + if (rv) + goto err; + + rv = bitmap_init_from_disk(bitmap, 0); + if (rv) + goto err; + + counts = &bitmap->counts; + for (j = 0; j < counts->chunks; j++) { + block = (sector_t)j << counts->chunkshift; + if (bitmap_file_test_bit(bitmap, block)) { + if (!lo) + lo = block; + hi = block; + bitmap_file_clear_bit(bitmap, block); + bitmap_set_memory_bits(mddev->bitmap, block, 1); + bitmap_file_set_bit(mddev->bitmap, block); + } + } + + if (clear_bits) { + bitmap_update_sb(bitmap); + /* Setting this for the ev_page should be enough. 
+ * And we do not require both write_all and PAGE_DIRT either + */ + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + bitmap_write_all(bitmap); + bitmap_unplug(bitmap); + } + *low = lo; + *high = hi; +err: + bitmap_free(bitmap); + return rv; +} +EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); + + void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) { unsigned long chunk_kb; @@ -1849,7 +1985,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, memset(&store, 0, sizeof(store)); if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) ret = bitmap_storage_alloc(&store, chunks, - !bitmap->mddev->bitmap_info.external); + !bitmap->mddev->bitmap_info.external, + bitmap->cluster_slot); if (ret) goto err; @@ -2021,13 +2158,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; mddev->bitmap_info.offset = offset; if (mddev->pers) { + struct bitmap *bitmap; mddev->pers->quiesce(mddev, 1); - rv = bitmap_create(mddev); - if (!rv) + bitmap = bitmap_create(mddev, -1); + if (IS_ERR(bitmap)) + rv = PTR_ERR(bitmap); + else { + mddev->bitmap = bitmap; rv = bitmap_load(mddev); - if (rv) { - bitmap_destroy(mddev); - mddev->bitmap_info.offset = 0; + if (rv) { + bitmap_destroy(mddev); + mddev->bitmap_info.offset = 0; + } } mddev->pers->quiesce(mddev, 0); if (rv) @@ -2186,6 +2328,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); static ssize_t metadata_show(struct mddev *mddev, char *page) { + if (mddev_is_clustered(mddev)) + return sprintf(page, "clustered\n"); return sprintf(page, "%s\n", (mddev->bitmap_info.external ? "external" : "internal")); } @@ -2198,7 +2342,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) return -EBUSY; if (strncmp(buf, "external", 8) == 0) mddev->bitmap_info.external = 1; - else if (strncmp(buf, "internal", 8) == 0) + else if ((strncmp(buf, "internal", 8) == 0) || + (strncmp(buf, "clustered", 9) == 0)) mddev->bitmap_info.external = 0; else return -EINVAL; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 30210b9c4ef9..f1f4dd01090d 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -130,8 +130,9 @@ typedef struct bitmap_super_s { __le32 write_behind; /* 60 number of outstanding write-behind writes */ __le32 sectors_reserved; /* 64 number of 512-byte sectors that are * reserved for the bitmap. */ - - __u8 pad[256 - 68]; /* set to zero */ + __le32 nodes; /* 68 the maximum number of nodes in cluster. 
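+ * (4 bytes at offset 68; together with the 64-byte cluster_name at offset
+ * 72 the used header grows to 136 bytes, and the pad below shrinks to
+ * 256 - 136 so the on-disk superblock stays 256 bytes)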
*/ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ } bitmap_super_t; /* notes: @@ -226,12 +227,13 @@ struct bitmap { wait_queue_head_t behind_wait; struct kernfs_node *sysfs_can_clear; + int cluster_slot; /* Slot offset for clustered env */ }; /* the bitmap API */ /* these are used only by md/bitmap */ -int bitmap_create(struct mddev *mddev); +struct bitmap *bitmap_create(struct mddev *mddev, int slot); int bitmap_load(struct mddev *mddev); void bitmap_flush(struct mddev *mddev); void bitmap_destroy(struct mddev *mddev); @@ -260,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev); int bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init); +int bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *lo, sector_t *hi, bool clear_bits); #endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c new file mode 100644 index 000000000000..fcfc4b9b2672 --- /dev/null +++ b/drivers/md/md-cluster.c @@ -0,0 +1,965 @@ +/* + * Copyright (C) 2015, SUSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + */ + + +#include <linux/module.h> +#include <linux/dlm.h> +#include <linux/sched.h> +#include <linux/raid/md_p.h> +#include "md.h" +#include "bitmap.h" +#include "md-cluster.h" + +#define LVB_SIZE 64 +#define NEW_DEV_TIMEOUT 5000 + +struct dlm_lock_resource { + dlm_lockspace_t *ls; + struct dlm_lksb lksb; + char *name; /* lock name. */ + uint32_t flags; /* flags to pass to dlm_lock() */ + struct completion completion; /* completion for synchronized locking */ + void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ + struct mddev *mddev; /* pointing back to mddev. */ +}; + +struct suspend_info { + int slot; + sector_t lo; + sector_t hi; + struct list_head list; +}; + +struct resync_info { + __le64 lo; + __le64 hi; +}; + +/* md_cluster_info flags */ +#define MD_CLUSTER_WAITING_FOR_NEWDISK 1 + + +struct md_cluster_info { + /* dlm lock space and resources for clustered raid. 
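+	 * (Editor's note, a reader's overview rather than part of the
+	 * original patch: all of the locks below — the superblock lock, the
+	 * per-slot bitmap locks, and the message/token/ack/no-new-dev
+	 * resources used for inter-node messaging — live in a single DLM
+	 * lockspace named after the array UUID; see join() further down.)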
*/ + dlm_lockspace_t *lockspace; + int slot_number; + struct completion completion; + struct dlm_lock_resource *sb_lock; + struct mutex sb_mutex; + struct dlm_lock_resource *bitmap_lockres; + struct list_head suspend_list; + spinlock_t suspend_lock; + struct md_thread *recovery_thread; + unsigned long recovery_map; + /* communication loc resources */ + struct dlm_lock_resource *ack_lockres; + struct dlm_lock_resource *message_lockres; + struct dlm_lock_resource *token_lockres; + struct dlm_lock_resource *no_new_dev_lockres; + struct md_thread *recv_thread; + struct completion newdisk_completion; + unsigned long state; +}; + +enum msg_type { + METADATA_UPDATED = 0, + RESYNCING, + NEWDISK, + REMOVE, + RE_ADD, +}; + +struct cluster_msg { + int type; + int slot; + /* TODO: Unionize this for smaller footprint */ + sector_t low; + sector_t high; + char uuid[16]; + int raid_slot; +}; + +static void sync_ast(void *arg) +{ + struct dlm_lock_resource *res; + + res = (struct dlm_lock_resource *) arg; + complete(&res->completion); +} + +static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) +{ + int ret = 0; + + init_completion(&res->completion); + ret = dlm_lock(res->ls, mode, &res->lksb, + res->flags, res->name, strlen(res->name), + 0, sync_ast, res, res->bast); + if (ret) + return ret; + wait_for_completion(&res->completion); + return res->lksb.sb_status; +} + +static int dlm_unlock_sync(struct dlm_lock_resource *res) +{ + return dlm_lock_sync(res, DLM_LOCK_NL); +} + +static struct dlm_lock_resource *lockres_init(struct mddev *mddev, + char *name, void (*bastfn)(void *arg, int mode), int with_lvb) +{ + struct dlm_lock_resource *res = NULL; + int ret, namelen; + struct md_cluster_info *cinfo = mddev->cluster_info; + + res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + if (!res) + return NULL; + res->ls = cinfo->lockspace; + res->mddev = mddev; + namelen = strlen(name); + res->name = kzalloc(namelen + 1, GFP_KERNEL); + if (!res->name) { + pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); + goto out_err; + } + strlcpy(res->name, name, namelen + 1); + if (with_lvb) { + res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); + if (!res->lksb.sb_lvbptr) { + pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name); + goto out_err; + } + res->flags = DLM_LKF_VALBLK; + } + + if (bastfn) + res->bast = bastfn; + + res->flags |= DLM_LKF_EXPEDITE; + + ret = dlm_lock_sync(res, DLM_LOCK_NL); + if (ret) { + pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name); + goto out_err; + } + res->flags &= ~DLM_LKF_EXPEDITE; + res->flags |= DLM_LKF_CONVERT; + + return res; +out_err: + kfree(res->lksb.sb_lvbptr); + kfree(res->name); + kfree(res); + return NULL; +} + +static void lockres_free(struct dlm_lock_resource *res) +{ + if (!res) + return; + + init_completion(&res->completion); + dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); + wait_for_completion(&res->completion); + + kfree(res->name); + kfree(res->lksb.sb_lvbptr); + kfree(res); +} + +static char *pretty_uuid(char *dest, char *src) +{ + int i, len = 0; + + for (i = 0; i < 16; i++) { + if (i == 4 || i == 6 || i == 8 || i == 10) + len += sprintf(dest + len, "-"); + len += sprintf(dest + len, "%02x", (__u8)src[i]); + } + return dest; +} + +static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, + sector_t lo, sector_t hi) +{ + struct resync_info *ri; + + ri = (struct resync_info *)lockres->lksb.sb_lvbptr; + ri->lo = cpu_to_le64(lo); + ri->hi = 
cpu_to_le64(hi); +} + +static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) +{ + struct resync_info ri; + struct suspend_info *s = NULL; + sector_t hi = 0; + + dlm_lock_sync(lockres, DLM_LOCK_CR); + memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); + hi = le64_to_cpu(ri.hi); + if (ri.hi > 0) { + s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); + if (!s) + goto out; + s->hi = hi; + s->lo = le64_to_cpu(ri.lo); + } + dlm_unlock_sync(lockres); +out: + return s; +} + +static void recover_bitmaps(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct md_cluster_info *cinfo = mddev->cluster_info; + struct dlm_lock_resource *bm_lockres; + char str[64]; + int slot, ret; + struct suspend_info *s, *tmp; + sector_t lo, hi; + + while (cinfo->recovery_map) { + slot = fls64((u64)cinfo->recovery_map) - 1; + + /* Clear suspend_area associated with the bitmap */ + spin_lock_irq(&cinfo->suspend_lock); + list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) + if (slot == s->slot) { + list_del(&s->list); + kfree(s); + } + spin_unlock_irq(&cinfo->suspend_lock); + + snprintf(str, 64, "bitmap%04d", slot); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("md-cluster: Cannot initialize bitmaps\n"); + goto clear_bit; + } + + ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (ret) { + pr_err("md-cluster: Could not DLM lock %s: %d\n", + str, ret); + goto clear_bit; + } + ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); + if (ret) { + pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); + goto dlm_unlock; + } + if (hi > 0) { + /* TODO:Wait for current resync to get over */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (lo < mddev->recovery_cp) + mddev->recovery_cp = lo; + md_check_recovery(mddev); + } +dlm_unlock: + dlm_unlock_sync(bm_lockres); +clear_bit: + clear_bit(slot, &cinfo->recovery_map); + } +} + +static void recover_prep(void *arg) +{ +} + +static void recover_slot(void *arg, struct dlm_slot *slot) +{ + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + + pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", + mddev->bitmap_info.cluster_name, + slot->nodeid, slot->slot, + cinfo->slot_number); + set_bit(slot->slot - 1, &cinfo->recovery_map); + if (!cinfo->recovery_thread) { + cinfo->recovery_thread = md_register_thread(recover_bitmaps, + mddev, "recover"); + if (!cinfo->recovery_thread) { + pr_warn("md-cluster: Could not create recovery thread\n"); + return; + } + } + md_wakeup_thread(cinfo->recovery_thread); +} + +static void recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cinfo->slot_number = our_slot; + complete(&cinfo->completion); +} + +static const struct dlm_lockspace_ops md_ls_ops = { + .recover_prep = recover_prep, + .recover_slot = recover_slot, + .recover_done = recover_done, +}; + +/* + * The BAST function for the ack lock resource + * This function wakes up the receive thread in + * order to receive and process the message. 
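+ * (Editor's note: the wake-up works because each node normally holds
+ * the ack resource in CR; when a sender up-converts ack to EX in
+ * __sendmsg() below, the DLM delivers this blocking AST to every CR
+ * holder, whose recv_daemon() then reads the message LVB, drops CR on
+ * ack, and re-acquires it once the message has been processed.)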
+ */ +static void ack_bast(void *arg, int mode) +{ + struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; + struct md_cluster_info *cinfo = res->mddev->cluster_info; + + if (mode == DLM_LOCK_EX) + md_wakeup_thread(cinfo->recv_thread); +} + +static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) +{ + struct suspend_info *s, *tmp; + + list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) + if (slot == s->slot) { + pr_info("%s:%d Deleting suspend_info: %d\n", + __func__, __LINE__, slot); + list_del(&s->list); + kfree(s); + break; + } +} + +static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) +{ + spin_lock_irq(&cinfo->suspend_lock); + __remove_suspend_info(cinfo, slot); + spin_unlock_irq(&cinfo->suspend_lock); +} + + +static void process_suspend_info(struct md_cluster_info *cinfo, + int slot, sector_t lo, sector_t hi) +{ + struct suspend_info *s; + + if (!hi) { + remove_suspend_info(cinfo, slot); + return; + } + s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); + if (!s) + return; + s->slot = slot; + s->lo = lo; + s->hi = hi; + spin_lock_irq(&cinfo->suspend_lock); + /* Remove existing entry (if exists) before adding */ + __remove_suspend_info(cinfo, slot); + list_add(&s->list, &cinfo->suspend_list); + spin_unlock_irq(&cinfo->suspend_lock); +} + +static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) +{ + char disk_uuid[64]; + struct md_cluster_info *cinfo = mddev->cluster_info; + char event_name[] = "EVENT=ADD_DEVICE"; + char raid_slot[16]; + char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; + int len; + + len = snprintf(disk_uuid, 64, "DEVICE_UUID="); + pretty_uuid(disk_uuid + len, cmsg->uuid); + snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); + pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); + init_completion(&cinfo->newdisk_completion); + set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); + kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); + wait_for_completion_timeout(&cinfo->newdisk_completion, + NEW_DEV_TIMEOUT); + clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); +} + + +static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + md_reload_sb(mddev); + dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); +} + +static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) +{ + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + + if (rdev) + md_kick_rdev_from_array(rdev); + else + pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); +} + +static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) +{ + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + + if (rdev && test_bit(Faulty, &rdev->flags)) + clear_bit(Faulty, &rdev->flags); + else + pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); +} + +static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) +{ + switch (msg->type) { + case METADATA_UPDATED: + pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", + __func__, __LINE__, msg->slot); + process_metadata_update(mddev, msg); + break; + case RESYNCING: + pr_info("%s: %d Received message: RESYNCING from %d\n", + __func__, __LINE__, msg->slot); + process_suspend_info(mddev->cluster_info, msg->slot, + msg->low, msg->high); + break; + 
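+	/*
+	 * Editor's note (annotation, not part of the original patch):
+	 * NEWDISK is generated when another node runs --add; the handler
+	 * below pokes userspace with an ADD_DEVICE uevent and then waits,
+	 * bounded by NEW_DEV_TIMEOUT, for the local mdadm to confirm the
+	 * device or reject it via CLUSTERED_DISK_NACK.
+	 */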
case NEWDISK: + pr_info("%s: %d Received message: NEWDISK from %d\n", + __func__, __LINE__, msg->slot); + process_add_new_disk(mddev, msg); + break; + case REMOVE: + pr_info("%s: %d Received REMOVE from %d\n", + __func__, __LINE__, msg->slot); + process_remove_disk(mddev, msg); + break; + case RE_ADD: + pr_info("%s: %d Received RE_ADD from %d\n", + __func__, __LINE__, msg->slot); + process_readd_disk(mddev, msg); + break; + default: + pr_warn("%s:%d Received unknown message from %d\n", + __func__, __LINE__, msg->slot); + } +} + +/* + * thread for receiving message + */ +static void recv_daemon(struct md_thread *thread) +{ + struct md_cluster_info *cinfo = thread->mddev->cluster_info; + struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; + struct dlm_lock_resource *message_lockres = cinfo->message_lockres; + struct cluster_msg msg; + + /*get CR on Message*/ + if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { + pr_err("md/raid1:failed to get CR on MESSAGE\n"); + return; + } + + /* read lvb and wake up thread to process this message_lockres */ + memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); + process_recvd_msg(thread->mddev, &msg); + + /*release CR on ack_lockres*/ + dlm_unlock_sync(ack_lockres); + /*up-convert to EX on message_lockres*/ + dlm_lock_sync(message_lockres, DLM_LOCK_EX); + /*get CR on ack_lockres again*/ + dlm_lock_sync(ack_lockres, DLM_LOCK_CR); + /*release CR on message_lockres*/ + dlm_unlock_sync(message_lockres); +} + +/* lock_comm() + * Takes the lock on the TOKEN lock resource so no other + * node can communicate while the operation is underway. + */ +static int lock_comm(struct md_cluster_info *cinfo) +{ + int error; + + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (error) + pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", + __func__, __LINE__, error); + return error; +} + +static void unlock_comm(struct md_cluster_info *cinfo) +{ + dlm_unlock_sync(cinfo->token_lockres); +} + +/* __sendmsg() + * This function performs the actual sending of the message. This function is + * usually called after performing the encompassing operation + * The function: + * 1. Grabs the message lockresource in EX mode + * 2. Copies the message to the message LVB + * 3. Downconverts message lockresource to CR + * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes + * and the other nodes read the message. The thread will wait here until all other + * nodes have released ack lock resource. + * 5. 
Downconvert ack lockresource to CR + */ +static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) +{ + int error; + int slot = cinfo->slot_number - 1; + + cmsg->slot = cpu_to_le32(slot); + /*get EX on Message*/ + error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error); + goto failed_message; + } + + memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, + sizeof(struct cluster_msg)); + /*down-convert EX to CR on Message*/ + error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); + if (error) { + pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n", + error); + goto failed_message; + } + + /*up-convert CR to EX on Ack*/ + error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n", + error); + goto failed_ack; + } + + /*down-convert EX to CR on Ack*/ + error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR); + if (error) { + pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n", + error); + goto failed_ack; + } + +failed_ack: + dlm_unlock_sync(cinfo->message_lockres); +failed_message: + return error; +} + +static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) +{ + int ret; + + lock_comm(cinfo); + ret = __sendmsg(cinfo, cmsg); + unlock_comm(cinfo); + return ret; +} + +static int gather_all_resync_info(struct mddev *mddev, int total_slots) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int i, ret = 0; + struct dlm_lock_resource *bm_lockres; + struct suspend_info *s; + char str[64]; + + + for (i = 0; i < total_slots; i++) { + memset(str, '\0', 64); + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) + return -ENOMEM; + if (i == (cinfo->slot_number - 1)) + continue; + + bm_lockres->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (ret == -EAGAIN) { + memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); + s = read_resync_info(mddev, bm_lockres); + if (s) { + pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", + __func__, __LINE__, + (unsigned long long) s->lo, + (unsigned long long) s->hi, i); + spin_lock_irq(&cinfo->suspend_lock); + s->slot = i; + list_add(&s->list, &cinfo->suspend_list); + spin_unlock_irq(&cinfo->suspend_lock); + } + ret = 0; + lockres_free(bm_lockres); + continue; + } + if (ret) + goto out; + /* TODO: Read the disk bitmap sb and check if it needs recovery */ + dlm_unlock_sync(bm_lockres); + lockres_free(bm_lockres); + } +out: + return ret; +} + +static int join(struct mddev *mddev, int nodes) +{ + struct md_cluster_info *cinfo; + int ret, ops_rv; + char str[64]; + + if (!try_module_get(THIS_MODULE)) + return -ENOENT; + + cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); + if (!cinfo) + return -ENOMEM; + + init_completion(&cinfo->completion); + + mutex_init(&cinfo->sb_mutex); + mddev->cluster_info = cinfo; + + memset(str, 0, 64); + pretty_uuid(str, mddev->uuid); + ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, + DLM_LSFL_FS, LVB_SIZE, + &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); + if (ret) + goto err; + wait_for_completion(&cinfo->completion); + if (nodes < cinfo->slot_number) { + pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).", + cinfo->slot_number, nodes); + ret = -ERANGE; + goto err; + } + cinfo->sb_lock = lockres_init(mddev, "cmd-super", + NULL, 0); + if (!cinfo->sb_lock) { + ret = -ENOMEM; 
+		goto err;
+	}
+	/* Initialize the communication resources */
+	ret = -ENOMEM;
+	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
+	if (!cinfo->recv_thread) {
+		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
+		goto err;
+	}
+	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
+	if (!cinfo->message_lockres)
+		goto err;
+	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
+	if (!cinfo->token_lockres)
+		goto err;
+	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
+	if (!cinfo->ack_lockres)
+		goto err;
+	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
+	if (!cinfo->no_new_dev_lockres)
+		goto err;
+
+	/* get sync CR lock on ACK. */
+	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
+		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
+				ret);
+	/* get sync CR lock on no-new-dev. */
+	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
+		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
+
+
+	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
+	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
+	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
+	if (!cinfo->bitmap_lockres)
+		goto err;
+	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
+		pr_err("Failed to get bitmap lock\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	INIT_LIST_HEAD(&cinfo->suspend_list);
+	spin_lock_init(&cinfo->suspend_lock);
+
+	ret = gather_all_resync_info(mddev, nodes);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	lockres_free(cinfo->message_lockres);
+	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->ack_lockres);
+	lockres_free(cinfo->no_new_dev_lockres);
+	lockres_free(cinfo->bitmap_lockres);
+	lockres_free(cinfo->sb_lock);
+	if (cinfo->lockspace)
+		dlm_release_lockspace(cinfo->lockspace, 2);
+	mddev->cluster_info = NULL;
+	kfree(cinfo);
+	module_put(THIS_MODULE);
+	return ret;
+}
+
+static int leave(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	if (!cinfo)
+		return 0;
+	md_unregister_thread(&cinfo->recovery_thread);
+	md_unregister_thread(&cinfo->recv_thread);
+	lockres_free(cinfo->message_lockres);
+	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->ack_lockres);
+	lockres_free(cinfo->no_new_dev_lockres);
+	lockres_free(cinfo->sb_lock);
+	lockres_free(cinfo->bitmap_lockres);
+	dlm_release_lockspace(cinfo->lockspace, 2);
+	return 0;
+}
+
+/* slot_number(): Returns the MD slot number to use
+ * DLM starts the slot numbers from 1, whereas cluster-md
+ * wants the number to start from zero, so we subtract one
+ */
+static int slot_number(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	return cinfo->slot_number - 1;
+}
+
+static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
+	/* Re-acquire the lock to refresh LVB */
+	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
+}
+
+static int metadata_update_start(struct mddev *mddev)
+{
+	return lock_comm(mddev->cluster_info);
+}
+
+static int metadata_update_finish(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	int ret;
+
+	memset(&cmsg, 0, sizeof(cmsg));
+	cmsg.type = cpu_to_le32(METADATA_UPDATED);
+	ret = __sendmsg(cinfo, &cmsg);
+	unlock_comm(cinfo);
+	return ret;
+}
+
+static int
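+/*
+ * Editor's note — a minimal usage sketch, not part of the original patch,
+ * showing how the two helpers above bracket a metadata change so that only
+ * one node rewrites the superblocks at a time (this is the pattern the md.c
+ * hunks later in this series follow):
+ *
+ *	if (mddev_is_clustered(mddev))
+ *		md_cluster_ops->metadata_update_start(mddev);	(takes EX on TOKEN)
+ *	md_update_sb(mddev, 1);
+ *	if (mddev_is_clustered(mddev))
+ *		md_cluster_ops->metadata_update_finish(mddev);	(sends METADATA_UPDATED, drops TOKEN)
+ */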
metadata_update_cancel(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + return dlm_unlock_sync(cinfo->token_lockres); +} + +static int resync_send(struct mddev *mddev, enum msg_type type, + sector_t lo, sector_t hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + int slot = cinfo->slot_number - 1; + + pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, + (unsigned long long)lo, + (unsigned long long)hi); + resync_info_update(mddev, lo, hi); + cmsg.type = cpu_to_le32(type); + cmsg.slot = cpu_to_le32(slot); + cmsg.low = cpu_to_le64(lo); + cmsg.high = cpu_to_le64(hi); + return sendmsg(cinfo, &cmsg); +} + +static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) +{ + pr_info("%s:%d\n", __func__, __LINE__); + return resync_send(mddev, RESYNCING, lo, hi); +} + +static void resync_finish(struct mddev *mddev) +{ + pr_info("%s:%d\n", __func__, __LINE__); + resync_send(mddev, RESYNCING, 0, 0); +} + +static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret = 0; + struct suspend_info *s; + + spin_lock_irq(&cinfo->suspend_lock); + if (list_empty(&cinfo->suspend_list)) + goto out; + list_for_each_entry(s, &cinfo->suspend_list, list) + if (hi > s->lo && lo < s->hi) { + ret = 1; + break; + } +out: + spin_unlock_irq(&cinfo->suspend_lock); + return ret; +} + +static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + int ret = 0; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + char *uuid = sb->device_uuid; + + memset(&cmsg, 0, sizeof(cmsg)); + cmsg.type = cpu_to_le32(NEWDISK); + memcpy(cmsg.uuid, uuid, 16); + cmsg.raid_slot = rdev->desc_nr; + lock_comm(cinfo); + ret = __sendmsg(cinfo, &cmsg); + if (ret) + return ret; + cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE; + ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX); + cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE; + /* Some node does not "see" the device */ + if (ret == -EAGAIN) + ret = -ENOENT; + else + dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); + return ret; +} + +static int add_new_disk_finish(struct mddev *mddev) +{ + struct cluster_msg cmsg; + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret; + /* Write sb and inform others */ + md_update_sb(mddev, 1); + cmsg.type = METADATA_UPDATED; + ret = __sendmsg(cinfo, &cmsg); + unlock_comm(cinfo); + return ret; +} + +static int new_disk_ack(struct mddev *mddev, bool ack) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) { + pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev)); + return -EINVAL; + } + + if (ack) + dlm_unlock_sync(cinfo->no_new_dev_lockres); + complete(&cinfo->newdisk_completion); + return 0; +} + +static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct cluster_msg cmsg; + struct md_cluster_info *cinfo = mddev->cluster_info; + cmsg.type = REMOVE; + cmsg.raid_slot = rdev->desc_nr; + return __sendmsg(cinfo, &cmsg); +} + +static int gather_bitmaps(struct md_rdev *rdev) +{ + int sn, err; + sector_t lo, hi; + struct cluster_msg cmsg; + struct mddev *mddev = rdev->mddev; + struct md_cluster_info *cinfo = mddev->cluster_info; + + cmsg.type = RE_ADD; + cmsg.raid_slot = rdev->desc_nr; + err = sendmsg(cinfo, &cmsg); + if (err) + goto out; + + for (sn = 0; 
sn < mddev->bitmap_info.nodes; sn++) { + if (sn == (cinfo->slot_number - 1)) + continue; + err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); + if (err) { + pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); + goto out; + } + if ((hi > 0) && (lo < mddev->recovery_cp)) + mddev->recovery_cp = lo; + } +out: + return err; +} + +static struct md_cluster_operations cluster_ops = { + .join = join, + .leave = leave, + .slot_number = slot_number, + .resync_info_update = resync_info_update, + .resync_start = resync_start, + .resync_finish = resync_finish, + .metadata_update_start = metadata_update_start, + .metadata_update_finish = metadata_update_finish, + .metadata_update_cancel = metadata_update_cancel, + .area_resyncing = area_resyncing, + .add_new_disk_start = add_new_disk_start, + .add_new_disk_finish = add_new_disk_finish, + .new_disk_ack = new_disk_ack, + .remove_disk = remove_disk, + .gather_bitmaps = gather_bitmaps, +}; + +static int __init cluster_init(void) +{ + pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); + pr_info("Registering Cluster MD functions\n"); + register_md_cluster_operations(&cluster_ops, THIS_MODULE); + return 0; +} + +static void cluster_exit(void) +{ + unregister_md_cluster_operations(); +} + +module_init(cluster_init); +module_exit(cluster_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Clustering support for MD"); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h new file mode 100644 index 000000000000..6817ee00e053 --- /dev/null +++ b/drivers/md/md-cluster.h @@ -0,0 +1,29 @@ + + +#ifndef _MD_CLUSTER_H +#define _MD_CLUSTER_H + +#include "md.h" + +struct mddev; +struct md_rdev; + +struct md_cluster_operations { + int (*join)(struct mddev *mddev, int nodes); + int (*leave)(struct mddev *mddev); + int (*slot_number)(struct mddev *mddev); + void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); + int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); + void (*resync_finish)(struct mddev *mddev); + int (*metadata_update_start)(struct mddev *mddev); + int (*metadata_update_finish)(struct mddev *mddev); + int (*metadata_update_cancel)(struct mddev *mddev); + int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi); + int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); + int (*add_new_disk_finish)(struct mddev *mddev); + int (*new_disk_ack)(struct mddev *mddev, bool ack); + int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); + int (*gather_bitmaps)(struct md_rdev *rdev); +}; + +#endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md.c b/drivers/md/md.c index e6178787ce3d..d4f31e195e26 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -53,6 +53,7 @@ #include <linux/slab.h> #include "md.h" #include "bitmap.h" +#include "md-cluster.h" #ifndef MODULE static void autostart_arrays(int part); @@ -66,6 +67,11 @@ static void autostart_arrays(int part); static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); +struct md_cluster_operations *md_cluster_ops; +EXPORT_SYMBOL(md_cluster_ops); +struct module *md_cluster_mod; +EXPORT_SYMBOL(md_cluster_mod); + static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; @@ -640,7 +646,7 @@ void mddev_unlock(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_unlock); -static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) { struct md_rdev *rdev; @@ -650,6 +656,7 @@ static 
struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) return NULL; } +EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { @@ -2047,11 +2054,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) int choice = 0; if (mddev->pers) choice = mddev->raid_disks; - while (find_rdev_nr_rcu(mddev, choice)) + while (md_find_rdev_nr_rcu(mddev, choice)) choice++; rdev->desc_nr = choice; } else { - if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) { + if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { rcu_read_unlock(); return -EBUSY; } @@ -2166,11 +2173,12 @@ static void export_rdev(struct md_rdev *rdev) kobject_put(&rdev->kobj); } -static void kick_rdev_from_array(struct md_rdev *rdev) +void md_kick_rdev_from_array(struct md_rdev *rdev) { unbind_rdev_from_array(rdev); export_rdev(rdev); } +EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); static void export_array(struct mddev *mddev) { @@ -2179,7 +2187,7 @@ static void export_array(struct mddev *mddev) while (!list_empty(&mddev->disks)) { rdev = list_first_entry(&mddev->disks, struct md_rdev, same_set); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); } mddev->raid_disks = 0; mddev->major_version = 0; @@ -2208,7 +2216,7 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } -static void md_update_sb(struct mddev *mddev, int force_change) +void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; @@ -2369,6 +2377,37 @@ repeat: wake_up(&rdev->blocked_wait); } } +EXPORT_SYMBOL(md_update_sb); + +static int add_bound_rdev(struct md_rdev *rdev) +{ + struct mddev *mddev = rdev->mddev; + int err = 0; + + if (!mddev->pers->hot_remove_disk) { + /* If there is hot_add_disk but no hot_remove_disk + * then added disks for geometry changes, + * and should be added immediately. + */ + super_types[mddev->major_version]. + validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); + if (err) { + unbind_rdev_from_array(rdev); + export_rdev(rdev); + return err; + } + } + sysfs_notify_dirent_safe(rdev->sysfs_state); + + set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev->degraded) + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_new_event(mddev); + md_wakeup_thread(mddev->thread); + return 0; +} /* words written to sysfs files may, or may not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -2471,10 +2510,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = -EBUSY; else { struct mddev *mddev = rdev->mddev; - kick_rdev_from_array(rdev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->remove_disk(mddev, rdev); + md_kick_rdev_from_array(rdev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); if (mddev->pers) md_update_sb(mddev, 1); md_new_event(mddev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); err = 0; } } else if (cmd_match(buf, "writemostly")) { @@ -2553,6 +2598,21 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) clear_bit(Replacement, &rdev->flags); err = 0; } + } else if (cmd_match(buf, "re-add")) { + if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { + /* clear_bit is performed _after_ all the devices + * have their local Faulty bit cleared. 
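+		 * (Editor's note: in practice this path is reached when
+		 * "re-add" is written to the rdev's sysfs state attribute
+		 * on each node, e.g. by "mdadm --re-add".)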
If any writes + * happen in the meantime in the local node, they + * will land in the local bitmap, which will be synced + * by this node eventually + */ + if (!mddev_is_clustered(rdev->mddev) || + (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + clear_bit(Faulty, &rdev->flags); + err = add_bound_rdev(rdev); + } + } else + err = -EBUSY; } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -3127,7 +3187,7 @@ static void analyze_sbs(struct mddev *mddev) "md: fatal superblock inconsistency in %s" " -- removing from array\n", bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); } super_types[mddev->major_version]. @@ -3142,18 +3202,27 @@ static void analyze_sbs(struct mddev *mddev) "md: %s: %s: only %d devices permitted\n", mdname(mddev), bdevname(rdev->bdev, b), mddev->max_disks); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); continue; } - if (rdev != freshest) + if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { printk(KERN_WARNING "md: kicking non-fresh %s" " from array!\n", bdevname(rdev->bdev,b)); - kick_rdev_from_array(rdev); + md_kick_rdev_from_array(rdev); continue; } + /* No device should have a Candidate flag + * when reading devices + */ + if (test_bit(Candidate, &rdev->flags)) { + pr_info("md: kicking Cluster Candidate %s from array!\n", + bdevname(rdev->bdev, b)); + md_kick_rdev_from_array(rdev); + } + } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; @@ -4008,8 +4077,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -4354,7 +4427,6 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; int err; - int chunk; if (kstrtoull(buf, 10, &min)) return -EINVAL; @@ -4368,16 +4440,8 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; - /* Must be a multiple of chunk_size */ - chunk = mddev->chunk_sectors; - if (chunk) { - sector_t temp = min; - - err = -EINVAL; - if (sector_div(temp, chunk)) - goto out_unlock; - } - mddev->resync_min = min; + /* Round down to multiple of 4K for safety */ + mddev->resync_min = round_down(min, 8); err = 0; out_unlock: @@ -5077,10 +5141,16 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = bitmap_create(mddev); - if (err) + struct bitmap *bitmap; + + bitmap = bitmap_create(mddev, -1); + if (IS_ERR(bitmap)) { + err = PTR_ERR(bitmap); printk(KERN_ERR "%s: failed to create bitmap (%d)\n", mdname(mddev), err); + } else + mddev->bitmap = bitmap; + } if (err) { mddev_detach(mddev); @@ -5232,6 +5302,8 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5250,6 +5322,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } + if (mddev_is_clustered(mddev)) + 
md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -5636,6 +5710,8 @@ static int get_array_info(struct mddev *mddev, void __user *arg) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) info.state |= (1<<MD_SB_BITMAP_PRESENT); + if (mddev_is_clustered(mddev)) + info.state |= (1<<MD_SB_CLUSTERED); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; @@ -5691,7 +5767,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) return -EFAULT; rcu_read_lock(); - rdev = find_rdev_nr_rcu(mddev, info.number); + rdev = md_find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5724,6 +5800,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); + if (mddev_is_clustered(mddev) && + !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { + pr_err("%s: Cannot add to clustered mddev.\n", + mdname(mddev)); + return -EINVAL; + } + if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) return -EOVERFLOW; @@ -5810,31 +5893,38 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); + /* + * check whether the device shows up in other nodes + */ + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) { + /* Through --cluster-confirm */ + set_bit(Candidate, &rdev->flags); + err = md_cluster_ops->new_disk_ack(mddev, true); + if (err) { + export_rdev(rdev); + return err; + } + } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + /* --add initiated by this node */ + err = md_cluster_ops->add_new_disk_start(mddev, rdev); + if (err) { + md_cluster_ops->add_new_disk_finish(mddev); + export_rdev(rdev); + return err; + } + } + } + rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); - if (!err && !mddev->pers->hot_remove_disk) { - /* If there is hot_add_disk but no hot_remove_disk - * then added disks for geometry changes, - * and should be added immediately. - */ - super_types[mddev->major_version]. 
- validate_super(mddev, rdev); - err = mddev->pers->hot_add_disk(mddev, rdev); - if (err) - unbind_rdev_from_array(rdev); - } if (err) export_rdev(rdev); else - sysfs_notify_dirent_safe(rdev->sysfs_state); - - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (mddev->degraded) - set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!err) - md_new_event(mddev); - md_wakeup_thread(mddev->thread); + err = add_bound_rdev(rdev); + if (mddev_is_clustered(mddev) && + (info->state & (1 << MD_DISK_CLUSTER_ADD))) + md_cluster_ops->add_new_disk_finish(mddev); return err; } @@ -5895,18 +5985,29 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (!rdev) return -ENXIO; + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); + clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); if (rdev->raid_disk >= 0) goto busy; - kick_rdev_from_array(rdev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->remove_disk(mddev, rdev); + + md_kick_rdev_from_array(rdev); md_update_sb(mddev, 1); md_new_event(mddev); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); + return 0; busy: + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_cancel(mddev); printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -5956,12 +6057,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) err = -EINVAL; goto abort_export; } + + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_export; + goto abort_clustered; /* * The rest should better be atomic, we can have disk failures @@ -5972,6 +6076,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. 
@@ -5981,6 +6087,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; +abort_clustered: + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6038,9 +6147,14 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (mddev->pers) { mddev->pers->quiesce(mddev, 1); if (fd >= 0) { - err = bitmap_create(mddev); - if (!err) + struct bitmap *bitmap; + + bitmap = bitmap_create(mddev, -1); + if (!IS_ERR(bitmap)) { + mddev->bitmap = bitmap; err = bitmap_load(mddev); + } else + err = PTR_ERR(bitmap); } if (fd < 0 || err) { bitmap_destroy(mddev); @@ -6293,6 +6407,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6300,33 +6416,49 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { - if (mddev->pers->quiesce == NULL || mddev->thread == NULL) - return -EINVAL; - if (mddev->recovery || mddev->sync_thread) - return -EBUSY; + if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { + rv = -EINVAL; + goto err; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto err; + } if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { + struct bitmap *bitmap; /* add the bitmap */ - if (mddev->bitmap) - return -EEXIST; - if (mddev->bitmap_info.default_offset == 0) - return -EINVAL; + if (mddev->bitmap) { + rv = -EEXIST; + goto err; + } + if (mddev->bitmap_info.default_offset == 0) { + rv = -EINVAL; + goto err; + } mddev->bitmap_info.offset = mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; mddev->pers->quiesce(mddev, 1); - rv = bitmap_create(mddev); - if (!rv) + bitmap = bitmap_create(mddev, -1); + if (!IS_ERR(bitmap)) { + mddev->bitmap = bitmap; rv = bitmap_load(mddev); + } else + rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); } else { /* remove the bitmap */ - if (!mddev->bitmap) - return -ENOENT; - if (mddev->bitmap->storage.file) - return -EINVAL; + if (!mddev->bitmap) { + rv = -ENOENT; + goto err; + } + if (mddev->bitmap->storage.file) { + rv = -EINVAL; + goto err; + } mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -6334,6 +6466,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); + return rv; +err: + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -6393,6 +6531,7 @@ static inline bool md_ioctl_valid(unsigned int cmd) case SET_DISK_FAULTY: case STOP_ARRAY: case STOP_ARRAY_RO: + case CLUSTERED_DISK_NACK: return true; default: return false; @@ -6665,6 +6804,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto unlock; } + case CLUSTERED_DISK_NACK: + if (mddev_is_clustered(mddev)) + md_cluster_ops->new_disk_ack(mddev, false); + else + err = -EINVAL; + goto unlock; + case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto unlock; @@ -7238,6 +7384,55 @@ int unregister_md_personality(struct md_personality *p) } EXPORT_SYMBOL(unregister_md_personality); +int 
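+/*
+ * Editor's note — how the registration below is consumed (annotation, not
+ * part of the original patch): md-cluster.c registers its ops table from
+ * cluster_init(), and the md core binds to the module lazily. A sketch of
+ * the consumer side, using only calls added in this patch:
+ *
+ *	err = md_setup_cluster(mddev, mddev->bitmap_info.nodes);
+ *	if (err)
+ *		return err;		(request_module()/join() failed)
+ *	...
+ *	md_cluster_stop(mddev);		(leave() + module_put())
+ */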
register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) +{ + if (md_cluster_ops != NULL) + return -EALREADY; + spin_lock(&pers_lock); + md_cluster_ops = ops; + md_cluster_mod = module; + spin_unlock(&pers_lock); + return 0; +} +EXPORT_SYMBOL(register_md_cluster_operations); + +int unregister_md_cluster_operations(void) +{ + spin_lock(&pers_lock); + md_cluster_ops = NULL; + spin_unlock(&pers_lock); + return 0; +} +EXPORT_SYMBOL(unregister_md_cluster_operations); + +int md_setup_cluster(struct mddev *mddev, int nodes) +{ + int err; + + err = request_module("md-cluster"); + if (err) { + pr_err("md-cluster module not found.\n"); + return err; + } + + spin_lock(&pers_lock); + if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + spin_unlock(&pers_lock); + return -ENOENT; + } + spin_unlock(&pers_lock); + + return md_cluster_ops->join(mddev, nodes); +} + +void md_cluster_stop(struct mddev *mddev) +{ + if (!md_cluster_ops) + return; + md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); +} + static int is_mddev_idle(struct mddev *mddev, int init) { struct md_rdev *rdev; @@ -7375,7 +7570,11 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -7576,6 +7775,9 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_start(mddev, j, max_sectors); + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7618,8 +7820,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; - sectors = mddev->pers->sync_request(mddev, j, &skipped, - currspeed < speed_min(mddev)); + sectors = mddev->pers->sync_request(mddev, j, &skipped); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -7636,6 +7837,8 @@ void md_do_sync(struct md_thread *thread) j += sectors; if (j > 2) mddev->curr_resync = j; + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7677,11 +7880,18 @@ void md_do_sync(struct md_thread *thread) /((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > speed_min(mddev)) { - if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev, 0)) { + if (currspeed > speed_max(mddev)) { msleep(500); goto repeat; } + if (!is_mddev_idle(mddev, 0)) { + /* + * Give other IO more of a chance. + * The faster the devices, the less we wait. 
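+				 * (Editor's note: waiting for recovery_active
+				 * to drain replaces the old unconditional
+				 * msleep(500)/goto repeat cycle — the wait ends
+				 * as soon as all in-flight resync IO completes,
+				 * so fast devices resume almost immediately
+				 * while loaded ones back off for longer.)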
+ */ + wait_event(mddev->recovery_wait, + !atomic_read(&mddev->recovery_active)); + } } } printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, @@ -7694,7 +7904,10 @@ void md_do_sync(struct md_thread *thread) wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ - mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); + mddev->pers->sync_request(mddev, max_sectors, &skipped); + + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_finish(mddev); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { @@ -7925,8 +8138,13 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) + if (mddev->flags & MD_UPDATE_SB_FLAGS) { + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); + } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8024,6 +8242,8 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8036,6 +8256,8 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8656,6 +8878,28 @@ err_wq: return ret; } +void md_reload_sb(struct mddev *mddev) +{ + struct md_rdev *rdev, *tmp; + + rdev_for_each_safe(rdev, tmp, mddev) { + rdev->sb_loaded = 0; + ClearPageUptodate(rdev->sb_page); + } + mddev->raid_disks = 0; + analyze_sbs(mddev); + rdev_for_each_safe(rdev, tmp, mddev) { + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + /* since we don't write to faulty devices, we figure out if the + * disk is faulty by comparing events + */ + if (mddev->events > sb->events) + set_bit(Faulty, &rdev->flags); + } + +} +EXPORT_SYMBOL(md_reload_sb); + #ifndef MODULE /* diff --git a/drivers/md/md.h b/drivers/md/md.h index 318ca8fd430f..4046a6c6f223 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -23,6 +23,7 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include "md-cluster.h" #define MaxSector (~(sector_t)0) @@ -170,6 +171,10 @@ enum flag_bits { * a want_replacement device with same * raid_disk number. */ + Candidate, /* For clustered environments only: + * This device is seen locally but not + * by the whole cluster + */ }; #define BB_LEN_MASK (0x00000000000001FFULL) @@ -202,6 +207,8 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern void md_ack_all_badblocks(struct badblocks *bb); +struct md_cluster_info; + struct mddev { void *private; struct md_personality *pers; @@ -430,6 +437,8 @@ struct mddev { unsigned long daemon_sleep; /* how many jiffies between updates? 
*/ unsigned long max_write_behind; /* write-behind mode */ int external; + int nodes; /* Maximum number of nodes in the cluster */ + char cluster_name[64]; /* Name of the cluster */ } bitmap_info; atomic_t max_corr_read_errors; /* max read retries */ @@ -448,6 +457,7 @@ struct mddev { struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); + struct md_cluster_info *cluster_info; }; static inline int __must_check mddev_lock(struct mddev *mddev) @@ -496,7 +506,7 @@ struct md_personality int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); int (*spare_active) (struct mddev *mddev); - sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); + sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped); int (*resize) (struct mddev *mddev, sector_t sectors); sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); int (*check_reshape) (struct mddev *mddev); @@ -608,6 +618,11 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); +extern int register_md_cluster_operations(struct md_cluster_operations *ops, + struct module *module); +extern int unregister_md_cluster_operations(void); +extern int md_setup_cluster(struct mddev *mddev, int nodes); +extern void md_cluster_stop(struct mddev *mddev); extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, @@ -654,6 +669,10 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); +extern void md_reload_sb(struct mddev *mddev); +extern void md_update_sb(struct mddev *mddev, int force); +extern void md_kick_rdev_from_array(struct md_rdev * rdev); +struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); static inline int mddev_check_plugged(struct mddev *mddev) { return !!blk_check_plugged(md_unplug, mddev, @@ -669,4 +688,9 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) } } +extern struct md_cluster_operations *md_cluster_ops; +static inline int mddev_is_clustered(struct mddev *mddev) +{ + return mddev->cluster_info && mddev->bitmap_info.nodes > 1; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 3b5d7f704aa3..2cb59a641cd2 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -271,14 +271,16 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) goto abort; } - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - - if (!discard_supported) - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + if (mddev->queue) { + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + } pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -429,9 +431,12 @@ static int raid0_run(struct mddev *mddev) } 
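 	/*
 	 * Editor's note (annotation, not part of the original patch): the
 	 * mddev->queue checks introduced around raid0's queue tuning guard
 	 * the case where the personality is driven through dm-raid — there
 	 * the mddev carries no request queue of its own, so the
 	 * blk_queue_*() calls must be skipped.
 	 */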
if (md_check_no_bitmap(mddev)) return -EINVAL; - blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + + if (mddev->queue) { + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + } /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { @@ -448,16 +453,17 @@ static int raid0_run(struct mddev *mddev) printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", mdname(mddev), (unsigned long long)mddev->array_sectors); - /* calculate the max read-ahead size. - * For read-ahead of large files to be effective, we need to - * readahead at least twice a whole stripe. i.e. number of devices - * multiplied by chunk size times 2. - * If an individual device has an ra_pages greater than the - * chunk size, then we will not drive that device as hard as it - * wants. We consider this a configuration error: a larger - * chunksize should be used in that case. - */ - { + + if (mddev->queue) { + /* calculate the max read-ahead size. + * For read-ahead of large files to be effective, we need to + * readahead at least twice a whole stripe. i.e. number of devices + * multiplied by chunk size times 2. + * If an individual device has an ra_pages greater than the + * chunk size, then we will not drive that device as hard as it + * wants. We consider this a configuration error: a larger + * chunksize should be used in that case. + */ int stripe = mddev->raid_disks * (mddev->chunk_sectors << 9) / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d34e238afa54..9157a29c8dbf 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -539,7 +539,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect has_nonrot_disk = 0; choose_next_idle = 0; - choose_first = (conf->mddev->recovery_cp < this_sector + sectors); + if ((conf->mddev->recovery_cp < this_sector + sectors) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, this_sector, + this_sector + sectors))) + choose_first = 1; + else + choose_first = 0; for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { sector_t dist; @@ -1102,8 +1108,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ if (bio_data_dir(bio) == WRITE && - bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_iter.bi_sector < mddev->suspend_hi) { + ((bio_end_sector(bio) > mddev->suspend_lo && + bio->bi_iter.bi_sector < mddev->suspend_hi) || + (mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) { /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. 
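 	 * (Editor's note: with the md_cluster_ops->area_resyncing() test
 	 * added above, a write now also blocks while its range is being
 	 * resynced by another node, not just while it overlaps the local
 	 * suspend_lo/suspend_hi window.)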
@@ -1114,7 +1122,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi) + bio->bi_iter.bi_sector >= mddev->suspend_hi || + (mddev_is_clustered(mddev) && + !md_cluster_ops->area_resyncing(mddev, + bio->bi_iter.bi_sector, bio_end_sector(bio)))) break; schedule(); } @@ -1561,6 +1572,7 @@ static int raid1_spare_active(struct mddev *mddev) struct md_rdev *rdev = conf->mirrors[i].rdev; struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; if (repl + && !test_bit(Candidate, &repl->flags) && repl->recovery_offset == MaxSector && !test_bit(Faulty, &repl->flags) && !test_and_set_bit(In_sync, &repl->flags)) { @@ -2468,7 +2480,7 @@ static int init_resync(struct r1conf *conf) * that can be installed to exclude normal IO requests. */ -static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) +static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; @@ -2521,13 +2533,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp *skipped = 1; return sync_blocks; } - /* - * If there is non-resync activity waiting for a turn, - * and resync is going fast enough, - * then let it though before starting on this new sync request. - */ - if (!go_faster && conf->nr_waiting) - msleep_interruptible(1000); bitmap_cond_end_sync(mddev->bitmap, sector_nr); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a7196c49d15d..e793ab6b3570 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2889,7 +2889,7 @@ static int init_resync(struct r10conf *conf) */ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, - int *skipped, int go_faster) + int *skipped) { struct r10conf *conf = mddev->private; struct r10bio *r10_bio; @@ -2994,12 +2994,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (conf->geo.near_copies < conf->geo.raid_disks && max_sector > (sector_nr | chunk_mask)) max_sector = (sector_nr | chunk_mask) + 1; - /* - * If there is non-resync activity waiting for us then - * put in a delay to throttle resync. - */ - if (!go_faster && conf->nr_waiting) - msleep_interruptible(1000); /* Again, very different code for resync and recovery. 
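 	 * (Editor's note: the msleep_interruptible(1000) throttle deleted
 	 * just above — like its raid1 counterpart — is superseded by the
 	 * wait on recovery_active that md_do_sync() now performs whenever
 	 * the array is not idle, which is why the go_faster argument could
 	 * be dropped from sync_request().)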
* Both must result in an r10bio with a list of bios that diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cd2f96b2c572..77dfd720aaa0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -54,6 +54,7 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/nodemask.h> +#include <linux/flex_array.h> #include <trace/events/block.h> #include "md.h" @@ -496,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh) } } -static int grow_buffers(struct stripe_head *sh) +static int grow_buffers(struct stripe_head *sh, gfp_t gfp) { int i; int num = sh->raid_conf->pool_size; @@ -504,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh) for (i = 0; i < num; i++) { struct page *page; - if (!(page = alloc_page(GFP_KERNEL))) { + if (!(page = alloc_page(gfp))) { return 1; } sh->dev[i].page = page; @@ -525,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); BUG_ON(stripe_operations_active(sh)); + BUG_ON(sh->batch_head); pr_debug("init_stripe called, stripe %llu\n", (unsigned long long)sector); @@ -552,8 +554,10 @@ retry: } if (read_seqcount_retry(&conf->gen_lock, seq)) goto retry; + sh->overwrite_disks = 0; insert_hash(conf, sh); sh->cpu = smp_processor_id(); + set_bit(STRIPE_BATCH_READY, &sh->state); } static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, @@ -668,20 +672,28 @@ get_active_stripe(struct r5conf *conf, sector_t sector, *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { - if (!conf->inactive_blocked) + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { sh = get_free_stripe(conf, hash); + if (!sh && llist_empty(&conf->released_stripes) && + !test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, + &conf->cache_state); + } if (noblock && sh == NULL) break; if (!sh) { - conf->inactive_blocked = 1; + set_bit(R5_INACTIVE_BLOCKED, + &conf->cache_state); wait_event_lock_irq( conf->wait_for_stripe, !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes * 3 / 4) - || !conf->inactive_blocked), + || !test_bit(R5_INACTIVE_BLOCKED, + &conf->cache_state)), *(conf->hash_locks + hash)); - conf->inactive_blocked = 0; + clear_bit(R5_INACTIVE_BLOCKED, + &conf->cache_state); } else { init_stripe(sh, sector, previous); atomic_inc(&sh->count); @@ -708,6 +720,130 @@ get_active_stripe(struct r5conf *conf, sector_t sector, return sh; } +static bool is_full_stripe_write(struct stripe_head *sh) +{ + BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); + return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); +} + +static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) +{ + local_irq_disable(); + if (sh1 > sh2) { + spin_lock(&sh2->stripe_lock); + spin_lock_nested(&sh1->stripe_lock, 1); + } else { + spin_lock(&sh1->stripe_lock); + spin_lock_nested(&sh2->stripe_lock, 1); + } +} + +static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) +{ + spin_unlock(&sh1->stripe_lock); + spin_unlock(&sh2->stripe_lock); + local_irq_enable(); +} + +/* Only freshly new full stripe normal write stripe can be added to a batch list */ +static bool stripe_can_batch(struct stripe_head *sh) +{ + return test_bit(STRIPE_BATCH_READY, &sh->state) && + is_full_stripe_write(sh); +} + +/* we only do back search */ +static void stripe_add_to_batch_list(struct 
r5conf *conf, struct stripe_head *sh) +{ + struct stripe_head *head; + sector_t head_sector, tmp_sec; + int hash; + int dd_idx; + + if (!stripe_can_batch(sh)) + return; + /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ + tmp_sec = sh->sector; + if (!sector_div(tmp_sec, conf->chunk_sectors)) + return; + head_sector = sh->sector - STRIPE_SECTORS; + + hash = stripe_hash_locks_hash(head_sector); + spin_lock_irq(conf->hash_locks + hash); + head = __find_stripe(conf, head_sector, conf->generation); + if (head && !atomic_inc_not_zero(&head->count)) { + spin_lock(&conf->device_lock); + if (!atomic_read(&head->count)) { + if (!test_bit(STRIPE_HANDLE, &head->state)) + atomic_inc(&conf->active_stripes); + BUG_ON(list_empty(&head->lru) && + !test_bit(STRIPE_EXPANDING, &head->state)); + list_del_init(&head->lru); + if (head->group) { + head->group->stripes_cnt--; + head->group = NULL; + } + } + atomic_inc(&head->count); + spin_unlock(&conf->device_lock); + } + spin_unlock_irq(conf->hash_locks + hash); + + if (!head) + return; + if (!stripe_can_batch(head)) + goto out; + + lock_two_stripes(head, sh); + /* clear_batch_ready clear the flag */ + if (!stripe_can_batch(head) || !stripe_can_batch(sh)) + goto unlock_out; + + if (sh->batch_head) + goto unlock_out; + + dd_idx = 0; + while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + dd_idx++; + if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw) + goto unlock_out; + + if (head->batch_head) { + spin_lock(&head->batch_head->batch_lock); + /* This batch list is already running */ + if (!stripe_can_batch(head)) { + spin_unlock(&head->batch_head->batch_lock); + goto unlock_out; + } + + /* + * at this point, head's BATCH_READY could be cleared, but we + * can still add the stripe to batch list + */ + list_add(&sh->batch_list, &head->batch_list); + spin_unlock(&head->batch_head->batch_lock); + + sh->batch_head = head->batch_head; + } else { + head->batch_head = head; + sh->batch_head = head->batch_head; + spin_lock(&head->batch_lock); + list_add_tail(&sh->batch_list, &head->batch_list); + spin_unlock(&head->batch_lock); + } + + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + + atomic_inc(&sh->count); +unlock_out: + unlock_two_stripes(head, sh); +out: + release_stripe(head); +} + /* Determine if 'data_offset' or 'new_data_offset' should be used * in this stripe_head. 
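lock_two_stripes() in the hunk above takes the two stripe_locks in address order, tagging the second acquisition with spin_lock_nested() so lockdep accepts the double hold. The same ABBA-avoidance idiom, reduced to a compilable pthread sketch:

#include <pthread.h>
#include <stdio.h>

struct stripe {
	pthread_mutex_t lock;
	long sector;
};

/* Lock two stripes in a global order (by address) so that two threads
 * locking the same pair with swapped arguments cannot deadlock. */
static void lock_two(struct stripe *a, struct stripe *b)
{
	if (a > b) {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	} else {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	}
}

static void unlock_two(struct stripe *a, struct stripe *b)
{
	pthread_mutex_unlock(&a->lock);
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct stripe s1 = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct stripe s2 = { PTHREAD_MUTEX_INITIALIZER, 8 };

	lock_two(&s1, &s2);	/* same order lock_two(&s2, &s1) would take */
	unlock_two(&s1, &s2);
	printf("ok\n");
	return 0;
}

Whichever argument order callers use, the lower-addressed lock always goes first, so the head stripe and a batch candidate can be locked together from either side.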
*/ @@ -738,6 +874,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { struct r5conf *conf = sh->raid_conf; int i, disks = sh->disks; + struct stripe_head *head_sh = sh; might_sleep(); @@ -746,6 +883,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) int replace_only = 0; struct bio *bi, *rbi; struct md_rdev *rdev, *rrdev = NULL; + + sh = head_sh; if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) rw = WRITE_FUA; @@ -764,6 +903,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) rw |= REQ_SYNC; +again: bi = &sh->dev[i].req; rbi = &sh->dev[i].rreq; /* For writing to replacement */ @@ -782,7 +922,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) /* We raced and saw duplicates */ rrdev = NULL; } else { - if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) + if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) rdev = rrdev; rrdev = NULL; } @@ -853,13 +993,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); + if (sh != head_sh) + atomic_inc(&head_sh->count); if (use_new_offset(conf, sh)) bi->bi_iter.bi_sector = (sh->sector + rdev->new_data_offset); else bi->bi_iter.bi_sector = (sh->sector + rdev->data_offset); - if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) bi->bi_rw |= REQ_NOMERGE; if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) @@ -903,6 +1045,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) __func__, (unsigned long long)sh->sector, rbi->bi_rw, i); atomic_inc(&sh->count); + if (sh != head_sh) + atomic_inc(&head_sh->count); if (use_new_offset(conf, sh)) rbi->bi_iter.bi_sector = (sh->sector + rrdev->new_data_offset); @@ -934,8 +1078,18 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) pr_debug("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); + if (sh->batch_head) + set_bit(STRIPE_BATCH_ERR, + &sh->batch_head->state); set_bit(STRIPE_HANDLE, &sh->state); } + + if (!head_sh->batch_head) + continue; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + if (sh != head_sh) + goto again; } } @@ -1051,6 +1205,7 @@ static void ops_run_biofill(struct stripe_head *sh) struct async_submit_ctl submit; int i; + BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1109,16 +1264,28 @@ static void ops_complete_compute(void *stripe_head_ref) /* return a pointer to the address conversion region of the scribble buffer */ static addr_conv_t *to_addr_conv(struct stripe_head *sh, - struct raid5_percpu *percpu) + struct raid5_percpu *percpu, int i) { - return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); + void *addr; + + addr = flex_array_get(percpu->scribble, i); + return addr + sizeof(struct page *) * (sh->disks + 2); +} + +/* return a pointer to the address conversion region of the scribble buffer */ +static struct page **to_addr_page(struct raid5_percpu *percpu, int i) +{ + void *addr; + + addr = flex_array_get(percpu->scribble, i); + return addr; } static struct dma_async_tx_descriptor * ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) { int disks = 
sh->disks; - struct page **xor_srcs = percpu->scribble; + struct page **xor_srcs = to_addr_page(percpu, 0); int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; struct page *xor_dest = tgt->page; @@ -1127,6 +1294,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) struct async_submit_ctl submit; int i; + BUG_ON(sh->batch_head); + pr_debug("%s: stripe %llu block: %d\n", __func__, (unsigned long long)sh->sector, target); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); @@ -1138,7 +1307,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, - ops_complete_compute, sh, to_addr_conv(sh, percpu)); + ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); else @@ -1156,7 +1325,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) * destination buffer is recorded in srcs[count] and the Q destination * is recorded in srcs[count+1]]. */ -static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +static int set_syndrome_sources(struct page **srcs, + struct stripe_head *sh, + int srctype) { int disks = sh->disks; int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); @@ -1171,8 +1342,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) i = d0_idx; do { int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + struct r5dev *dev = &sh->dev[i]; - srcs[slot] = sh->dev[i].page; + if (i == sh->qd_idx || i == sh->pd_idx || + (srctype == SYNDROME_SRC_ALL) || + (srctype == SYNDROME_SRC_WANT_DRAIN && + test_bit(R5_Wantdrain, &dev->flags)) || + (srctype == SYNDROME_SRC_WRITTEN && + dev->written)) + srcs[slot] = sh->dev[i].page; i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1183,7 +1361,7 @@ static struct dma_async_tx_descriptor * ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) { int disks = sh->disks; - struct page **blocks = percpu->scribble; + struct page **blocks = to_addr_page(percpu, 0); int target; int qd_idx = sh->qd_idx; struct dma_async_tx_descriptor *tx; @@ -1193,6 +1371,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) int i; int count; + BUG_ON(sh->batch_head); if (sh->ops.target < 0) target = sh->ops.target2; else if (sh->ops.target2 < 0) @@ -1211,12 +1390,12 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) atomic_inc(&sh->count); if (target == qd_idx) { - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); blocks[count] = NULL; /* regenerating p is not necessary */ BUG_ON(blocks[count+1] != dest); /* q should already be set */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, - to_addr_conv(sh, percpu)); + to_addr_conv(sh, percpu, 0)); tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); } else { /* Compute any data- or p-drive using XOR */ @@ -1229,7 +1408,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, - to_addr_conv(sh, percpu)); + to_addr_conv(sh, percpu, 0)); tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); } @@ -1248,9 +1427,10 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) struct r5dev *tgt = &sh->dev[target]; struct r5dev *tgt2 = 
&sh->dev[target2]; struct dma_async_tx_descriptor *tx; - struct page **blocks = percpu->scribble; + struct page **blocks = to_addr_page(percpu, 0); struct async_submit_ctl submit; + BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu block1: %d block2: %d\n", __func__, (unsigned long long)sh->sector, target, target2); BUG_ON(target < 0 || target2 < 0); @@ -1290,7 +1470,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) /* Missing P+Q, just recompute */ init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, - to_addr_conv(sh, percpu)); + to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, syndrome_disks+2, STRIPE_SIZE, &submit); } else { @@ -1314,21 +1494,21 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, - to_addr_conv(sh, percpu)); + to_addr_conv(sh, percpu, 0)); tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); - count = set_syndrome_sources(blocks, sh); + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); init_async_submit(&submit, ASYNC_TX_FENCE, tx, ops_complete_compute, sh, - to_addr_conv(sh, percpu)); + to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); } } else { init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, - to_addr_conv(sh, percpu)); + to_addr_conv(sh, percpu, 0)); if (failb == syndrome_disks) { /* We're missing D+P. */ return async_raid6_datap_recov(syndrome_disks+2, @@ -1352,17 +1532,18 @@ static void ops_complete_prexor(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, - struct dma_async_tx_descriptor *tx) +ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; + struct page **xor_srcs = to_addr_page(percpu, 0); int count = 0, pd_idx = sh->pd_idx, i; struct async_submit_ctl submit; /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1374,31 +1555,56 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, } init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, - ops_complete_prexor, sh, to_addr_conv(sh, percpu)); + ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); return tx; } static struct dma_async_tx_descriptor * +ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct page **blocks = to_addr_page(percpu, 0); + int count; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + + return tx; +} + +static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; int i; + struct stripe_head *head_sh = sh; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { - 
struct r5dev *dev = &sh->dev[i]; + struct r5dev *dev; struct bio *chosen; - if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { + sh = head_sh; + if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { struct bio *wbi; +again: + dev = &sh->dev[i]; spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; + sh->overwrite_disks = 0; BUG_ON(dev->written); wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); @@ -1423,6 +1629,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } wbi = r5_next_bio(wbi, dev->sector); } + + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, + batch_list); + if (sh == head_sh) + continue; + goto again; + } } } @@ -1478,12 +1693,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - struct page **xor_srcs = percpu->scribble; + struct page **xor_srcs; struct async_submit_ctl submit; - int count = 0, pd_idx = sh->pd_idx, i; + int count, pd_idx = sh->pd_idx, i; struct page *xor_dest; int prexor = 0; unsigned long flags; + int j = 0; + struct stripe_head *head_sh = sh; + int last_stripe; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1500,15 +1718,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, ops_complete_reconstruct(sh); return; } +again: + count = 0; + xor_srcs = to_addr_page(percpu, j); /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written) + if (head_sh->dev[i].written) xor_srcs[count++] = dev->page; } } else { @@ -1525,17 +1746,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * for the synchronous xor case */ - flags = ASYNC_TX_ACK | - (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); - - atomic_inc(&sh->count); + last_stripe = !head_sh->batch_head || + list_first_entry(&sh->batch_list, + struct stripe_head, batch_list) == head_sh; + if (last_stripe) { + flags = ASYNC_TX_ACK | + (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); + + atomic_inc(&head_sh->count); + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, + to_addr_conv(sh, percpu, j)); + } else { + flags = prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; + init_async_submit(&submit, flags, tx, NULL, NULL, + to_addr_conv(sh, percpu, j)); + } - init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, - to_addr_conv(sh, percpu)); if (unlikely(count == 1)) tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + if (!last_stripe) { + j++; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + goto again; + } } static void @@ -1543,8 +1779,12 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { struct async_submit_ctl submit; - struct page **blocks = percpu->scribble; - int count, i; + struct page **blocks; + int count, i, j = 0; + struct stripe_head *head_sh = sh; + int last_stripe; + int synflags; + unsigned long txflags; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1562,13 +1802,36 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, return; } - count = set_syndrome_sources(blocks, sh); +again: + blocks = to_addr_page(percpu, j); - atomic_inc(&sh->count); + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + synflags = SYNDROME_SRC_WRITTEN; + txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; + } else { + synflags = SYNDROME_SRC_ALL; + txflags = ASYNC_TX_ACK; + } + + count = set_syndrome_sources(blocks, sh, synflags); + last_stripe = !head_sh->batch_head || + list_first_entry(&sh->batch_list, + struct stripe_head, batch_list) == head_sh; - init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, - sh, to_addr_conv(sh, percpu)); + if (last_stripe) { + atomic_inc(&head_sh->count); + init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, + head_sh, to_addr_conv(sh, percpu, j)); + } else + init_async_submit(&submit, 0, tx, NULL, NULL, + to_addr_conv(sh, percpu, j)); async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + if (!last_stripe) { + j++; + sh = list_first_entry(&sh->batch_list, struct stripe_head, + batch_list); + goto again; + } } static void ops_complete_check(void *stripe_head_ref) @@ -1589,7 +1852,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; struct page *xor_dest; - struct page **xor_srcs = percpu->scribble; + struct page **xor_srcs = to_addr_page(percpu, 0); struct dma_async_tx_descriptor *tx; struct async_submit_ctl submit; int count; @@ -1598,6 +1861,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + BUG_ON(sh->batch_head); count = 0; xor_dest = sh->dev[pd_idx].page; xor_srcs[count++] = xor_dest; @@ -1608,7 +1872,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) } init_async_submit(&submit, 0, NULL, NULL, NULL, - to_addr_conv(sh, percpu)); + to_addr_conv(sh, percpu, 0)); tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, &submit); @@ -1619,20 +1883,21 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) { - struct page **srcs = percpu->scribble; + struct page **srcs = to_addr_page(percpu, 0); struct async_submit_ctl submit; int count; pr_debug("%s: stripe %llu checkp: %d\n", __func__, (unsigned long 
long)sh->sector, checkp); - count = set_syndrome_sources(srcs, sh); + BUG_ON(sh->batch_head); + count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); if (!checkp) srcs[count] = NULL; atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, - sh, to_addr_conv(sh, percpu)); + sh, to_addr_conv(sh, percpu, 0)); async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, &sh->ops.zero_sum_result, percpu->spare_page, &submit); } @@ -1667,8 +1932,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) async_tx_ack(tx); } - if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, percpu, tx); + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { + if (level < 6) + tx = ops_run_prexor5(sh, percpu, tx); + else + tx = ops_run_prexor6(sh, percpu, tx); + } if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); @@ -1693,7 +1962,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) BUG(); } - if (overlap_clear) + if (overlap_clear && !sh->batch_head) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_Overlap, &dev->flags)) @@ -1702,10 +1971,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -static int grow_one_stripe(struct r5conf *conf, int hash) +static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); + sh = kmem_cache_zalloc(conf->slab_cache, gfp); if (!sh) return 0; @@ -1713,17 +1982,23 @@ static int grow_one_stripe(struct r5conf *conf, int hash) spin_lock_init(&sh->stripe_lock); - if (grow_buffers(sh)) { + if (grow_buffers(sh, gfp)) { shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); return 0; } - sh->hash_lock_index = hash; + sh->hash_lock_index = + conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); INIT_LIST_HEAD(&sh->lru); + + spin_lock_init(&sh->batch_lock); + INIT_LIST_HEAD(&sh->batch_list); + sh->batch_head = NULL; release_stripe(sh); + conf->max_nr_stripes++; return 1; } @@ -1731,7 +2006,6 @@ static int grow_stripes(struct r5conf *conf, int num) { struct kmem_cache *sc; int devs = max(conf->raid_disks, conf->previous_raid_disks); - int hash; if (conf->mddev->gendisk) sprintf(conf->cache_name[0], @@ -1749,13 +2023,10 @@ static int grow_stripes(struct r5conf *conf, int num) return 1; conf->slab_cache = sc; conf->pool_size = devs; - hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; - while (num--) { - if (!grow_one_stripe(conf, hash)) + while (num--) + if (!grow_one_stripe(conf, GFP_KERNEL)) return 1; - conf->max_nr_stripes++; - hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; - } + return 0; } @@ -1772,13 +2043,21 @@ static int grow_stripes(struct r5conf *conf, int num) * calculate over all devices (not just the data blocks), using zeros in place * of the P and Q blocks. 
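The scribble conversion that follows keeps the old per-element layout: each flex_array element holds num+2 page pointers (data plus P and Q) followed by num+2 address-conversion slots, which is why to_addr_conv() offsets past the pointer array. A userspace model of that carving, with addr_conv_t reduced to a plain pointer for illustration:

#include <stdio.h>
#include <stdlib.h>

typedef void *addr_conv_t;	/* stand-in for the async_tx type */

struct page;			/* opaque, pointers only */

/* One scribble element: (num+2) page pointers then (num+2) conv slots. */
static size_t scribble_len(int num)
{
	return sizeof(struct page *) * (num + 2) +
	       sizeof(addr_conv_t) * (num + 2);
}

static struct page **to_addr_page(void *elem)
{
	return elem;
}

static addr_conv_t *to_addr_conv(void *elem, int num)
{
	return (addr_conv_t *)((char *)elem +
			       sizeof(struct page *) * (num + 2));
}

int main(void)
{
	int num = 8;			/* disks in the array */
	void *elem = malloc(scribble_len(num));

	if (!elem)
		return 1;
	printf("element %zu bytes, conv region at offset %zu\n",
	       scribble_len(num),
	       (size_t)((char *)to_addr_conv(elem, num) - (char *)elem));
	free(elem);
	return 0;
}

Moving from one kmalloc'd region to a preallocated flex_array gives each batched stripe its own element, indexed by j in the reconstruct paths above.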
*/ -static size_t scribble_len(int num) +static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) { + struct flex_array *ret; size_t len; len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); - - return len; + ret = flex_array_alloc(len, cnt, flags); + if (!ret) + return NULL; + /* always prealloc all elements, so no locking is required */ + if (flex_array_prealloc(ret, 0, cnt, flags)) { + flex_array_free(ret); + return NULL; + } + return ret; } static int resize_stripes(struct r5conf *conf, int newsize) @@ -1896,16 +2175,16 @@ static int resize_stripes(struct r5conf *conf, int newsize) err = -ENOMEM; get_online_cpus(); - conf->scribble_len = scribble_len(newsize); for_each_present_cpu(cpu) { struct raid5_percpu *percpu; - void *scribble; + struct flex_array *scribble; percpu = per_cpu_ptr(conf->percpu, cpu); - scribble = kmalloc(conf->scribble_len, GFP_NOIO); + scribble = scribble_alloc(newsize, conf->chunk_sectors / + STRIPE_SECTORS, GFP_NOIO); if (scribble) { - kfree(percpu->scribble); + flex_array_free(percpu->scribble); percpu->scribble = scribble; } else { err = -ENOMEM; @@ -1937,9 +2216,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) return err; } -static int drop_one_stripe(struct r5conf *conf, int hash) +static int drop_one_stripe(struct r5conf *conf) { struct stripe_head *sh; + int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; spin_lock_irq(conf->hash_locks + hash); sh = get_free_stripe(conf, hash); @@ -1950,15 +2230,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash) shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); + conf->max_nr_stripes--; return 1; } static void shrink_stripes(struct r5conf *conf) { - int hash; - for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) - while (drop_one_stripe(conf, hash)) - ; + while (conf->max_nr_stripes && + drop_one_stripe(conf)) + ; if (conf->slab_cache) kmem_cache_destroy(conf->slab_cache); @@ -2154,10 +2434,16 @@ static void raid5_end_write_request(struct bio *bi, int error) } rdev_dec_pending(rdev, conf->mddev); + if (sh->batch_head && !uptodate) + set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); + if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); + + if (sh->batch_head && sh != sh->batch_head) + release_stripe(sh->batch_head); } static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); @@ -2535,7 +2821,7 @@ static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { - int i, pd_idx = sh->pd_idx, disks = sh->disks; + int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; struct r5conf *conf = sh->raid_conf; int level = conf->level; @@ -2571,13 +2857,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&conf->pending_full_writes); } else { - BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + BUG_ON(level == 6 && + (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (i == pd_idx) + if (i == pd_idx || i == qd_idx) continue; if (dev->towrite && @@ -2624,7 +2912,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, * 
toread/towrite point to the first in a chain. * The bi_next chain must be in order. */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) +static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, + int forwrite, int previous) { struct bio **bip; struct r5conf *conf = sh->raid_conf; @@ -2643,6 +2932,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in * protect it. */ spin_lock_irq(&sh->stripe_lock); + /* Don't allow new IO added to stripes in batch list */ + if (sh->batch_head) + goto overlap; if (forwrite) { bip = &sh->dev[dd_idx].towrite; if (*bip == NULL) @@ -2657,6 +2949,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; + if (!forwrite || previous) + clear_bit(STRIPE_BATCH_READY, &sh->state); + BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); if (*bip) bi->bi_next = *bip; @@ -2674,7 +2969,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in sector = bio_end_sector(bi); } if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) - set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); + if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) + sh->overwrite_disks++; } pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", @@ -2688,6 +2984,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in sh->bm_seq = conf->seq_flush+1; set_bit(STRIPE_BIT_DELAY, &sh->state); } + + if (stripe_can_batch(sh)) + stripe_add_to_batch_list(conf, sh); return 1; overlap: @@ -2720,6 +3019,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio **return_bi) { int i; + BUG_ON(sh->batch_head); for (i = disks; i--; ) { struct bio *bi; int bitmap_end = 0; @@ -2746,6 +3046,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; + sh->overwrite_disks = 0; spin_unlock_irq(&sh->stripe_lock); if (bi) bitmap_end = 1; @@ -2834,6 +3135,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, int abort = 0; int i; + BUG_ON(sh->batch_head); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up(&conf->wait_for_overlap); @@ -3064,6 +3366,7 @@ static void handle_stripe_fill(struct stripe_head *sh, { int i; + BUG_ON(sh->batch_head); /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write @@ -3087,6 +3390,9 @@ static void handle_stripe_clean_event(struct r5conf *conf, int i; struct r5dev *dev; int discard_pending = 0; + struct stripe_head *head_sh = sh; + bool do_endio = false; + int wakeup_nr = 0; for (i = disks; i--; ) if (sh->dev[i].written) { @@ -3102,8 +3408,11 @@ static void handle_stripe_clean_event(struct r5conf *conf, clear_bit(R5_UPTODATE, &dev->flags); if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); - dev->page = dev->orig_page; } + do_endio = true; + +returnbi: + dev->page = dev->orig_page; wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < @@ -3120,6 +3429,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), 0); + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, + batch_list); + if 
(sh != head_sh) { + dev = &sh->dev[i]; + goto returnbi; + } + } + sh = head_sh; + dev = &sh->dev[i]; } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); @@ -3141,8 +3461,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, * will be reinitialized */ spin_lock_irq(&conf->device_lock); +unhash: remove_hash(sh); + if (head_sh->batch_head) { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, batch_list); + if (sh != head_sh) + goto unhash; + } spin_unlock_irq(&conf->device_lock); + sh = head_sh; + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) set_bit(STRIPE_HANDLE, &sh->state); @@ -3151,6 +3480,45 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); + + if (!head_sh->batch_head || !do_endio) + return; + for (i = 0; i < head_sh->disks; i++) { + if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) + wakeup_nr++; + } + while (!list_empty(&head_sh->batch_list)) { + int i; + sh = list_first_entry(&head_sh->batch_list, + struct stripe_head, batch_list); + list_del_init(&sh->batch_list); + + set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, + head_sh->state & ~((1 << STRIPE_ACTIVE) | + (1 << STRIPE_PREREAD_ACTIVE) | + STRIPE_EXPAND_SYNC_FLAG)); + sh->check_state = head_sh->check_state; + sh->reconstruct_state = head_sh->reconstruct_state; + for (i = 0; i < sh->disks; i++) { + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wakeup_nr++; + sh->dev[i].flags = head_sh->dev[i].flags; + } + + spin_lock_irq(&sh->stripe_lock); + sh->batch_head = NULL; + spin_unlock_irq(&sh->stripe_lock); + if (sh->state & STRIPE_EXPAND_SYNC_FLAG) + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + } + + spin_lock_irq(&head_sh->stripe_lock); + head_sh->batch_head = NULL; + spin_unlock_irq(&head_sh->stripe_lock); + wake_up_nr(&conf->wait_for_overlap, wakeup_nr); + if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG) + set_bit(STRIPE_HANDLE, &head_sh->state); } static void handle_stripe_dirtying(struct r5conf *conf, @@ -3161,28 +3529,27 @@ static void handle_stripe_dirtying(struct r5conf *conf, int rmw = 0, rcw = 0, i; sector_t recovery_cp = conf->mddev->recovery_cp; - /* RAID6 requires 'rcw' in current implementation. - * Otherwise, check whether resync is now happening or should start. + /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or * initial creation), so parity in some stripes might be inconsistent. * In this case, we need to always do reconstruct-write, to ensure * that in case of drive failure or read-error correction, we * generate correct data from the parity. 
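handle_stripe_dirtying() below now lets RAID-6 choose read-modify-write too: it tallies how many blocks each strategy would have to read (counting both pd_idx and qd_idx as parity) and takes the cheaper one, subject to rmw_level. A simplified, self-contained version of that cost count (one flag set per block; no locking or compute states):

#include <stdbool.h>
#include <stdio.h>

#define NDISKS 6	/* illustrative: 4 data + P + Q */

struct blk {
	bool towrite;	/* new data queued for this block */
	bool uptodate;	/* already in the stripe cache */
	bool is_parity;	/* P or Q */
};

/* Count the reads each strategy needs for one stripe. */
static void rmw_rcw_cost(const struct blk *b, int *rmw, int *rcw)
{
	*rmw = *rcw = 0;
	for (int i = 0; i < NDISKS; i++) {
		/* RMW reads old data for written blocks plus old parity. */
		if ((b[i].towrite || b[i].is_parity) && !b[i].uptodate)
			(*rmw)++;
		/* RCW reads every data block not being (over)written. */
		if (!b[i].is_parity && !b[i].towrite && !b[i].uptodate)
			(*rcw)++;
	}
}

int main(void)
{
	struct blk b[NDISKS] = {
		{ .towrite = true },	/* small write: one data block */
		{ 0 }, { 0 }, { 0 },
		{ .is_parity = true },	/* P */
		{ .is_parity = true },	/* Q */
	};
	int rmw, rcw;

	rmw_rcw_cost(b, &rmw, &rcw);
	printf("rmw=%d rcw=%d -> %s\n", rmw, rcw,
	       rmw < rcw ? "read-modify-write" : "reconstruct-write");
	return 0;
}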
*/ - if (conf->max_degraded == 2 || + if (conf->rmw_level == PARITY_DISABLE_RMW || (recovery_cp < MaxSector && sh->sector >= recovery_cp && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->max_degraded, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)recovery_cp, (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3192,7 +3559,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, rmw += 2*disks; /* cannot read it */ } /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { @@ -3205,7 +3573,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) { + if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ if (conf->mddev->queue) blk_add_trace_msg(conf->mddev->queue, @@ -3213,7 +3581,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && + if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && @@ -3232,7 +3600,7 @@ static void handle_stripe_dirtying(struct r5conf *conf, } } } - if (rcw <= rmw && rcw > 0) { + if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { /* want reconstruct write, but need to get some data */ int qread =0; rcw = 0; @@ -3290,6 +3658,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, { struct r5dev *dev = NULL; + BUG_ON(sh->batch_head); set_bit(STRIPE_HANDLE, &sh->state); switch (sh->check_state) { @@ -3380,6 +3749,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, int qd_idx = sh->qd_idx; struct r5dev *dev; + BUG_ON(sh->batch_head); set_bit(STRIPE_HANDLE, &sh->state); BUG_ON(s->failed > 2); @@ -3543,6 +3913,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) * copy some of them into a target stripe for expand. 
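When a batch is dissolved (handle_stripe_clean_event() above, check_break_stripe_batch_list() further down), each member stripe inherits the head's state word minus a few per-stripe bits via set_mask_bits(). A C11-atomics model of that helper's contract, new = (old & ~mask) | bits applied as one atomic update:

#include <stdatomic.h>
#include <stdio.h>

/* new = (old & ~mask) | bits, retried until the CAS wins -- the same
 * contract as the kernel's set_mask_bits(). */
static unsigned long set_mask_bits(_Atomic unsigned long *p,
				   unsigned long mask, unsigned long bits)
{
	unsigned long old = atomic_load(p), new;

	do {
		new = (old & ~mask) | bits;
	} while (!atomic_compare_exchange_weak(p, &old, new));
	return old;
}

int main(void)
{
	_Atomic unsigned long state = 0xf0f0;

	/* Keep only the bits covered by ~mask, merge in 0x0a00. */
	set_mask_bits(&state, ~0xfUL, 0x0a00);
	printf("0x%lx\n", atomic_load(&state));	/* 0xa00 */
	return 0;
}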
*/ struct dma_async_tx_descriptor *tx = NULL; + BUG_ON(sh->batch_head); clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) if (i != sh->pd_idx && i != sh->qd_idx) { @@ -3615,8 +3986,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) memset(s, 0, sizeof(*s)); - s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; + s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; s->failed_num[0] = -1; s->failed_num[1] = -1; @@ -3786,6 +4157,80 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) rcu_read_unlock(); } +static int clear_batch_ready(struct stripe_head *sh) +{ + struct stripe_head *tmp; + if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) + return 0; + spin_lock(&sh->stripe_lock); + if (!sh->batch_head) { + spin_unlock(&sh->stripe_lock); + return 0; + } + + /* + * this stripe could be added to a batch list before we check + * BATCH_READY, skips it + */ + if (sh->batch_head != sh) { + spin_unlock(&sh->stripe_lock); + return 1; + } + spin_lock(&sh->batch_lock); + list_for_each_entry(tmp, &sh->batch_list, batch_list) + clear_bit(STRIPE_BATCH_READY, &tmp->state); + spin_unlock(&sh->batch_lock); + spin_unlock(&sh->stripe_lock); + + /* + * BATCH_READY is cleared, no new stripes can be added. + * batch_list can be accessed without lock + */ + return 0; +} + +static void check_break_stripe_batch_list(struct stripe_head *sh) +{ + struct stripe_head *head_sh, *next; + int i; + + if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) + return; + + head_sh = sh; + do { + sh = list_first_entry(&sh->batch_list, + struct stripe_head, batch_list); + BUG_ON(sh == head_sh); + } while (!test_bit(STRIPE_DEGRADED, &sh->state)); + + while (sh != head_sh) { + next = list_first_entry(&sh->batch_list, + struct stripe_head, batch_list); + list_del_init(&sh->batch_list); + + set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG, + head_sh->state & ~((1 << STRIPE_ACTIVE) | + (1 << STRIPE_PREREAD_ACTIVE) | + (1 << STRIPE_DEGRADED) | + STRIPE_EXPAND_SYNC_FLAG)); + sh->check_state = head_sh->check_state; + sh->reconstruct_state = head_sh->reconstruct_state; + for (i = 0; i < sh->disks; i++) + sh->dev[i].flags = head_sh->dev[i].flags & + (~((1 << R5_WriteError) | (1 << R5_Overlap))); + + spin_lock_irq(&sh->stripe_lock); + sh->batch_head = NULL; + spin_unlock_irq(&sh->stripe_lock); + + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + + sh = next; + } +} + static void handle_stripe(struct stripe_head *sh) { struct stripe_head_state s; @@ -3803,7 +4248,14 @@ static void handle_stripe(struct stripe_head *sh) return; } - if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + if (clear_batch_ready(sh) ) { + clear_bit_unlock(STRIPE_ACTIVE, &sh->state); + return; + } + + check_break_stripe_batch_list(sh); + + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); /* Cannot process 'sync' concurrently with 'discard' */ if (!test_bit(STRIPE_DISCARD, &sh->state) && @@ -4158,7 +4610,7 @@ static int raid5_congested(struct mddev *mddev, int bits) * how busy the stripe_cache is */ - if (conf->inactive_blocked) + if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) return 1; if (conf->quiesce) return 1; @@ -4180,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev, unsigned int chunk_sectors = mddev->chunk_sectors; 
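clear_batch_ready() in the hunk above leads with test_and_clear_bit(STRIPE_BATCH_READY): exactly one caller sees the bit set and goes on to strip the flag from the whole batch under stripe_lock. A minimal model of that claim-by-clearing step using C11 atomic_fetch_and:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { BATCH_READY = 1u << 0, BATCH_ERR = 1u << 1 };

/* test_and_clear_bit() equivalent: returns the bit's old value. */
static bool test_and_clear(_Atomic unsigned *state, unsigned bit)
{
	return atomic_fetch_and(state, ~bit) & bit;
}

int main(void)
{
	_Atomic unsigned state = BATCH_READY;

	if (test_and_clear(&state, BATCH_READY))
		printf("first caller claims the batch\n");
	if (!test_and_clear(&state, BATCH_READY))
		printf("second caller sees it already claimed\n");
	return 0;
}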
unsigned int bio_sectors = bvm->bi_size >> 9; - if ((bvm->bi_rw & 1) == WRITE) - return biovec->bv_len; /* always allow writes to be mergeable */ + /* + * always allow writes to be mergeable, read as well if array + * is degraded as we'll go through stripe cache anyway. + */ + if ((bvm->bi_rw & 1) == WRITE || mddev->degraded) + return biovec->bv_len; if (mddev->new_chunk_sectors < mddev->chunk_sectors) chunk_sectors = mddev->new_chunk_sectors; @@ -4603,12 +5059,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } set_bit(STRIPE_DISCARD, &sh->state); finish_wait(&conf->wait_for_overlap, &w); + sh->overwrite_disks = 0; for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) continue; sh->dev[d].towrite = bi; set_bit(R5_OVERWRITE, &sh->dev[d].flags); raid5_inc_bi_active_stripes(bi); + sh->overwrite_disks++; } spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap) { @@ -4656,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) md_write_start(mddev, bi); - if (rw == READ && + /* + * If array is degraded, better not do chunk aligned read because + * later we might have to read it again in order to reconstruct + * data on failed drives. + */ + if (rw == READ && mddev->degraded == 0 && mddev->reshape_position == MaxSector && chunk_aligned_read(mddev,bi)) return; @@ -4772,7 +5235,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) } if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw)) { + !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or * add failed due to overlap. Flush everything * and wait a while @@ -4785,7 +5248,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) } set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - if ((bi->bi_rw & REQ_SYNC) && + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_rw & REQ_SYNC) && !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); release_stripe_plug(mddev, sh); @@ -5050,8 +5514,7 @@ ret: return reshape_sectors; } -/* FIXME go_faster isn't used */ -static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) +static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) { struct r5conf *conf = mddev->private; struct stripe_head *sh; @@ -5186,7 +5649,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) return handled; } - if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { + if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { release_stripe(sh); raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; @@ -5312,6 +5775,8 @@ static void raid5d(struct md_thread *thread) int batch_size, released; released = release_stripe_list(conf, conf->temp_inactive_list); + if (released) + clear_bit(R5_DID_ALLOC, &conf->cache_state); if ( !list_empty(&conf->bitmap_list)) { @@ -5350,6 +5815,13 @@ static void raid5d(struct md_thread *thread) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); + if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) { + grow_one_stripe(conf, __GFP_NOWARN); + /* Set flag even if allocation failed. 
This helps + * slow down allocation requests when mem is short + */ + set_bit(R5_DID_ALLOC, &conf->cache_state); + } async_tx_issue_pending_all(); blk_finish_plug(&plug); @@ -5365,7 +5837,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page) spin_lock(&mddev->lock); conf = mddev->private; if (conf) - ret = sprintf(page, "%d\n", conf->max_nr_stripes); + ret = sprintf(page, "%d\n", conf->min_nr_stripes); spin_unlock(&mddev->lock); return ret; } @@ -5375,30 +5847,24 @@ raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; int err; - int hash; if (size <= 16 || size > 32768) return -EINVAL; - hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; - while (size < conf->max_nr_stripes) { - if (drop_one_stripe(conf, hash)) - conf->max_nr_stripes--; - else - break; - hash--; - if (hash < 0) - hash = NR_STRIPE_HASH_LOCKS - 1; - } + + conf->min_nr_stripes = size; + while (size < conf->max_nr_stripes && + drop_one_stripe(conf)) + ; + + err = md_allow_write(mddev); if (err) return err; - hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; - while (size > conf->max_nr_stripes) { - if (grow_one_stripe(conf, hash)) - conf->max_nr_stripes++; - else break; - hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; - } + + while (size > conf->max_nr_stripes) + if (!grow_one_stripe(conf, GFP_KERNEL)) + break; + return 0; } EXPORT_SYMBOL(raid5_set_cache_size); @@ -5433,6 +5899,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, raid5_store_stripe_cache_size); static ssize_t +raid5_show_rmw_level(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->rmw_level); + else + return 0; +} + +static ssize_t +raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + + if (!conf) + return -ENODEV; + + if (len >= PAGE_SIZE) + return -EINVAL; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) + return -EINVAL; + + if (new != PARITY_DISABLE_RMW && + new != PARITY_ENABLE_RMW && + new != PARITY_PREFER_RMW) + return -EINVAL; + + conf->rmw_level = new; + return len; +} + +static struct md_sysfs_entry +raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, + raid5_show_rmw_level, + raid5_store_rmw_level); + + +static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) { struct r5conf *conf; @@ -5463,7 +5972,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) conf = mddev->private; if (!conf) err = -ENODEV; - else if (new > conf->max_nr_stripes) + else if (new > conf->min_nr_stripes) err = -EINVAL; else conf->bypass_threshold = new; @@ -5618,6 +6127,7 @@ static struct attribute *raid5_attrs[] = { &raid5_preread_bypass_threshold.attr, &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, + &raid5_rmw_level.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -5699,7 +6209,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) { safe_put_page(percpu->spare_page); - kfree(percpu->scribble); + if (percpu->scribble) + flex_array_free(percpu->scribble); percpu->spare_page = NULL; percpu->scribble = NULL; } @@ -5709,7 +6220,9 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu if (conf->level == 6 && !percpu->spare_page) percpu->spare_page = alloc_page(GFP_KERNEL); 
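raid5_store_rmw_level() above follows the usual sysfs store discipline: reject oversized input, parse with kstrtoul, whitelist the value, and only then publish it. The same parse-and-validate shape as a standalone sketch (strtoul standing in for kstrtoul, -EINVAL as in the driver):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

enum { PARITY_DISABLE_RMW, PARITY_ENABLE_RMW, PARITY_PREFER_RMW };

static int store_rmw_level(const char *page, int *rmw_level)
{
	char *end;
	unsigned long new;

	errno = 0;
	new = strtoul(page, &end, 10);
	if (errno || end == page || (*end && *end != '\n'))
		return -EINVAL;

	/* Whitelist, exactly as the driver does. */
	if (new != PARITY_DISABLE_RMW &&
	    new != PARITY_ENABLE_RMW &&
	    new != PARITY_PREFER_RMW)
		return -EINVAL;

	*rmw_level = new;
	return 0;
}

int main(void)
{
	int level = PARITY_DISABLE_RMW;

	printf("%d\n", store_rmw_level("1\n", &level));	/* 0, accepted */
	printf("%d\n", store_rmw_level("9\n", &level));	/* -EINVAL */
	printf("level=%d\n", level);			/* 1 */
	return 0;
}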
if (!percpu->scribble) - percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + percpu->scribble = scribble_alloc(max(conf->raid_disks, + conf->previous_raid_disks), conf->chunk_sectors / + STRIPE_SECTORS, GFP_KERNEL); if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { free_scratch_buffer(conf, percpu); @@ -5740,6 +6253,8 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + if (conf->shrinker.seeks) + unregister_shrinker(&conf->shrinker); free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); @@ -5807,6 +6322,30 @@ static int raid5_alloc_percpu(struct r5conf *conf) return err; } +static unsigned long raid5_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + int ret = 0; + while (ret < sc->nr_to_scan) { + if (drop_one_stripe(conf) == 0) + return SHRINK_STOP; + ret++; + } + return ret; +} + +static unsigned long raid5_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + + if (conf->max_nr_stripes < conf->min_nr_stripes) + /* unlikely, but not impossible */ + return 0; + return conf->max_nr_stripes - conf->min_nr_stripes; +} + static struct r5conf *setup_conf(struct mddev *mddev) { struct r5conf *conf; @@ -5879,7 +6418,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) else conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; max_disks = max(conf->raid_disks, conf->previous_raid_disks); - conf->scribble_len = scribble_len(max_disks); conf->disks = kzalloc(max_disks * sizeof(struct disk_info), GFP_KERNEL); @@ -5907,6 +6445,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(conf->temp_inactive_list + i); conf->level = mddev->new_level; + conf->chunk_sectors = mddev->new_chunk_sectors; if (raid5_alloc_percpu(conf) != 0) goto abort; @@ -5939,12 +6478,17 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->fullsync = 1; } - conf->chunk_sectors = mddev->new_chunk_sectors; conf->level = mddev->new_level; - if (conf->level == 6) + if (conf->level == 6) { conf->max_degraded = 2; - else + if (raid6_call.xor_syndrome) + conf->rmw_level = PARITY_ENABLE_RMW; + else + conf->rmw_level = PARITY_DISABLE_RMW; + } else { conf->max_degraded = 1; + conf->rmw_level = PARITY_ENABLE_RMW; + } conf->algorithm = mddev->new_layout; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { @@ -5952,10 +6496,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->prev_algo = mddev->layout; } - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->min_nr_stripes = NR_STRIPES; + memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); - if (grow_stripes(conf, NR_STRIPES)) { + if (grow_stripes(conf, conf->min_nr_stripes)) { printk(KERN_ERR "md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); @@ -5963,6 +6508,17 @@ static struct r5conf *setup_conf(struct mddev *mddev) } else printk(KERN_INFO "md/raid:%s: allocated %dkB\n", mdname(mddev), memory); + /* + * Losing a stripe head costs more than the time to refill it, + * it reduces the queue depth and so can hurt throughput. + * So set it rather large, scaled by number of devices. 
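The shrinker wired up just below lets memory pressure trim the stripe cache back toward min_nr_stripes: count_objects reports how many stripes are reclaimable, scan_objects drops up to nr_to_scan of them. A self-contained model of that count/scan contract (arithmetic only; this sketch returns the partial count where the kernel code returns SHRINK_STOP as soon as drop_one_stripe() fails):

#include <stdio.h>

#define SHRINK_STOP (~0UL)

struct cache {
	unsigned long max_nr;	/* currently allocated stripes */
	unsigned long min_nr;	/* floor the shrinker must respect */
};

static unsigned long cache_count(const struct cache *c)
{
	if (c->max_nr < c->min_nr)	/* unlikely, but not impossible */
		return 0;
	return c->max_nr - c->min_nr;
}

static unsigned long cache_scan(struct cache *c, unsigned long nr_to_scan)
{
	unsigned long freed = 0;

	while (freed < nr_to_scan) {
		if (c->max_nr <= c->min_nr)	/* nothing left to drop */
			return freed ? freed : SHRINK_STOP;
		c->max_nr--;
		freed++;
	}
	return freed;
}

int main(void)
{
	struct cache c = { .max_nr = 300, .min_nr = 256 };

	printf("reclaimable=%lu\n", cache_count(&c));	/* 44 */
	printf("freed=%lu\n", cache_scan(&c, 128));	/* 44, hits floor */
	printf("max_nr=%lu\n", c.max_nr);		/* 256 */
	return 0;
}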
+ */ + conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; + conf->shrinker.scan_objects = raid5_cache_scan; + conf->shrinker.count_objects = raid5_cache_count; + conf->shrinker.batch = 128; + conf->shrinker.flags = 0; + register_shrinker(&conf->shrinker); sprintf(pers_name, "raid%d", mddev->new_level); conf->thread = md_register_thread(raid5d, mddev, pers_name); @@ -6604,9 +7160,9 @@ static int check_stripe_cache(struct mddev *mddev) */ struct r5conf *conf = mddev->private; if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes || + > conf->min_nr_stripes || ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 - > conf->max_nr_stripes) { + > conf->min_nr_stripes) { printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 983e18a83db1..7dc0dd86074b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -210,11 +210,19 @@ struct stripe_head { atomic_t count; /* nr of active thread/requests */ int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ + int overwrite_disks; /* total overwrite disks in stripe, + * this is only checked when stripe + * has STRIPE_BATCH_READY + */ enum check_states check_state; enum reconstruct_states reconstruct_state; spinlock_t stripe_lock; int cpu; struct r5worker_group *group; + + struct stripe_head *batch_head; /* protected by stripe lock */ + spinlock_t batch_lock; /* only header's lock is useful */ + struct list_head batch_list; /* protected by head's batch lock*/ /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target @@ -327,8 +335,15 @@ enum { STRIPE_ON_UNPLUG_LIST, STRIPE_DISCARD, STRIPE_ON_RELEASE_LIST, + STRIPE_BATCH_READY, + STRIPE_BATCH_ERR, }; +#define STRIPE_EXPAND_SYNC_FLAG \ + ((1 << STRIPE_EXPAND_SOURCE) |\ + (1 << STRIPE_EXPAND_READY) |\ + (1 << STRIPE_EXPANDING) |\ + (1 << STRIPE_SYNC_REQUESTED)) /* * Operation request flags */ @@ -340,6 +355,24 @@ enum { STRIPE_OP_RECONSTRUCT, STRIPE_OP_CHECK, }; + +/* + * RAID parity calculation preferences + */ +enum { + PARITY_DISABLE_RMW = 0, + PARITY_ENABLE_RMW, + PARITY_PREFER_RMW, +}; + +/* + * Pages requested from set_syndrome_sources() + */ +enum { + SYNDROME_SRC_ALL, + SYNDROME_SRC_WANT_DRAIN, + SYNDROME_SRC_WRITTEN, +}; /* * Plugging: * @@ -396,10 +429,11 @@ struct r5conf { spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; - int level, algorithm; + int level, algorithm, rmw_level; int max_degraded; int raid_disks; int max_nr_stripes; + int min_nr_stripes; /* reshape_progress is the leading edge of a 'reshape' * It has value MaxSector when no reshape is happening @@ -458,15 +492,11 @@ struct r5conf { /* per cpu variables */ struct raid5_percpu { struct page *spare_page; /* Used when checking P/Q in raid6 */ - void *scribble; /* space for constructing buffer + struct flex_array *scribble; /* space for constructing buffer * lists and performing address * conversions */ } __percpu *percpu; - size_t scribble_len; /* size of scribble region must be - * associated with conf to handle - * cpu hotplug while reshaping - */ #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif @@ -480,9 +510,19 @@ struct r5conf { struct llist_head released_stripes; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; - int inactive_blocked; /* release of inactive stripes blocked, - * waiting for 25% to be free - */ + unsigned 
long cache_state; +#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, + * waiting for 25% to be free + */ +#define R5_ALLOC_MORE 2 /* It might help to allocate another + * stripe. + */ +#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate + * more until at least one has been + * released. This avoids flooding + * the cache. + */ + struct shrinker shrinker; int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; @@ -497,6 +537,7 @@ struct r5conf { int worker_cnt_per_group; }; + /* * Our supported algorithms */ diff --git a/drivers/media/platform/xilinx/xilinx-dma.c b/drivers/media/platform/xilinx/xilinx-dma.c index 10209c294168..efde88adf624 100644 --- a/drivers/media/platform/xilinx/xilinx-dma.c +++ b/drivers/media/platform/xilinx/xilinx-dma.c @@ -12,7 +12,7 @@ * published by the Free Software Foundation. */ -#include <linux/amba/xilinx_dma.h> +#include <linux/dma/xilinx_dma.h> #include <linux/lcm.h> #include <linux/list.h> #include <linux/module.h> diff --git a/drivers/media/v4l2-core/videobuf2-dma-contig.c b/drivers/media/v4l2-core/videobuf2-dma-contig.c index 69e0483adfee..644dec73d220 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf2-dma-contig.c @@ -402,6 +402,12 @@ static struct dma_buf *vb2_dc_get_dmabuf(void *buf_priv, unsigned long flags) { struct vb2_dc_buf *buf = buf_priv; struct dma_buf *dbuf; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &vb2_dc_dmabuf_ops; + exp_info.size = buf->size; + exp_info.flags = flags; + exp_info.priv = buf; if (!buf->sgt_base) buf->sgt_base = vb2_dc_get_base_sgt(buf); @@ -409,7 +415,7 @@ static struct dma_buf *vb2_dc_get_dmabuf(void *buf_priv, unsigned long flags) if (WARN_ON(!buf->sgt_base)) return NULL; - dbuf = dma_buf_export(buf, &vb2_dc_dmabuf_ops, buf->size, flags, NULL); + dbuf = dma_buf_export(&exp_info); if (IS_ERR(dbuf)) return NULL; diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index b1838abb6d00..45c708e463b9 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c @@ -583,11 +583,17 @@ static struct dma_buf *vb2_dma_sg_get_dmabuf(void *buf_priv, unsigned long flags { struct vb2_dma_sg_buf *buf = buf_priv; struct dma_buf *dbuf; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &vb2_dma_sg_dmabuf_ops; + exp_info.size = buf->size; + exp_info.flags = flags; + exp_info.priv = buf; if (WARN_ON(!buf->dma_sgt)) return NULL; - dbuf = dma_buf_export(buf, &vb2_dma_sg_dmabuf_ops, buf->size, flags, NULL); + dbuf = dma_buf_export(&exp_info); if (IS_ERR(dbuf)) return NULL; diff --git a/drivers/media/v4l2-core/videobuf2-vmalloc.c b/drivers/media/v4l2-core/videobuf2-vmalloc.c index bcde88572429..657ab302a5cf 100644 --- a/drivers/media/v4l2-core/videobuf2-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf2-vmalloc.c @@ -368,11 +368,17 @@ static struct dma_buf *vb2_vmalloc_get_dmabuf(void *buf_priv, unsigned long flag { struct vb2_vmalloc_buf *buf = buf_priv; struct dma_buf *dbuf; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &vb2_vmalloc_dmabuf_ops; + exp_info.size = buf->size; + exp_info.flags = flags; + exp_info.priv = buf; if (WARN_ON(!buf->vaddr)) return NULL; - dbuf = dma_buf_export(buf, &vb2_vmalloc_dmabuf_ops, buf->size, flags, NULL); + dbuf = dma_buf_export(&exp_info); if (IS_ERR(dbuf)) return NULL; diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c index 
fc0c81ef04ff..c4aecc6f8373 100644 --- a/drivers/mfd/cros_ec.c +++ b/drivers/mfd/cros_ec.c @@ -74,15 +74,11 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, ret = ec_dev->cmd_xfer(ec_dev, msg); if (msg->result == EC_RES_IN_PROGRESS) { int i; - struct cros_ec_command status_msg; - struct ec_response_get_comms_status status; + struct cros_ec_command status_msg = { }; + struct ec_response_get_comms_status *status; - status_msg.version = 0; status_msg.command = EC_CMD_GET_COMMS_STATUS; - status_msg.outdata = NULL; - status_msg.outsize = 0; - status_msg.indata = (uint8_t *)&status; - status_msg.insize = sizeof(status); + status_msg.insize = sizeof(*status); /* * Query the EC's status until it's no longer busy or @@ -98,7 +94,10 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, msg->result = status_msg.result; if (status_msg.result != EC_RES_SUCCESS) break; - if (!(status.flags & EC_COMMS_STATUS_PROCESSING)) + + status = (struct ec_response_get_comms_status *) + status_msg.indata; + if (!(status->flags & EC_COMMS_STATUS_PROCESSING)) break; } } @@ -119,6 +118,10 @@ static const struct mfd_cell cros_devs[] = { .id = 2, .of_compatible = "google,cros-ec-i2c-tunnel", }, + { + .name = "cros-ec-ctl", + .id = 3, + }, }; int cros_ec_register(struct cros_ec_device *ec_dev) diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 072f67066df3..2b6ef6bd5d5f 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -388,7 +388,7 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host, { struct dma_slave_config cfg = { 0, }; struct dma_chan *chan; - unsigned int slave_id; + void *slave_data = NULL; struct resource *res; dma_cap_mask_t mask; int ret; @@ -397,13 +397,12 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host, dma_cap_set(DMA_SLAVE, mask); if (pdata) - slave_id = direction == DMA_MEM_TO_DEV - ? pdata->slave_id_tx : pdata->slave_id_rx; - else - slave_id = 0; + slave_data = direction == DMA_MEM_TO_DEV ? + (void *)pdata->slave_id_tx : + (void *)pdata->slave_id_rx; chan = dma_request_slave_channel_compat(mask, shdma_chan_filter, - (void *)(unsigned long)slave_id, &host->pd->dev, + slave_data, &host->pd->dev, direction == DMA_MEM_TO_DEV ? 
"tx" : "rx"); dev_dbg(&host->pd->dev, "%s: %s: got channel %p\n", __func__, @@ -414,8 +413,6 @@ sh_mmcif_request_dma_one(struct sh_mmcif_host *host, res = platform_get_resource(host->pd, IORESOURCE_MEM, 0); - /* In the OF case the driver will get the slave ID from the DT */ - cfg.slave_id = slave_id; cfg.direction = direction; if (direction == DMA_DEV_TO_MEM) { diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 6906a905cd54..354f4f335ed5 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -201,7 +201,7 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) of_match_device(sh_mobile_sdhi_of_match, &pdev->dev); struct sh_mobile_sdhi *priv; struct tmio_mmc_data *mmc_data; - struct sh_mobile_sdhi_info *p = pdev->dev.platform_data; + struct tmio_mmc_data *mmd = pdev->dev.platform_data; struct tmio_mmc_host *host; struct resource *res; int irq, ret, i = 0; @@ -245,30 +245,14 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) else host->bus_shift = 0; - mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; - if (p) { - mmc_data->flags = p->tmio_flags; - mmc_data->ocr_mask = p->tmio_ocr_mask; - mmc_data->capabilities |= p->tmio_caps; - mmc_data->capabilities2 |= p->tmio_caps2; - mmc_data->cd_gpio = p->cd_gpio; - - if (p->dma_slave_tx > 0 && p->dma_slave_rx > 0) { - /* - * Yes, we have to provide slave IDs twice to TMIO: - * once as a filter parameter and once for channel - * configuration as an explicit slave ID - */ - dma_priv->chan_priv_tx = (void *)p->dma_slave_tx; - dma_priv->chan_priv_rx = (void *)p->dma_slave_rx; - dma_priv->slave_id_tx = p->dma_slave_tx; - dma_priv->slave_id_rx = p->dma_slave_rx; - } - } + if (mmd) + *mmc_data = *mmd; + dma_priv->filter = shdma_chan_filter; dma_priv->enable = sh_mobile_sdhi_enable_dma; mmc_data->alignment_shift = 1; /* 2-byte alignment */ + mmc_data->capabilities |= MMC_CAP_MMC_HIGHSPEED; /* * All SDHI blocks support 2-byte and larger block sizes in 4-bit diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index fc3805ed69d1..4a597f5a53e2 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -43,10 +43,6 @@ struct tmio_mmc_data; struct tmio_mmc_host; struct tmio_mmc_dma { - void *chan_priv_tx; - void *chan_priv_rx; - int slave_id_tx; - int slave_id_rx; enum dma_slave_buswidth dma_buswidth; bool (*filter)(struct dma_chan *chan, void *arg); void (*enable)(struct tmio_mmc_host *host, bool enable); diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index 331bb618e398..e4b05dbb9ca8 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -261,7 +261,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat { /* We can only either use DMA for both Tx and Rx or not use it at all */ if (!host->dma || (!host->pdev->dev.of_node && - (!host->dma->chan_priv_tx || !host->dma->chan_priv_rx))) + (!pdata->chan_priv_tx || !pdata->chan_priv_rx))) return; if (!host->chan_tx && !host->chan_rx) { @@ -278,7 +278,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat dma_cap_set(DMA_SLAVE, mask); host->chan_tx = dma_request_slave_channel_compat(mask, - host->dma->filter, host->dma->chan_priv_tx, + host->dma->filter, pdata->chan_priv_tx, &host->pdev->dev, "tx"); dev_dbg(&host->pdev->dev, "%s: TX: got channel %p\n", __func__, host->chan_tx); @@ -286,8 +286,6 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct 
tmio_mmc_data *pdat if (!host->chan_rx) goto ereqrx; - if (host->dma->chan_priv_rx) - cfg.slave_id = host->dma->slave_id_rx; cfg.direction = DMA_DEV_TO_MEM; cfg.src_addr = cfg.dst_addr + host->pdata->dma_rx_offset; cfg.src_addr_width = host->dma->dma_buswidth; diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 71fea895ce38..a03ad2951c7b 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -309,6 +309,19 @@ config MTD_SWAP The driver provides wear leveling by storing erase counter into the OOB. +config MTD_PARTITIONED_MASTER + bool "Retain master device when partitioned" + default n + depends on MTD + help + For historical reasons, by default, either a master is present or + several partitions are present, but not both. The concern was that + exposing the same data through multiple partitions was dangerous; + however, SCSI does this and it is frequently useful for applications. + This config option keeps the master device registered even when the + device is partitioned. It also makes the master device the parent of + each partition device, rather than what lies behind the master.
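As a minimal sketch of how core code can key off this option at runtime with IS_ENABLED() (this mirrors the mtd_add_device_partitions() helper added in mtdcore.c later in this series; error handling trimmed):

	/* Register the master itself when so configured, then any partitions. */
	if (nbparts == 0 || IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER))
		ret = add_mtd_device(mtd);
	if (nbparts > 0)
		ret = add_mtd_partitions(mtd, real_parts, nbparts);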
+ source "drivers/mtd/chips/Kconfig" source "drivers/mtd/maps/Kconfig" diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c index 423666b51efb..9a1a6ffd16b8 100644 --- a/drivers/mtd/chips/cfi_cmdset_0020.c +++ b/drivers/mtd/chips/cfi_cmdset_0020.c @@ -206,23 +206,23 @@ static struct mtd_info *cfi_staa_setup(struct map_info *map) mtd->eraseregions[(j*cfi->cfiq->NumEraseRegions)+i].numblocks = ernum; } offset += (ersize * ernum); - } + } - if (offset != devsize) { - /* Argh */ - printk(KERN_WARNING "Sum of regions (%lx) != total size of set of interleaved chips (%lx)\n", offset, devsize); - kfree(mtd->eraseregions); - kfree(cfi->cmdset_priv); - kfree(mtd); - return NULL; - } + if (offset != devsize) { + /* Argh */ + printk(KERN_WARNING "Sum of regions (%lx) != total size of set of interleaved chips (%lx)\n", offset, devsize); + kfree(mtd->eraseregions); + kfree(cfi->cmdset_priv); + kfree(mtd); + return NULL; + } - for (i=0; i<mtd->numeraseregions;i++){ - printk(KERN_DEBUG "%d: offset=0x%llx,size=0x%x,blocks=%d\n", - i, (unsigned long long)mtd->eraseregions[i].offset, - mtd->eraseregions[i].erasesize, - mtd->eraseregions[i].numblocks); - } + for (i=0; i<mtd->numeraseregions;i++){ + printk(KERN_DEBUG "%d: offset=0x%llx,size=0x%x,blocks=%d\n", + i, (unsigned long long)mtd->eraseregions[i].offset, + mtd->eraseregions[i].erasesize, + mtd->eraseregions[i].numblocks); + } /* Also select the correct geometry setup too */ mtd->_erase = cfi_staa_erase_varsize; diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 66f0405f7e53..b16f3cda97ff 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -9,7 +9,15 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * When the first attempt at device initialization fails, we may need to + * wait a little bit and retry. This timeout, by default 3 seconds, gives + * the device time to start up. Required on BCM2708 and a few other chipsets. + */ +#define MTD_DEFAULT_TIMEOUT 3 + #include <linux/module.h> +#include <linux/delay.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/bio.h> @@ -209,10 +217,14 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) } -static struct block2mtd_dev *add_device(char *devname, int erase_size) +static struct block2mtd_dev *add_device(char *devname, int erase_size, + int timeout) { +#ifndef MODULE + int i; +#endif const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; - struct block_device *bdev; + struct block_device *bdev = ERR_PTR(-ENODEV); struct block2mtd_dev *dev; char *name; @@ -225,15 +237,28 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) /* Get a handle on the device */ bdev = blkdev_get_by_path(devname, mode, dev); -#ifndef MODULE - if (IS_ERR(bdev)) { - - /* We might not have rootfs mounted at this point. Try - to resolve the device name by other means. */ - dev_t devt = name_to_dev_t(devname); - if (devt) - bdev = blkdev_get_by_dev(devt, mode, dev); +#ifndef MODULE + /* + * We might not have the root device mounted at this point. + * Try to resolve the device name by other means. + */ + for (i = 0; IS_ERR(bdev) && i <= timeout; i++) { + dev_t devt; + + if (i) + /* + * Calling wait_for_device_probe in the first loop + * was not enough; sleep for a bit in subsequent + * go-arounds.
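+ * Each retry sleeps one second, re-runs
+ * wait_for_device_probe() and retries the lookup, so the
+ * total wait is bounded by roughly "timeout" seconds.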
+ */ + msleep(1000); + wait_for_device_probe(); + + devt = name_to_dev_t(devname); + if (!devt) + continue; + bdev = blkdev_get_by_dev(devt, mode, dev); } #endif @@ -280,6 +305,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) /* Device didn't get added, so free the entry */ goto err_destroy_mutex; } + list_add(&dev->list, &blkmtd_device_list); pr_info("mtd%d: [%s] erase_size = %dKiB [%d]\n", dev->mtd.index, @@ -348,16 +374,19 @@ static inline void kill_final_newline(char *str) #ifndef MODULE static int block2mtd_init_called = 0; -static char block2mtd_paramline[80 + 12]; /* 80 for device, 12 for erase size */ +/* 80 for device, 12 for erase size */ +static char block2mtd_paramline[80 + 12]; #endif static int block2mtd_setup2(const char *val) { - char buf[80 + 12]; /* 80 for device, 12 for erase size */ + /* 80 for device, 12 for erase size, 80 for name, 8 for timeout */ + char buf[80 + 12 + 80 + 8]; char *str = buf; char *token[2]; char *name; size_t erase_size = PAGE_SIZE; + unsigned long timeout = MTD_DEFAULT_TIMEOUT; int i, ret; if (strnlen(val, sizeof(buf)) >= sizeof(buf)) { @@ -395,7 +424,7 @@ static int block2mtd_setup2(const char *val) } } - add_device(name, erase_size); + add_device(name, erase_size, timeout); return 0; } @@ -463,8 +492,7 @@ static void block2mtd_exit(void) } } - -module_init(block2mtd_init); +late_initcall(block2mtd_init); module_exit(block2mtd_exit); MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/devices/docg3.c b/drivers/mtd/devices/docg3.c index 448ce42f951e..866d31904475 100644 --- a/drivers/mtd/devices/docg3.c +++ b/drivers/mtd/devices/docg3.c @@ -1805,7 +1805,7 @@ static int __init doc_dbg_register(struct docg3 *docg3) } } -static void __exit doc_dbg_unregister(struct docg3 *docg3) +static void doc_dbg_unregister(struct docg3 *docg3) { debugfs_remove_recursive(docg3->debugfs_root); } @@ -2033,7 +2033,7 @@ static int __init docg3_probe(struct platform_device *pdev) struct mtd_info *mtd; struct resource *ress; void __iomem *base; - int ret, floor, found = 0; + int ret, floor; struct docg3_cascade *cascade; ret = -ENXIO; @@ -2073,14 +2073,11 @@ static int __init docg3_probe(struct platform_device *pdev) 0); if (ret) goto err_probe; - found++; } ret = doc_register_sysfs(pdev, cascade); if (ret) goto err_probe; - if (!found) - goto notfound; platform_set_drvdata(pdev, cascade); doc_dbg_register(cascade->floors[0]->priv); @@ -2103,7 +2100,7 @@ err_probe: * * Returns 0 */ -static int __exit docg3_release(struct platform_device *pdev) +static int docg3_release(struct platform_device *pdev) { struct docg3_cascade *cascade = platform_get_drvdata(pdev); struct docg3 *docg3 = cascade->floors[0]->priv; @@ -2134,7 +2131,7 @@ static struct platform_driver g3_driver = { }, .suspend = docg3_suspend, .resume = docg3_resume, - .remove = __exit_p(docg3_release), + .remove = docg3_release, }; module_platform_driver_probe(g3_driver, docg3_probe); diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 85e35467fba6..7c8b1694a134 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -223,6 +223,8 @@ static int m25p_probe(struct spi_device *spi) */ if (data && data->type) flash_name = data->type; + else if (!strcmp(spi->modalias, "nor-jedec")) + flash_name = NULL; /* auto-detect */ else flash_name = spi->modalias; @@ -247,9 +249,16 @@ static int m25p_remove(struct spi_device *spi) } /* - * XXX This needs to be kept in sync with spi_nor_ids. 
We can't share - * it with spi-nor, because if this is built as a module then modpost - * won't be able to read it and add appropriate aliases. + * Do NOT add to this array without reading the following: + * + * Historically, many flash devices are bound to this driver by their name. But + * since most of these flashes are compatible to some extent, and their + * differences can often be distinguished by the JEDEC read-ID command, we + * encourage new users to add support to the spi-nor library, and simply bind + * against a generic string here (e.g., "nor-jedec"). + * + * Many flash names are kept here in this list (as well as in spi-nor.c) to + * keep them available as module aliases for existing platforms. */ static const struct spi_device_id m25p_ids[] = { {"at25fs010"}, {"at25fs040"}, {"at25df041a"}, {"at25df321a"}, @@ -291,6 +300,12 @@ static const struct spi_device_id m25p_ids[] = { {"w25x64"}, {"w25q64"}, {"w25q80"}, {"w25q80bl"}, {"w25q128"}, {"w25q256"}, {"cat25c11"}, {"cat25c03"}, {"cat25c09"}, {"cat25c17"}, {"cat25128"}, + + /* + * Generic support for SPI NOR that can be identified by the JEDEC READ + * ID opcode (0x9F). Use this, if possible. + */ + {"nor-jedec"}, { }, }; MODULE_DEVICE_TABLE(spi, m25p_ids); diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index ba801d2c6dcc..e715ae90632f 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -242,7 +242,7 @@ config MTD_L440GX config MTD_CFI_FLAGADM tristate "CFI Flash device mapping on FlagaDM" - depends on 8xx && MTD_CFI + depends on PPC_8xx && MTD_CFI help Mapping for the Flaga digital module. If you don't have one, ignore this setting. diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c index ea697202935a..892ad6ac63f2 100644 --- a/drivers/mtd/maps/sa1100-flash.c +++ b/drivers/mtd/maps/sa1100-flash.c @@ -274,7 +274,7 @@ static int sa1100_mtd_probe(struct platform_device *pdev) return err; } -static int __exit sa1100_mtd_remove(struct platform_device *pdev) +static int sa1100_mtd_remove(struct platform_device *pdev) { struct sa_info *info = platform_get_drvdata(pdev); struct flash_platform_data *plat = dev_get_platdata(&pdev->dev); @@ -286,7 +286,7 @@ static int __exit sa1100_mtd_remove(struct platform_device *pdev) static struct platform_driver sa1100_mtd_driver = { .probe = sa1100_mtd_probe, - .remove = __exit_p(sa1100_mtd_remove), + .remove = sa1100_mtd_remove, .driver = { .name = "sa1100-mtd", }, diff --git a/drivers/mtd/maps/ts5500_flash.c b/drivers/mtd/maps/ts5500_flash.c index d1d671daf235..9969fedb1f13 100644 --- a/drivers/mtd/maps/ts5500_flash.c +++ b/drivers/mtd/maps/ts5500_flash.c @@ -117,5 +117,5 @@ module_exit(cleanup_ts5500_map); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sean Young <sean@mess.org>"); -MODULE_DESCRIPTION("MTD map driver for Techology Systems TS-5500 board"); +MODULE_DESCRIPTION("MTD map driver for Technology Systems TS-5500 board"); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index d08229eb44d8..2b0c52870999 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -171,9 +171,6 @@ static void mtd_blktrans_work(struct work_struct *work) background_done = 0; } - if (req) - __blk_end_request_all(req, -EIO); - spin_unlock_irq(rq->queue_lock); } diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 11883bd26d9d..d172195fbd15 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -38,6 +38,7 @@ #include <linux/gfp.h> #include <linux/slab.h> #include <linux/reboot.h> +#include
<linux/kconfig.h> #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> @@ -501,6 +502,29 @@ out_error: return ret; } +static int mtd_add_device_partitions(struct mtd_info *mtd, + struct mtd_partition *real_parts, + int nbparts) +{ + int ret; + + if (nbparts == 0 || IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) { + ret = add_mtd_device(mtd); + if (ret == 1) + return -ENODEV; + } + + if (nbparts > 0) { + ret = add_mtd_partitions(mtd, real_parts, nbparts); + if (ret && IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) + del_mtd_device(mtd); + return ret; + } + + return 0; +} + + /** * mtd_device_parse_register - parse partitions and register an MTD device. * @@ -523,7 +547,8 @@ out_error: * found this function tries to fall back to information specified in * @parts/@nr_parts. * * If any partitioning info was found, this function registers the found - * partitions. + * partitions. If the MTD_PARTITIONED_MASTER option is set, then the device + * as a whole is registered first. * * If no partitions were found this function just registers the MTD device * @mtd and exits. * @@ -534,27 +559,21 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, const struct mtd_partition *parts, int nr_parts) { - int err; - struct mtd_partition *real_parts; + int ret; + struct mtd_partition *real_parts = NULL; - err = parse_mtd_partitions(mtd, types, &real_parts, parser_data); - if (err <= 0 && nr_parts && parts) { + ret = parse_mtd_partitions(mtd, types, &real_parts, parser_data); + if (ret <= 0 && nr_parts && parts) { real_parts = kmemdup(parts, sizeof(*parts) * nr_parts, GFP_KERNEL); if (!real_parts) - err = -ENOMEM; + ret = -ENOMEM; else - err = nr_parts; + ret = nr_parts; } - if (err > 0) { - err = add_mtd_partitions(mtd, real_parts, err); - kfree(real_parts); - } else if (err == 0) { - err = add_mtd_device(mtd); - if (err == 1) - err = -ENODEV; - } + if (ret >= 0) + ret = mtd_add_device_partitions(mtd, real_parts, ret); /* * FIXME: some drivers unfortunately call this function more than once. @@ -569,7 +588,8 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, register_reboot_notifier(&mtd->reboot_notifier); } - return err; + kfree(real_parts); + return ret; } EXPORT_SYMBOL_GPL(mtd_device_parse_register); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index e779de315ade..cafdb8855a79 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -30,6 +30,7 @@ #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> #include <linux/err.h> +#include <linux/kconfig.h> #include "mtdcore.h" @@ -379,10 +380,17 @@ static struct mtd_part *allocate_partition(struct mtd_info *master, slave->mtd.name = name; slave->mtd.owner = master->owner; - /* NOTE: we don't arrange MTDs as a tree; it'd be error-prone - * to have the same data be in two different partitions. + /* NOTE: Historically, we didn't arrange MTDs as a tree out of + * concern for showing the same data in multiple partitions. + * However, it is very useful to have the master node present, + * so the MTD_PARTITIONED_MASTER option allows that. The master + * will have device nodes etc only if this is set, so make the + * parent conditional on that option. Note, this is a way to + * distinguish between the master and the partition in sysfs. */ - slave->mtd.dev.parent = master->dev.parent; + slave->mtd.dev.parent = IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) ?
+ &master->dev : + master->dev.parent; slave->mtd._read = part_read; slave->mtd._write = part_write; @@ -546,12 +554,35 @@ out_register: return slave; } +static ssize_t mtd_partition_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mtd_info *mtd = dev_get_drvdata(dev); + struct mtd_part *part = PART(mtd); + return snprintf(buf, PAGE_SIZE, "%lld\n", part->offset); +} + +static DEVICE_ATTR(offset, S_IRUGO, mtd_partition_offset_show, NULL); + +static const struct attribute *mtd_partition_attrs[] = { + &dev_attr_offset.attr, + NULL +}; + +static int mtd_add_partition_attrs(struct mtd_part *new) +{ + int ret = sysfs_create_files(&new->mtd.dev.kobj, mtd_partition_attrs); + if (ret) + printk(KERN_WARNING + "mtd: failed to create partition attrs, err=%d\n", ret); + return ret; +} + int mtd_add_partition(struct mtd_info *master, const char *name, long long offset, long long length) { struct mtd_partition part; - struct mtd_part *p, *new; - uint64_t start, end; + struct mtd_part *new; int ret = 0; /* the direct offset is expected */ @@ -575,31 +606,15 @@ int mtd_add_partition(struct mtd_info *master, const char *name, if (IS_ERR(new)) return PTR_ERR(new); - start = offset; - end = offset + length; - mutex_lock(&mtd_partitions_mutex); - list_for_each_entry(p, &mtd_partitions, list) - if (p->master == master) { - if ((start >= p->offset) && - (start < (p->offset + p->mtd.size))) - goto err_inv; - - if ((end >= p->offset) && - (end < (p->offset + p->mtd.size))) - goto err_inv; - } - list_add(&new->list, &mtd_partitions); mutex_unlock(&mtd_partitions_mutex); add_mtd_device(&new->mtd); + mtd_add_partition_attrs(new); + return ret; -err_inv: - mutex_unlock(&mtd_partitions_mutex); - free_partition(new); - return -EINVAL; } EXPORT_SYMBOL_GPL(mtd_add_partition); @@ -612,6 +627,8 @@ int mtd_del_partition(struct mtd_info *master, int partno) list_for_each_entry_safe(slave, next, &mtd_partitions, list) if ((slave->master == master) && (slave->mtd.index == partno)) { + sysfs_remove_files(&slave->mtd.dev.kobj, + mtd_partition_attrs); ret = del_mtd_device(&slave->mtd); if (ret < 0) break; @@ -631,8 +648,8 @@ EXPORT_SYMBOL_GPL(mtd_del_partition); * and registers slave MTD objects which are bound to the master according to * the partition definitions. * - * We don't register the master, or expect the caller to have done so, - * for reasons of data integrity. + * For historical reasons, this function's caller only registers the master + * if the MTD_PARTITIONED_MASTER config option is set. 
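+ * Each slave registered here also gains an "offset" sysfs
+ * attribute via mtd_add_partition_attrs(), so userspace can
+ * distinguish partitions from devices that are not partitions.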
*/ int add_mtd_partitions(struct mtd_info *master, @@ -655,6 +672,7 @@ int add_mtd_partitions(struct mtd_info *master, mutex_unlock(&mtd_partitions_mutex); add_mtd_device(&slave->mtd); + mtd_add_partition_attrs(slave); cur_offset = slave->offset + slave->mtd.size; } diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c index d93c849b70b5..46010bd895b1 100644 --- a/drivers/mtd/nand/atmel_nand.c +++ b/drivers/mtd/nand/atmel_nand.c @@ -485,7 +485,7 @@ static void pmecc_config_ecc_layout(struct nand_ecclayout *layout, for (i = 0; i < ecc_len; i++) layout->eccpos[i] = oobsize - ecc_len + i; - layout->oobfree[0].offset = 2; + layout->oobfree[0].offset = PMECC_OOB_RESERVED_BYTES; layout->oobfree[0].length = oobsize - ecc_len - layout->oobfree[0].offset; } @@ -1204,14 +1204,14 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev, goto err; } - regs_rom = platform_get_resource(pdev, IORESOURCE_MEM, 3); - host->pmecc_rom_base = devm_ioremap_resource(&pdev->dev, regs_rom); - if (IS_ERR(host->pmecc_rom_base)) { - if (!host->has_no_lookup_table) - /* Don't display the information again */ + if (!host->has_no_lookup_table) { + regs_rom = platform_get_resource(pdev, IORESOURCE_MEM, 3); + host->pmecc_rom_base = devm_ioremap_resource(&pdev->dev, + regs_rom); + if (IS_ERR(host->pmecc_rom_base)) { dev_err(host->dev, "Can not get I/O resource for ROM, will build a lookup table in runtime!\n"); - - host->has_no_lookup_table = true; + host->has_no_lookup_table = true; + } } if (host->has_no_lookup_table) { @@ -1254,7 +1254,8 @@ static int atmel_pmecc_nand_init_params(struct platform_device *pdev, nand_chip->ecc.steps = mtd->writesize / sector_size; nand_chip->ecc.total = nand_chip->ecc.bytes * nand_chip->ecc.steps; - if (nand_chip->ecc.total > mtd->oobsize - 2) { + if (nand_chip->ecc.total > + mtd->oobsize - PMECC_OOB_RESERVED_BYTES) { dev_err(host->dev, "No room for ECC bytes\n"); err_no = -EINVAL; goto err; @@ -1719,7 +1720,7 @@ static int nfc_wait_interrupt(struct atmel_nand_host *host, u32 flag) comp[index++] = &host->nfc->comp_cmd_done; if (index == 0) { - dev_err(host->dev, "Unkown interrupt flag: 0x%08x\n", flag); + dev_err(host->dev, "Unknown interrupt flag: 0x%08x\n", flag); return -EINVAL; } @@ -1752,11 +1753,10 @@ static int nfc_send_command(struct atmel_nand_host *host, cmd, addr, cycle0); timeout = jiffies + msecs_to_jiffies(NFC_TIME_OUT_MS); - while (nfc_cmd_readl(NFCADDR_CMD_NFCBUSY, host->nfc->base_cmd_regs) - & NFCADDR_CMD_NFCBUSY) { + while (nfc_readl(host->nfc->hsmc_regs, SR) & NFC_SR_BUSY) { if (time_after(jiffies, timeout)) { dev_err(host->dev, - "Time out to wait CMD_NFCBUSY ready!\n"); + "Time out to wait for NFC ready!\n"); return -ETIMEDOUT; } } diff --git a/drivers/mtd/nand/atmel_nand_ecc.h b/drivers/mtd/nand/atmel_nand_ecc.h index d4035e335ad8..668e7358f19b 100644 --- a/drivers/mtd/nand/atmel_nand_ecc.h +++ b/drivers/mtd/nand/atmel_nand_ecc.h @@ -152,4 +152,7 @@ /* Time out value for reading PMECC status register */ #define PMECC_MAX_TIMEOUT_MS 100 +/* Reserved bytes in oob area */ +#define PMECC_OOB_RESERVED_BYTES 2 + #endif diff --git a/drivers/mtd/nand/atmel_nand_nfc.h b/drivers/mtd/nand/atmel_nand_nfc.h index 85b8ca6af7d2..4d5d26221a7e 100644 --- a/drivers/mtd/nand/atmel_nand_nfc.h +++ b/drivers/mtd/nand/atmel_nand_nfc.h @@ -35,6 +35,7 @@ #define NFC_CTRL_DISABLE (1 << 1) #define ATMEL_HSMC_NFC_SR 0x08 /* NFC Status Register */ +#define NFC_SR_BUSY (1 << 8) #define NFC_SR_XFR_DONE (1 << 16) #define NFC_SR_CMD_DONE (1 << 17) #define 
NFC_SR_DTOE (1 << 20) diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c index f44c6061536a..870c7fc0f759 100644 --- a/drivers/mtd/nand/denali.c +++ b/drivers/mtd/nand/denali.c @@ -225,7 +225,6 @@ static void nand_onfi_timing_set(struct denali_nand_info *denali, uint16_t Twhr[6] = {120, 80, 80, 60, 60, 60}; uint16_t Tcs[6] = {70, 35, 25, 25, 20, 15}; - uint16_t TclsRising = 1; uint16_t data_invalid_rhoh, data_invalid_rloh, data_invalid; uint16_t dv_window = 0; uint16_t en_lo, en_hi; @@ -276,8 +275,6 @@ static void nand_onfi_timing_set(struct denali_nand_info *denali, re_2_re = CEIL_DIV(Trhz[mode], CLK_X); we_2_re = CEIL_DIV(Twhr[mode], CLK_X); cs_cnt = CEIL_DIV((Tcs[mode] - Trp[mode]), CLK_X); - if (!TclsRising) - cs_cnt = CEIL_DIV(Tcs[mode], CLK_X); if (cs_cnt == 0) cs_cnt = 1; @@ -1536,6 +1533,9 @@ int denali_init(struct denali_nand_info *denali) denali->nand.options |= NAND_SKIP_BBTSCAN; denali->nand.ecc.mode = NAND_ECC_HW_SYNDROME; + /* no subpage writes on denali */ + denali->nand.options |= NAND_NO_SUBPAGE_WRITE; + /* * Denali Controller only support 15bit and 8bit ECC in MRST, * so just let controller do 15bit ECC for MLC and 8bit ECC for diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c index 4c05f4f6a5c6..51394e59901b 100644 --- a/drivers/mtd/nand/fsl_ifc_nand.c +++ b/drivers/mtd/nand/fsl_ifc_nand.c @@ -317,7 +317,7 @@ static void fsl_ifc_run_command(struct mtd_info *mtd) /* wait for command complete flag or timeout */ wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat, - IFC_TIMEOUT_MSECS * HZ/1000); + msecs_to_jiffies(IFC_TIMEOUT_MSECS)); /* ctrl->nand_stat will be updated from IRQ context */ if (!ctrl->nand_stat) @@ -860,7 +860,7 @@ static void fsl_ifc_sram_init(struct fsl_ifc_mtd *priv) /* wait for command complete flag or timeout */ wait_event_timeout(ctrl->nand_wait, ctrl->nand_stat, - IFC_TIMEOUT_MSECS * HZ/1000); + msecs_to_jiffies(IFC_TIMEOUT_MSECS)); if (ctrl->nand_stat != IFC_NAND_EVTER_STAT_OPC) printk(KERN_ERR "fsl-ifc: Failed to Initialise SRAM\n"); diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c index edfaa21b1817..e58af4bfa8c8 100644 --- a/drivers/mtd/nand/fsmc_nand.c +++ b/drivers/mtd/nand/fsmc_nand.c @@ -873,6 +873,7 @@ static int fsmc_nand_probe_config_dt(struct platform_device *pdev, { struct fsmc_nand_platform_data *pdata = dev_get_platdata(&pdev->dev); u32 val; + int ret; /* Set default NAND width to 8 bits */ pdata->width = 8; @@ -891,8 +892,12 @@ static int fsmc_nand_probe_config_dt(struct platform_device *pdev, sizeof(*pdata->nand_timings), GFP_KERNEL); if (!pdata->nand_timings) return -ENOMEM; - of_property_read_u8_array(np, "timings", (u8 *)pdata->nand_timings, + ret = of_property_read_u8_array(np, "timings", (u8 *)pdata->nand_timings, sizeof(*pdata->nand_timings)); + if (ret) { + dev_info(&pdev->dev, "No timings in dts specified, using default timings!\n"); + pdata->nand_timings = NULL; + } /* Set default NAND bank to 0 */ pdata->bank = 0; diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c index 33f3c3c54dbc..1b8f3500e6d2 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.c +++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.c @@ -446,7 +446,7 @@ int start_dma_without_bch_irq(struct gpmi_nand_data *this, struct dma_async_tx_descriptor *desc) { struct completion *dma_c = &this->dma_done; - int err; + unsigned long timeout; init_completion(dma_c); @@ -456,8 +456,8 @@ int start_dma_without_bch_irq(struct gpmi_nand_data *this, 
dma_async_issue_pending(get_dma_chan(this)); /* Wait for the interrupt from the DMA block. */ - err = wait_for_completion_timeout(dma_c, msecs_to_jiffies(1000)); - if (!err) { + timeout = wait_for_completion_timeout(dma_c, msecs_to_jiffies(1000)); + if (!timeout) { dev_err(this->dev, "DMA timeout, last DMA :%d\n", this->last_dma_type); gpmi_dump_info(this); @@ -477,7 +477,7 @@ int start_dma_with_bch_irq(struct gpmi_nand_data *this, struct dma_async_tx_descriptor *desc) { struct completion *bch_c = &this->bch_done; - int err; + unsigned long timeout; /* Prepare to receive an interrupt from the BCH block. */ init_completion(bch_c); @@ -486,8 +486,8 @@ int start_dma_with_bch_irq(struct gpmi_nand_data *this, start_dma_without_bch_irq(this, desc); /* Wait for the interrupt from the BCH block. */ - err = wait_for_completion_timeout(bch_c, msecs_to_jiffies(1000)); - if (!err) { + timeout = wait_for_completion_timeout(bch_c, msecs_to_jiffies(1000)); + if (!timeout) { dev_err(this->dev, "BCH timeout, last DMA :%d\n", this->last_dma_type); gpmi_dump_info(this); @@ -1950,7 +1950,9 @@ static int gpmi_nand_init(struct gpmi_nand_data *this) ret = nand_boot_init(this); if (ret) goto err_out; - chip->scan_bbt(mtd); + ret = chip->scan_bbt(mtd); + if (ret) + goto err_out; ppdata.of_node = this->pdev->dev.of_node; ret = mtd_device_parse_register(mtd, NULL, &ppdata, NULL, 0); diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index a8f550fec35e..372e0e38f59b 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -386,26 +386,51 @@ static irqreturn_t mxc_nfc_irq(int irq, void *dev_id) /* This function polls the NANDFC to wait for the basic operation to * complete by checking the INT bit of config2 register. */ -static void wait_op_done(struct mxc_nand_host *host, int useirq) +static int wait_op_done(struct mxc_nand_host *host, int useirq) { - int max_retries = 8000; + int ret = 0; + + /* + * If operation is already complete, don't bother to setup an irq or a + * loop. + */ + if (host->devtype_data->check_int(host)) + return 0; if (useirq) { - if (!host->devtype_data->check_int(host)) { - reinit_completion(&host->op_completion); - irq_control(host, 1); - wait_for_completion(&host->op_completion); + unsigned long timeout; + + reinit_completion(&host->op_completion); + + irq_control(host, 1); + + timeout = wait_for_completion_timeout(&host->op_completion, HZ); + if (!timeout && !host->devtype_data->check_int(host)) { + dev_dbg(host->dev, "timeout waiting for irq\n"); + ret = -ETIMEDOUT; } } else { - while (max_retries-- > 0) { - if (host->devtype_data->check_int(host)) - break; + int max_retries = 8000; + int done; + do { udelay(1); + + done = host->devtype_data->check_int(host); + if (done) + break; + + } while (--max_retries); + + if (!done) { + dev_dbg(host->dev, "timeout polling for completion\n"); + ret = -ETIMEDOUT; } - if (max_retries < 0) - pr_debug("%s: INT not set\n", __func__); } + + WARN_ONCE(ret < 0, "timeout! 
useirq=%d\n", useirq); + + return ret; } static void send_cmd_v3(struct mxc_nand_host *host, uint16_t cmd, int useirq) @@ -527,30 +552,17 @@ static void send_page_v1(struct mtd_info *mtd, unsigned int ops) static void send_read_id_v3(struct mxc_nand_host *host) { - struct nand_chip *this = &host->nand; - /* Read ID into main buffer */ writel(NFC_ID, NFC_V3_LAUNCH); wait_op_done(host, true); memcpy32_fromio(host->data_buf, host->main_area0, 16); - - if (this->options & NAND_BUSWIDTH_16) { - /* compress the ID info */ - host->data_buf[1] = host->data_buf[2]; - host->data_buf[2] = host->data_buf[4]; - host->data_buf[3] = host->data_buf[6]; - host->data_buf[4] = host->data_buf[8]; - host->data_buf[5] = host->data_buf[10]; - } } /* Request the NANDFC to perform a read of the NAND device ID. */ static void send_read_id_v1_v2(struct mxc_nand_host *host) { - struct nand_chip *this = &host->nand; - /* NANDFC buffer 0 is used for device ID output */ writew(host->active_cs << 4, NFC_V1_V2_BUF_ADDR); @@ -560,15 +572,6 @@ static void send_read_id_v1_v2(struct mxc_nand_host *host) wait_op_done(host, true); memcpy32_fromio(host->data_buf, host->main_area0, 16); - - if (this->options & NAND_BUSWIDTH_16) { - /* compress the ID info */ - host->data_buf[1] = host->data_buf[2]; - host->data_buf[2] = host->data_buf[4]; - host->data_buf[3] = host->data_buf[6]; - host->data_buf[4] = host->data_buf[8]; - host->data_buf[5] = host->data_buf[10]; - } } static uint16_t get_dev_status_v3(struct mxc_nand_host *host) @@ -694,9 +697,17 @@ static u_char mxc_nand_read_byte(struct mtd_info *mtd) if (host->status_request) return host->devtype_data->get_dev_status(host) & 0xFF; - ret = *(uint8_t *)(host->data_buf + host->buf_start); - host->buf_start++; + if (nand_chip->options & NAND_BUSWIDTH_16) { + /* only take the lower byte of each word */ + ret = *(uint16_t *)(host->data_buf + host->buf_start); + + host->buf_start += 2; + } else { + ret = *(uint8_t *)(host->data_buf + host->buf_start); + host->buf_start++; + } + pr_debug("%s: ret=0x%hhx (start=%u)\n", __func__, ret, host->buf_start); return ret; } @@ -825,6 +836,12 @@ static void copy_spare(struct mtd_info *mtd, bool bfrom) } } +/* + * MXC NANDFC can only perform full page+spare or spare-only read/write. When + * the upper layers perform a read/write buf operation, the saved column address + * is used to index into the full page. So usually this function is called with + * column == 0 (unless no column cycle is needed, indicated by column == -1). + */ static void mxc_do_addr_cycle(struct mtd_info *mtd, int column, int page_addr) { struct nand_chip *nand_chip = mtd->priv; @@ -832,16 +849,13 @@ static void mxc_do_addr_cycle(struct mtd_info *mtd, int column, int page_addr) /* Write out column address, if necessary */ if (column != -1) { - /* - * MXC NANDFC can only perform full page+spare or - * spare-only read/write. When the upper layers - * perform a read/write buf operation, the saved column - * address is used to index into the full page.
- */ - host->devtype_data->send_addr(host, 0, page_addr == -1); + host->devtype_data->send_addr(host, column & 0xff, + page_addr == -1); if (mtd->writesize > 512) /* another col addr cycle for 2k page */ - host->devtype_data->send_addr(host, 0, false); + host->devtype_data->send_addr(host, + (column >> 8) & 0xff, + false); } /* Write out page address, if necessary */ @@ -903,7 +917,7 @@ static void preset_v1(struct mtd_info *mtd) struct mxc_nand_host *host = nand_chip->priv; uint16_t config1 = 0; - if (nand_chip->ecc.mode == NAND_ECC_HW) + if (nand_chip->ecc.mode == NAND_ECC_HW && mtd->writesize) config1 |= NFC_V1_V2_CONFIG1_ECC_EN; if (!host->devtype_data->irqpending_quirk) @@ -931,9 +945,6 @@ static void preset_v2(struct mtd_info *mtd) struct mxc_nand_host *host = nand_chip->priv; uint16_t config1 = 0; - if (nand_chip->ecc.mode == NAND_ECC_HW) - config1 |= NFC_V1_V2_CONFIG1_ECC_EN; - config1 |= NFC_V2_CONFIG1_FP_INT; if (!host->devtype_data->irqpending_quirk) @@ -942,6 +953,9 @@ static void preset_v2(struct mtd_info *mtd) if (mtd->writesize) { uint16_t pages_per_block = mtd->erasesize / mtd->writesize; + if (nand_chip->ecc.mode == NAND_ECC_HW) + config1 |= NFC_V1_V2_CONFIG1_ECC_EN; + host->eccsize = get_eccsize(mtd); if (host->eccsize == 4) config1 |= NFC_V2_CONFIG1_ECC_MODE_4; @@ -999,9 +1013,6 @@ static void preset_v3(struct mtd_info *mtd) NFC_V3_CONFIG2_INT_MSK | NFC_V3_CONFIG2_NUM_ADDR_PHASE0; - if (chip->ecc.mode == NAND_ECC_HW) - config2 |= NFC_V3_CONFIG2_ECC_EN; - addr_phases = fls(chip->pagemask) >> 3; if (mtd->writesize == 2048) { @@ -1016,6 +1027,9 @@ static void preset_v3(struct mtd_info *mtd) } if (mtd->writesize) { + if (chip->ecc.mode == NAND_ECC_HW) + config2 |= NFC_V3_CONFIG2_ECC_EN; + config2 |= NFC_V3_CONFIG2_PPB( ffs(mtd->erasesize / mtd->writesize) - 6, host->devtype_data->ppb_shift); @@ -1066,6 +1080,9 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, host->status_request = true; host->devtype_data->send_cmd(host, command, true); + WARN_ONCE(column != -1 || page_addr != -1, + "Unexpected column/row value (cmd=%u, col=%d, row=%d)\n", + command, column, page_addr); mxc_do_addr_cycle(mtd, column, page_addr); break; @@ -1079,7 +1096,10 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, command = NAND_CMD_READ0; /* only READ0 is valid */ host->devtype_data->send_cmd(host, command, false); - mxc_do_addr_cycle(mtd, column, page_addr); + WARN_ONCE(column < 0, + "Unexpected column/row value (cmd=%u, col=%d, row=%d)\n", + command, column, page_addr); + mxc_do_addr_cycle(mtd, 0, page_addr); if (mtd->writesize > 512) host->devtype_data->send_cmd(host, @@ -1100,7 +1120,10 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, host->buf_start = column; host->devtype_data->send_cmd(host, command, false); - mxc_do_addr_cycle(mtd, column, page_addr); + WARN_ONCE(column < -1, + "Unexpected column/row value (cmd=%u, col=%d, row=%d)\n", + command, column, page_addr); + mxc_do_addr_cycle(mtd, 0, page_addr); break; case NAND_CMD_PAGEPROG: @@ -1108,6 +1131,9 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, copy_spare(mtd, false); host->devtype_data->send_page(mtd, NFC_INPUT); host->devtype_data->send_cmd(host, command, true); + WARN_ONCE(column != -1 || page_addr != -1, + "Unexpected column/row value (cmd=%u, col=%d, row=%d)\n", + command, column, page_addr); mxc_do_addr_cycle(mtd, column, page_addr); break; @@ -1115,15 +1141,29 @@ static void mxc_nand_command(struct mtd_info *mtd, unsigned command, 
host->devtype_data->send_cmd(host, command, true); mxc_do_addr_cycle(mtd, column, page_addr); host->devtype_data->send_read_id(host); - host->buf_start = column; + host->buf_start = 0; break; case NAND_CMD_ERASE1: case NAND_CMD_ERASE2: host->devtype_data->send_cmd(host, command, false); + WARN_ONCE(column != -1, + "Unexpected column value (cmd=%u, col=%d)\n", + command, column); mxc_do_addr_cycle(mtd, column, page_addr); break; + case NAND_CMD_PARAM: + host->devtype_data->send_cmd(host, command, false); + mxc_do_addr_cycle(mtd, column, page_addr); + host->devtype_data->send_page(mtd, NFC_OUTPUT); + memcpy32_fromio(host->data_buf, host->main_area0, 512); + host->buf_start = 0; + break; + default: + WARN_ONCE(1, "Unimplemented command (cmd=%u)\n", + command); + break; } } diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index df7eb4ff07d1..c2e1232cd45c 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -386,7 +386,7 @@ static int nand_default_block_markbad(struct mtd_info *mtd, loff_t ofs) uint8_t buf[2] = { 0, 0 }; int ret = 0, res, i = 0; - ops.datbuf = NULL; + memset(&ops, 0, sizeof(ops)); ops.oobbuf = buf; ops.ooboffs = chip->badblockpos; if (chip->options & NAND_BUSWIDTH_16) { @@ -566,6 +566,25 @@ void nand_wait_ready(struct mtd_info *mtd) EXPORT_SYMBOL_GPL(nand_wait_ready); /** + * nand_wait_status_ready - [GENERIC] Wait for the ready status after commands. + * @mtd: MTD device structure + * @timeo: Timeout in ms + * + * Wait for status ready (i.e. command done) or timeout. + */ +static void nand_wait_status_ready(struct mtd_info *mtd, unsigned long timeo) +{ + register struct nand_chip *chip = mtd->priv; + + timeo = jiffies + msecs_to_jiffies(timeo); + do { + if ((chip->read_byte(mtd) & NAND_STATUS_READY)) + break; + touch_softlockup_watchdog(); + } while (time_before(jiffies, timeo)); +} + +/** * nand_command - [DEFAULT] Send command to NAND device * @mtd: MTD device structure * @command: the command to be sent @@ -643,8 +662,8 @@ static void nand_command(struct mtd_info *mtd, unsigned int command, NAND_CTRL_CLE | NAND_CTRL_CHANGE); chip->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE); - while (!(chip->read_byte(mtd) & NAND_STATUS_READY)) - ; + /* EZ-NAND can take up to 250ms as per ONFi v4.0 */ + nand_wait_status_ready(mtd, 250); return; /* This applies to read commands */ @@ -740,8 +759,8 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command, NAND_NCE | NAND_CLE | NAND_CTRL_CHANGE); chip->cmd_ctrl(mtd, NAND_CMD_NONE, NAND_NCE | NAND_CTRL_CHANGE); - while (!(chip->read_byte(mtd) & NAND_STATUS_READY)) - ; + /* EZ-NAND can take up to 250ms as per ONFi v4.0 */ + nand_wait_status_ready(mtd, 250); return; case NAND_CMD_RNDOUT: @@ -968,7 +987,7 @@ int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) __func__, (unsigned long long)ofs, len); if (check_offs_len(mtd, ofs, len)) - ret = -EINVAL; + return -EINVAL; /* Align to last block address if size addresses end of the device */ if (ofs + len == mtd->size) @@ -1031,7 +1050,7 @@ int nand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) __func__, (unsigned long long)ofs, len); if (check_offs_len(mtd, ofs, len)) - ret = -EINVAL; + return -EINVAL; nand_get_device(mtd, FL_LOCKING); @@ -1716,9 +1735,9 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, int ret; nand_get_device(mtd, FL_READING); + memset(&ops, 0, sizeof(ops)); ops.len = len; ops.datbuf = buf; - ops.oobbuf = NULL; ops.mode = MTD_OPS_PLACE_OOB; ret =
nand_do_read_ops(mtd, from, &ops); *retlen = ops.retlen; @@ -2124,7 +2143,7 @@ static int nand_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip, /** - * nand_write_subpage_hwecc - [REPLACABLE] hardware ECC based subpage write + * nand_write_subpage_hwecc - [REPLACEABLE] hardware ECC based subpage write * @mtd: mtd info structure * @chip: nand chip info structure * @offset: column address of subpage within the page @@ -2508,9 +2527,9 @@ static int panic_nand_write(struct mtd_info *mtd, loff_t to, size_t len, /* Grab the device */ panic_nand_get_device(chip, mtd, FL_WRITING); + memset(&ops, 0, sizeof(ops)); ops.len = len; ops.datbuf = (uint8_t *)buf; - ops.oobbuf = NULL; ops.mode = MTD_OPS_PLACE_OOB; ret = nand_do_write_ops(mtd, to, &ops); @@ -2536,9 +2555,9 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len, int ret; nand_get_device(mtd, FL_WRITING); + memset(&ops, 0, sizeof(ops)); ops.len = len; ops.datbuf = (uint8_t *)buf; - ops.oobbuf = NULL; ops.mode = MTD_OPS_PLACE_OOB; ret = nand_do_write_ops(mtd, to, &ops); *retlen = ops.retlen; diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index 10b1f7a4fe50..a4615fcc3d00 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -38,8 +38,8 @@ #include <linux/platform_data/mtd-nand-pxa3xx.h> -#define CHIP_DELAY_TIMEOUT (2 * HZ/10) -#define NAND_STOP_DELAY (2 * HZ/50) +#define CHIP_DELAY_TIMEOUT msecs_to_jiffies(200) +#define NAND_STOP_DELAY msecs_to_jiffies(40) #define PAGE_CHUNK_SIZE (2048) /* @@ -605,11 +605,24 @@ static void start_data_dma(struct pxa3xx_nand_info *info) {} #endif +static irqreturn_t pxa3xx_nand_irq_thread(int irq, void *data) +{ + struct pxa3xx_nand_info *info = data; + + handle_data_pio(info); + + info->state = STATE_CMD_DONE; + nand_writel(info, NDSR, NDSR_WRDREQ | NDSR_RDDREQ); + + return IRQ_HANDLED; +} + static irqreturn_t pxa3xx_nand_irq(int irq, void *devid) { struct pxa3xx_nand_info *info = devid; unsigned int status, is_completed = 0, is_ready = 0; unsigned int ready, cmd_done; + irqreturn_t ret = IRQ_HANDLED; if (info->cs == 0) { ready = NDSR_FLASH_RDY; @@ -651,7 +664,8 @@ static irqreturn_t pxa3xx_nand_irq(int irq, void *devid) } else { info->state = (status & NDSR_RDDREQ) ? 
STATE_PIO_READING : STATE_PIO_WRITING; - handle_data_pio(info); + ret = IRQ_WAKE_THREAD; + goto NORMAL_IRQ_EXIT; } } if (status & cmd_done) { @@ -692,7 +706,7 @@ static irqreturn_t pxa3xx_nand_irq(int irq, void *devid) if (is_ready) complete(&info->dev_ready); NORMAL_IRQ_EXIT: - return IRQ_HANDLED; + return ret; } static inline int is_buf_blank(uint8_t *buf, size_t len) @@ -951,7 +965,7 @@ static void nand_cmdfunc(struct mtd_info *mtd, unsigned command, { struct pxa3xx_nand_host *host = mtd->priv; struct pxa3xx_nand_info *info = host->info_data; - int ret, exec_cmd; + int exec_cmd; /* * if this is a x16 device ,then convert the input @@ -983,9 +997,8 @@ static void nand_cmdfunc(struct mtd_info *mtd, unsigned command, info->need_wait = 1; pxa3xx_nand_start(info); - ret = wait_for_completion_timeout(&info->cmd_complete, - CHIP_DELAY_TIMEOUT); - if (!ret) { + if (!wait_for_completion_timeout(&info->cmd_complete, + CHIP_DELAY_TIMEOUT)) { dev_err(&info->pdev->dev, "Wait time out!!!\n"); /* Stop State Machine for next command cycle */ pxa3xx_nand_stop(info); @@ -1000,7 +1013,7 @@ static void nand_cmdfunc_extended(struct mtd_info *mtd, { struct pxa3xx_nand_host *host = mtd->priv; struct pxa3xx_nand_info *info = host->info_data; - int ret, exec_cmd, ext_cmd_type; + int exec_cmd, ext_cmd_type; /* * if this is a x16 device then convert the input @@ -1063,9 +1076,8 @@ static void nand_cmdfunc_extended(struct mtd_info *mtd, init_completion(&info->cmd_complete); pxa3xx_nand_start(info); - ret = wait_for_completion_timeout(&info->cmd_complete, - CHIP_DELAY_TIMEOUT); - if (!ret) { + if (!wait_for_completion_timeout(&info->cmd_complete, + CHIP_DELAY_TIMEOUT)) { dev_err(&info->pdev->dev, "Wait time out!!!\n"); /* Stop State Machine for next command cycle */ pxa3xx_nand_stop(info); @@ -1198,13 +1210,11 @@ static int pxa3xx_nand_waitfunc(struct mtd_info *mtd, struct nand_chip *this) { struct pxa3xx_nand_host *host = mtd->priv; struct pxa3xx_nand_info *info = host->info_data; - int ret; if (info->need_wait) { - ret = wait_for_completion_timeout(&info->dev_ready, - CHIP_DELAY_TIMEOUT); info->need_wait = 0; - if (!ret) { + if (!wait_for_completion_timeout(&info->dev_ready, + CHIP_DELAY_TIMEOUT)) { dev_err(&info->pdev->dev, "Ready time out!!!\n"); return NAND_STATUS_FAIL; } @@ -1508,6 +1518,8 @@ static int pxa3xx_nand_scan(struct mtd_info *mtd) return ret; } + memset(pxa3xx_flash_ids, 0, sizeof(pxa3xx_flash_ids)); + pxa3xx_flash_ids[0].name = f->name; pxa3xx_flash_ids[0].dev_id = (f->chip_id >> 8) & 0xffff; pxa3xx_flash_ids[0].pagesize = f->page_size; @@ -1710,7 +1722,9 @@ static int alloc_nand_resource(struct platform_device *pdev) /* initialize all interrupts to be disabled */ disable_int(info, NDSR_MASK); - ret = request_irq(irq, pxa3xx_nand_irq, 0, pdev->name, info); + ret = request_threaded_irq(irq, pxa3xx_nand_irq, + pxa3xx_nand_irq_thread, IRQF_ONESHOT, + pdev->name, info); if (ret < 0) { dev_err(&pdev->dev, "failed to request IRQ\n"); goto fail_free_buf; diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c index 35aef5edb588..0e02be47ce1d 100644 --- a/drivers/mtd/nand/s3c2410.c +++ b/drivers/mtd/nand/s3c2410.c @@ -948,8 +948,6 @@ static int s3c24xx_nand_probe(struct platform_device *pdev) cpu_type = platform_get_device_id(pdev)->driver_data; - pr_debug("s3c2410_nand_probe(%p)\n", pdev); - info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL); if (info == NULL) { err = -ENOMEM; @@ -1045,7 +1043,6 @@ static int s3c24xx_nand_probe(struct platform_device *pdev) 
s3c2410_nand_clk_set_state(info, CLOCK_SUSPEND); } - pr_debug("initialised ok\n"); return 0; exit_error: diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c index a21c378f096a..c3ce81c1a716 100644 --- a/drivers/mtd/nand/sh_flctl.c +++ b/drivers/mtd/nand/sh_flctl.c @@ -159,7 +159,6 @@ static void flctl_setup_dma(struct sh_flctl *flctl) return; memset(&cfg, 0, sizeof(cfg)); - cfg.slave_id = pdata->slave_id_fifo0_tx; cfg.direction = DMA_MEM_TO_DEV; cfg.dst_addr = (dma_addr_t)FLDTFIFO(flctl); cfg.src_addr = 0; @@ -175,7 +174,6 @@ static void flctl_setup_dma(struct sh_flctl *flctl) if (!flctl->chan_fifo0_rx) goto err; - cfg.slave_id = pdata->slave_id_fifo0_rx; cfg.direction = DMA_DEV_TO_MEM; cfg.dst_addr = 0; cfg.src_addr = (dma_addr_t)FLDTFIFO(flctl); diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 635ee0027691..43b3392ffee7 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1743,7 +1743,6 @@ static int onenand_panic_write(struct mtd_info *mtd, loff_t to, size_t len, struct onenand_chip *this = mtd->priv; int column, subpage; int written = 0; - int ret = 0; if (this->state == FL_PM_SUSPENDED) return -EBUSY; @@ -1786,15 +1785,10 @@ static int onenand_panic_write(struct mtd_info *mtd, loff_t to, size_t len, onenand_panic_wait(mtd); /* In partial page write we don't update bufferram */ - onenand_update_bufferram(mtd, to, !ret && !subpage); + onenand_update_bufferram(mtd, to, !subpage); if (ONENAND_IS_2PLANE(this)) { ONENAND_SET_BUFFERRAM1(this); - onenand_update_bufferram(mtd, to + this->writesize, !ret && !subpage); - } - - if (ret) { - printk(KERN_ERR "%s: write failed %d\n", __func__, ret); - break; + onenand_update_bufferram(mtd, to + this->writesize, !subpage); } written += thislen; @@ -1808,7 +1802,7 @@ static int onenand_panic_write(struct mtd_info *mtd, loff_t to, size_t len, } *retlen = written; - return ret; + return 0; } /** diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index 1c7308c2c77d..5d5d36272bb5 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -460,8 +460,7 @@ fsl_qspi_runcmd(struct fsl_qspi *q, u8 cmd, unsigned int addr, int len) writel((seqid << QUADSPI_IPCR_SEQID_SHIFT) | len, base + QUADSPI_IPCR); /* Wait for the interrupt. 
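 * wait_for_completion_timeout() returns 0 on timeout and the number
 * of remaining jiffies otherwise, which is why its return value can
 * be tested directly in the condition below.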
*/ - err = wait_for_completion_timeout(&q->c, msecs_to_jiffies(1000)); - if (!err) { + if (!wait_for_completion_timeout(&q->c, msecs_to_jiffies(1000))) { dev_err(q->dev, "cmd 0x%.2x timeout, addr@%.8x, FR:0x%.8x, SR:0x%.8x\n", cmd, addr, readl(base + QUADSPI_FR), @@ -830,27 +829,27 @@ static int fsl_qspi_probe(struct platform_device *pdev) ret = clk_prepare_enable(q->clk_en); if (ret) { - dev_err(dev, "can not enable the qspi_en clock\n"); + dev_err(dev, "cannot enable the qspi_en clock: %d\n", ret); return ret; } ret = clk_prepare_enable(q->clk); if (ret) { - dev_err(dev, "can not enable the qspi clock\n"); + dev_err(dev, "cannot enable the qspi clock: %d\n", ret); goto clk_failed; } /* find the irq */ ret = platform_get_irq(pdev, 0); if (ret < 0) { - dev_err(dev, "failed to get the irq\n"); + dev_err(dev, "failed to get the irq: %d\n", ret); goto irq_failed; } ret = devm_request_irq(dev, ret, fsl_qspi_irq_handler, 0, pdev->name, q); if (ret) { - dev_err(dev, "failed to request irq.\n"); + dev_err(dev, "failed to request irq: %d\n", ret); goto irq_failed; } diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index b6a5a0c269e1..14a5d2325dac 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -369,17 +369,13 @@ erase_err: return ret; } -static int spi_nor_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +static int stm_lock(struct spi_nor *nor, loff_t ofs, uint64_t len) { - struct spi_nor *nor = mtd_to_spi_nor(mtd); + struct mtd_info *mtd = nor->mtd; uint32_t offset = ofs; uint8_t status_old, status_new; int ret = 0; - ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_LOCK); - if (ret) - return ret; - status_old = read_sr(nor); if (offset < mtd->size - (mtd->size / 2)) @@ -402,26 +398,18 @@ static int spi_nor_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) (status_old & (SR_BP2 | SR_BP1 | SR_BP0))) { write_enable(nor); ret = write_sr(nor, status_new); - if (ret) - goto err; } -err: - spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_LOCK); return ret; } -static int spi_nor_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +static int stm_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len) { - struct spi_nor *nor = mtd_to_spi_nor(mtd); + struct mtd_info *mtd = nor->mtd; uint32_t offset = ofs; uint8_t status_old, status_new; int ret = 0; - ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_UNLOCK); - if (ret) - return ret; - status_old = read_sr(nor); if (offset+len > mtd->size - (mtd->size / 64)) @@ -444,15 +432,41 @@ static int spi_nor_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) (status_old & (SR_BP2 | SR_BP1 | SR_BP0))) { write_enable(nor); ret = write_sr(nor, status_new); - if (ret) - goto err; } -err: + return ret; +} + +static int spi_nor_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + int ret; + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_LOCK); + if (ret) + return ret; + + ret = nor->flash_lock(nor, ofs, len); + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_UNLOCK); return ret; } +static int spi_nor_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + struct spi_nor *nor = mtd_to_spi_nor(mtd); + int ret; + + ret = spi_nor_lock_and_prep(nor, SPI_NOR_OPS_UNLOCK); + if (ret) + return ret; + + ret = nor->flash_unlock(nor, ofs, len); + + spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_LOCK); + return ret; +} + /* Used when the "_ext_id" is two bytes at most */ #define INFO(_jedec_id, _ext_id, _sector_size, _n_sectors, _flags) \ ((kernel_ulong_t)&(struct flash_info) { \ @@ 
-524,6 +538,7 @@ static const struct spi_device_id spi_nor_ids[] = { { "en25q64", INFO(0x1c3017, 0, 64 * 1024, 128, SECT_4K) }, { "en25qh128", INFO(0x1c7018, 0, 64 * 1024, 256, 0) }, { "en25qh256", INFO(0x1c7019, 0, 64 * 1024, 512, 0) }, + { "en25s64", INFO(0x1c3817, 0, 64 * 1024, 128, 0) }, /* ESMT */ { "f25l32pa", INFO(0x8c2016, 0, 64 * 1024, 64, SECT_4K) }, @@ -553,6 +568,7 @@ static const struct spi_device_id spi_nor_ids[] = { { "mx25l3205d", INFO(0xc22016, 0, 64 * 1024, 64, 0) }, { "mx25l3255e", INFO(0xc29e16, 0, 64 * 1024, 64, SECT_4K) }, { "mx25l6405d", INFO(0xc22017, 0, 64 * 1024, 128, 0) }, + { "mx25u6435f", INFO(0xc22537, 0, 64 * 1024, 128, SECT_4K) }, { "mx25l12805d", INFO(0xc22018, 0, 64 * 1024, 256, 0) }, { "mx25l12855e", INFO(0xc22618, 0, 64 * 1024, 256, 0) }, { "mx25l25635e", INFO(0xc22019, 0, 64 * 1024, 512, 0) }, @@ -648,6 +664,7 @@ static const struct spi_device_id spi_nor_ids[] = { { "m25px80", INFO(0x207114, 0, 64 * 1024, 16, 0) }, /* Winbond -- w25x "blocks" are 64K, "sectors" are 4KiB */ + { "w25x05", INFO(0xef3010, 0, 64 * 1024, 1, SECT_4K) }, { "w25x10", INFO(0xef3011, 0, 64 * 1024, 2, SECT_4K) }, { "w25x20", INFO(0xef3012, 0, 64 * 1024, 4, SECT_4K) }, { "w25x40", INFO(0xef3013, 0, 64 * 1024, 8, SECT_4K) }, @@ -658,6 +675,7 @@ static const struct spi_device_id spi_nor_ids[] = { { "w25q32dw", INFO(0xef6016, 0, 64 * 1024, 64, SECT_4K) }, { "w25x64", INFO(0xef3017, 0, 64 * 1024, 128, SECT_4K) }, { "w25q64", INFO(0xef4017, 0, 64 * 1024, 128, SECT_4K) }, + { "w25q64dw", INFO(0xef6017, 0, 64 * 1024, 128, SECT_4K) }, { "w25q80", INFO(0xef5014, 0, 64 * 1024, 16, SECT_4K) }, { "w25q80bl", INFO(0xef4014, 0, 64 * 1024, 16, SECT_4K) }, { "w25q128", INFO(0xef4018, 0, 64 * 1024, 256, SECT_4K) }, @@ -1045,6 +1063,11 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode) /* nor protection support for STmicro chips */ if (JEDEC_MFR(info) == CFI_MFR_ST) { + nor->flash_lock = stm_lock; + nor->flash_unlock = stm_unlock; + } + + if (nor->flash_lock && nor->flash_unlock) { mtd->_lock = spi_nor_lock; mtd->_unlock = spi_nor_unlock; } diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index e579f9027c47..79316159eec6 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -9,6 +9,8 @@ #include <linux/slab.h> #include <linux/mtd/nand_ecc.h> +#include "mtd_test.h" + /* * Test the implementation for software ECC * @@ -274,6 +276,10 @@ static int nand_ecc_test_run(const size_t size) } pr_info("ok - %s-%zd\n", nand_ecc_test[i].name, size); + + err = mtdtest_relax(); + if (err) + break; } error: kfree(error_data); diff --git a/drivers/mtd/tests/mtd_test.h b/drivers/mtd/tests/mtd_test.h index f437c776c54f..4b7bee17c924 100644 --- a/drivers/mtd/tests/mtd_test.h +++ b/drivers/mtd/tests/mtd_test.h @@ -1,4 +1,16 @@ #include <linux/mtd/mtd.h> +#include <linux/sched.h> + +static inline int mtdtest_relax(void) +{ + cond_resched(); + if (signal_pending(current)) { + pr_info("aborting test due to pending signal!\n"); + return -EINTR; + } + + return 0; +} int mtdtest_erase_eraseblock(struct mtd_info *mtd, unsigned int ebnum); int mtdtest_scan_for_bad_eraseblocks(struct mtd_info *mtd, unsigned char *bbt, diff --git a/drivers/mtd/tests/nandbiterrs.c b/drivers/mtd/tests/nandbiterrs.c index 273f7e553954..09a4ccac53a2 100644 --- a/drivers/mtd/tests/nandbiterrs.c +++ b/drivers/mtd/tests/nandbiterrs.c @@ -320,6 +320,10 @@ static int overwrite_test(void) break; } + err = mtdtest_relax(); + if (err) + break; + opno++; 
} diff --git a/drivers/mtd/tests/oobtest.c b/drivers/mtd/tests/oobtest.c index 5e061186eab1..8e8525f0202f 100644 --- a/drivers/mtd/tests/oobtest.c +++ b/drivers/mtd/tests/oobtest.c @@ -70,7 +70,7 @@ static int write_eraseblock(int ebnum) int i; struct mtd_oob_ops ops; int err = 0; - loff_t addr = ebnum * mtd->erasesize; + loff_t addr = (loff_t)ebnum * mtd->erasesize; prandom_bytes_state(&rnd_state, writebuf, use_len_max * pgcnt); for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { @@ -112,7 +112,10 @@ static int write_whole_device(void) return err; if (i % 256 == 0) pr_info("written up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + return err; } pr_info("written %u eraseblocks\n", i); return 0; @@ -141,6 +144,31 @@ static size_t memcmpshow(loff_t addr, const void *cs, const void *ct, size_t cou return bitflips; } +/* + * Compare with 0xff and show the address, offset and data bytes at + * comparison failure. Return number of bitflips encountered. + */ +static size_t memffshow(loff_t addr, loff_t offset, const void *cs, + size_t count) +{ + const unsigned char *su1; + int res; + size_t i = 0; + size_t bitflips = 0; + + for (su1 = cs; 0 < count; ++su1, count--, i++) { + res = *su1 ^ 0xff; + if (res) { + pr_info("error @addr[0x%lx:0x%lx] 0x%x -> 0xff diff 0x%x\n", + (unsigned long)addr, (unsigned long)offset + i, + *su1, res); + bitflips += hweight8(res); + } + } + + return bitflips; +} + static int verify_eraseblock(int ebnum) { int i; @@ -203,6 +231,15 @@ static int verify_eraseblock(int ebnum) bitflips = memcmpshow(addr, readbuf + use_offset, writebuf + (use_len_max * i) + use_offset, use_len); + + /* verify pre-offset area for 0xff */ + bitflips += memffshow(addr, 0, readbuf, use_offset); + + /* verify post-(use_offset + use_len) area for 0xff */ + k = use_offset + use_len; + bitflips += memffshow(addr, k, readbuf + k, + mtd->ecclayout->oobavail - k); + if (bitflips > bitflip_limit) { pr_err("error: verify failed at %#llx\n", (long long)addr); @@ -212,34 +249,8 @@ static int verify_eraseblock(int ebnum) return -1; } } else if (bitflips) { - pr_info("ignoring error as within bitflip_limit\n"); + pr_info("ignoring errors as within bitflip limit\n"); } - - for (k = 0; k < use_offset; ++k) - if (readbuf[k] != 0xff) { - pr_err("error: verify 0xff " - "failed at %#llx\n", - (long long)addr); - errcnt += 1; - if (errcnt > 1000) { - pr_err("error: too " - "many errors\n"); - return -1; - } - } - for (k = use_offset + use_len; - k < mtd->ecclayout->oobavail; ++k) - if (readbuf[k] != 0xff) { - pr_err("error: verify 0xff " - "failed at %#llx\n", - (long long)addr); - errcnt += 1; - if (errcnt > 1000) { - pr_err("error: too " - "many errors\n"); - return -1; - } - } } if (vary_offset) do_vary_offset(); @@ -310,7 +321,10 @@ static int verify_all_eraseblocks(void) return err; if (i % 256 == 0) pr_info("verified up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + return err; } pr_info("verified %u eraseblocks\n", i); return 0; @@ -421,7 +435,10 @@ static int __init mtd_oobtest_init(void) goto out; if (i % 256 == 0) pr_info("verified up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("verified %u eraseblocks\n", i); @@ -634,7 +651,11 @@ static int __init mtd_oobtest_init(void) goto out; if (i % 256 == 0) pr_info("written up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; + addr += mtd->writesize; } } @@ -672,7 +693,10 @@ static int 
__init mtd_oobtest_init(void) } if (i % 256 == 0) pr_info("verified up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("verified %u eraseblocks\n", i); diff --git a/drivers/mtd/tests/pagetest.c b/drivers/mtd/tests/pagetest.c index 88296e888e9d..ba1890d5632c 100644 --- a/drivers/mtd/tests/pagetest.c +++ b/drivers/mtd/tests/pagetest.c @@ -407,7 +407,10 @@ static int __init mtd_pagetest_init(void) goto out; if (i % 256 == 0) pr_info("written up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("written %u eraseblocks\n", i); @@ -422,7 +425,10 @@ static int __init mtd_pagetest_init(void) goto out; if (i % 256 == 0) pr_info("verified up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("verified %u eraseblocks\n", i); diff --git a/drivers/mtd/tests/readtest.c b/drivers/mtd/tests/readtest.c index a54cf1511114..a3196b750a22 100644 --- a/drivers/mtd/tests/readtest.c +++ b/drivers/mtd/tests/readtest.c @@ -190,7 +190,10 @@ static int __init mtd_readtest_init(void) if (!err) err = ret; } - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } if (err) diff --git a/drivers/mtd/tests/speedtest.c b/drivers/mtd/tests/speedtest.c index 5ee9f7021020..5a6f31af06f9 100644 --- a/drivers/mtd/tests/speedtest.c +++ b/drivers/mtd/tests/speedtest.c @@ -185,7 +185,7 @@ static long calc_speed(void) (finish.tv_usec - start.tv_usec) / 1000; if (ms == 0) return 0; - k = goodebcnt * (mtd->erasesize / 1024) * 1000; + k = (uint64_t)goodebcnt * (mtd->erasesize / 1024) * 1000; do_div(k, ms); return k; } @@ -269,7 +269,10 @@ static int __init mtd_speedtest_init(void) err = write_eraseblock(i); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } stop_timing(); speed = calc_speed(); @@ -284,7 +287,10 @@ static int __init mtd_speedtest_init(void) err = read_eraseblock(i); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } stop_timing(); speed = calc_speed(); @@ -303,7 +309,10 @@ static int __init mtd_speedtest_init(void) err = write_eraseblock_by_page(i); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } stop_timing(); speed = calc_speed(); @@ -318,7 +327,10 @@ static int __init mtd_speedtest_init(void) err = read_eraseblock_by_page(i); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } stop_timing(); speed = calc_speed(); @@ -337,7 +349,10 @@ static int __init mtd_speedtest_init(void) err = write_eraseblock_by_2pages(i); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } stop_timing(); speed = calc_speed(); @@ -352,7 +367,10 @@ static int __init mtd_speedtest_init(void) err = read_eraseblock_by_2pages(i); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } stop_timing(); speed = calc_speed(); @@ -385,7 +403,11 @@ static int __init mtd_speedtest_init(void) err = multiblock_erase(i, j); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; + i += j; } stop_timing(); diff --git a/drivers/mtd/tests/stresstest.c b/drivers/mtd/tests/stresstest.c index c9d42cc2df1b..e509f8aa9a7e 100644 --- a/drivers/mtd/tests/stresstest.c +++ b/drivers/mtd/tests/stresstest.c @@ -96,7 +96,7 @@ static int do_read(void) if (offs + len > mtd->erasesize) len = mtd->erasesize - offs; } - addr = eb * mtd->erasesize + offs; + addr 
= (loff_t)eb * mtd->erasesize + offs; return mtdtest_read(mtd, addr, len, readbuf); } @@ -124,7 +124,7 @@ static int do_write(void) offsets[eb + 1] = 0; } } - addr = eb * mtd->erasesize + offs; + addr = (loff_t)eb * mtd->erasesize + offs; err = mtdtest_write(mtd, addr, len, writebuf); if (unlikely(err)) return err; @@ -221,7 +221,10 @@ static int __init mtd_stresstest_init(void) err = do_operation(); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("finished, %d operations done\n", op); diff --git a/drivers/mtd/tests/subpagetest.c b/drivers/mtd/tests/subpagetest.c index 7b59ef522d5e..aecc6ce5a9e1 100644 --- a/drivers/mtd/tests/subpagetest.c +++ b/drivers/mtd/tests/subpagetest.c @@ -95,7 +95,7 @@ static int write_eraseblock2(int ebnum) loff_t addr = (loff_t)ebnum * mtd->erasesize; for (k = 1; k < 33; ++k) { - if (addr + (subpgsize * k) > (ebnum + 1) * mtd->erasesize) + if (addr + (subpgsize * k) > (loff_t)(ebnum + 1) * mtd->erasesize) break; prandom_bytes_state(&rnd_state, writebuf, subpgsize * k); err = mtd_write(mtd, addr, subpgsize * k, &written, writebuf); @@ -195,7 +195,7 @@ static int verify_eraseblock2(int ebnum) loff_t addr = (loff_t)ebnum * mtd->erasesize; for (k = 1; k < 33; ++k) { - if (addr + (subpgsize * k) > (ebnum + 1) * mtd->erasesize) + if (addr + (subpgsize * k) > (loff_t)(ebnum + 1) * mtd->erasesize) break; prandom_bytes_state(&rnd_state, writebuf, subpgsize * k); clear_data(readbuf, subpgsize * k); @@ -269,7 +269,10 @@ static int verify_all_eraseblocks_ff(void) return err; if (i % 256 == 0) pr_info("verified up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + return err; } pr_info("verified %u eraseblocks\n", i); return 0; @@ -346,7 +349,10 @@ static int __init mtd_subpagetest_init(void) goto out; if (i % 256 == 0) pr_info("written up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("written %u eraseblocks\n", i); @@ -360,7 +366,10 @@ static int __init mtd_subpagetest_init(void) goto out; if (i % 256 == 0) pr_info("verified up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("verified %u eraseblocks\n", i); @@ -383,7 +392,10 @@ static int __init mtd_subpagetest_init(void) goto out; if (i % 256 == 0) pr_info("written up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("written %u eraseblocks\n", i); @@ -398,7 +410,10 @@ static int __init mtd_subpagetest_init(void) goto out; if (i % 256 == 0) pr_info("verified up to eraseblock %u\n", i); - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } pr_info("verified %u eraseblocks\n", i); diff --git a/drivers/mtd/tests/torturetest.c b/drivers/mtd/tests/torturetest.c index b55bc52a1340..e5d6e6d9532f 100644 --- a/drivers/mtd/tests/torturetest.c +++ b/drivers/mtd/tests/torturetest.c @@ -101,11 +101,11 @@ static inline int check_eraseblock(int ebnum, unsigned char *buf) { int err, retries = 0; size_t read; - loff_t addr = ebnum * mtd->erasesize; + loff_t addr = (loff_t)ebnum * mtd->erasesize; size_t len = mtd->erasesize; if (pgcnt) { - addr = (ebnum + 1) * mtd->erasesize - pgcnt * pgsize; + addr = (loff_t)(ebnum + 1) * mtd->erasesize - pgcnt * pgsize; len = pgcnt * pgsize; } @@ -155,11 +155,11 @@ static inline int write_pattern(int ebnum, void *buf) { int err; size_t written; - loff_t addr = ebnum * mtd->erasesize; + loff_t addr = (loff_t)ebnum * mtd->erasesize; size_t len = 
mtd->erasesize; if (pgcnt) { - addr = (ebnum + 1) * mtd->erasesize - pgcnt * pgsize; + addr = (loff_t)(ebnum + 1) * mtd->erasesize - pgcnt * pgsize; len = pgcnt * pgsize; } err = mtd_write(mtd, addr, len, &written, buf); @@ -279,7 +279,10 @@ static int __init tort_init(void) " for 0xFF... pattern\n"); goto out; } - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } } @@ -294,7 +297,10 @@ static int __init tort_init(void) err = write_pattern(i, patt); if (err) goto out; - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } /* Verify what we wrote */ @@ -314,7 +320,10 @@ static int __init tort_init(void) "0x55AA55..." : "0xAA55AA..."); goto out; } - cond_resched(); + + err = mtdtest_relax(); + if (err) + goto out; } } diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 9690cf9aaef5..b7f824d5ee88 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -1169,9 +1169,9 @@ static struct mtd_info * __init open_mtd_by_chdev(const char *mtd_dev) return ERR_PTR(err); /* MTD device number is defined by the major / minor numbers */ - major = imajor(path.dentry->d_inode); - minor = iminor(path.dentry->d_inode); - mode = path.dentry->d_inode->i_mode; + major = imajor(d_backing_inode(path.dentry)); + minor = iminor(d_backing_inode(path.dentry)); + mode = d_backing_inode(path.dentry)->i_mode; path_put(&path); if (major != MTD_CHAR_MAJOR || !S_ISCHR(mode)) return ERR_PTR(-EINVAL); diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 478e00cf2d9e..e844887732fb 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -314,7 +314,7 @@ struct ubi_volume_desc *ubi_open_volume_path(const char *pathname, int mode) if (error) return ERR_PTR(error); - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); mod = inode->i_mode; ubi_num = ubi_major2num(imajor(inode)); vol_id = iminor(inode) - 1; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 78dde56ae6e6..3a10551d64cf 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -82,6 +82,8 @@ #include <net/bond_3ad.h> #include <net/bond_alb.h> +#include "bonding_priv.h" + /*---------------------------- Module parameters ----------------------------*/ /* monitor all links that often (in milliseconds). <=0 disables monitoring */ diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 62694cfc05b6..b20b35acb47d 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -4,6 +4,7 @@ #include <net/netns/generic.h> #include <net/bonding.h> +#include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h new file mode 100644 index 000000000000..5a4d81a9437c --- /dev/null +++ b/drivers/net/bonding/bonding_priv.h @@ -0,0 +1,25 @@ +/* + * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. + * + * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes + * NCM: Network and Communications Management, Inc. + * + * BUT, I'm the one who modified it for ethernet, so: + * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov + * + * This software may be used and distributed according to the terms + * of the GNU Public License, incorporated herein by reference. 
+ * + */ + +#ifndef _BONDING_PRIV_H +#define _BONDING_PRIV_H + +#define DRV_VERSION "3.7.1" +#define DRV_RELDATE "April 27, 2011" +#define DRV_NAME "bonding" +#define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" + +#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n" + +#endif diff --git a/drivers/net/can/usb/kvaser_usb.c b/drivers/net/can/usb/kvaser_usb.c index 4643914859b2..8b17a9065b0b 100644 --- a/drivers/net/can/usb/kvaser_usb.c +++ b/drivers/net/can/usb/kvaser_usb.c @@ -1102,7 +1102,7 @@ static void kvaser_usb_rx_can_err(const struct kvaser_usb_net_priv *priv, if (msg->u.rx_can_header.flag & (MSG_FLAG_ERROR_FRAME | MSG_FLAG_NERR)) { - netdev_err(priv->netdev, "Unknow error (flags: 0x%02x)\n", + netdev_err(priv->netdev, "Unknown error (flags: 0x%02x)\n", msg->u.rx_can_header.flag); stats->rx_errors++; diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c index b36ee9e0d220..d686b9cac29f 100644 --- a/drivers/net/ethernet/8390/etherh.c +++ b/drivers/net/ethernet/8390/etherh.c @@ -523,7 +523,7 @@ static int etherh_addr(char *addr, struct expansion_card *ec) char *s; if (!ecard_readchunk(&cd, ec, 0xf5, 0)) { - printk(KERN_ERR "%s: unable to read podule description string\n", + printk(KERN_ERR "%s: unable to read module description string\n", dev_name(&ec->dev)); goto no_addr; } diff --git a/drivers/net/ethernet/altera/altera_msgdmahw.h b/drivers/net/ethernet/altera/altera_msgdmahw.h index eba070f16782..89cd11d86642 100644 --- a/drivers/net/ethernet/altera/altera_msgdmahw.h +++ b/drivers/net/ethernet/altera/altera_msgdmahw.h @@ -58,15 +58,12 @@ struct msgdma_extended_desc { /* Tx buffer control flags */ #define MSGDMA_DESC_CTL_TX_FIRST (MSGDMA_DESC_CTL_GEN_SOP | \ - MSGDMA_DESC_CTL_TR_ERR_IRQ | \ MSGDMA_DESC_CTL_GO) -#define MSGDMA_DESC_CTL_TX_MIDDLE (MSGDMA_DESC_CTL_TR_ERR_IRQ | \ - MSGDMA_DESC_CTL_GO) +#define MSGDMA_DESC_CTL_TX_MIDDLE (MSGDMA_DESC_CTL_GO) #define MSGDMA_DESC_CTL_TX_LAST (MSGDMA_DESC_CTL_GEN_EOP | \ MSGDMA_DESC_CTL_TR_COMP_IRQ | \ - MSGDMA_DESC_CTL_TR_ERR_IRQ | \ MSGDMA_DESC_CTL_GO) #define MSGDMA_DESC_CTL_TX_SINGLE (MSGDMA_DESC_CTL_GEN_SOP | \ diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 90a76306ad0f..0533c051a3e5 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -777,6 +777,8 @@ static int init_phy(struct net_device *dev) struct altera_tse_private *priv = netdev_priv(dev); struct phy_device *phydev; struct device_node *phynode; + bool fixed_link = false; + int rc = 0; /* Avoid init phy in case of no phy present */ if (!priv->phy_iface) @@ -789,13 +791,32 @@ static int init_phy(struct net_device *dev) phynode = of_parse_phandle(priv->device->of_node, "phy-handle", 0); if (!phynode) { - netdev_dbg(dev, "no phy-handle found\n"); - if (!priv->mdio) { - netdev_err(dev, - "No phy-handle nor local mdio specified\n"); - return -ENODEV; + /* check if a fixed-link is defined in device-tree */ + if (of_phy_is_fixed_link(priv->device->of_node)) { + rc = of_phy_register_fixed_link(priv->device->of_node); + if (rc < 0) { + netdev_err(dev, "cannot register fixed PHY\n"); + return rc; + } + + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. 
+ */ + phynode = of_node_get(priv->device->of_node); + fixed_link = true; + + netdev_dbg(dev, "fixed-link detected\n"); + phydev = of_phy_connect(dev, phynode, + &altera_tse_adjust_link, + 0, priv->phy_iface); + } else { + netdev_dbg(dev, "no phy-handle found\n"); + if (!priv->mdio) { + netdev_err(dev, "No phy-handle nor local mdio specified\n"); + return -ENODEV; + } + phydev = connect_local_phy(dev); } - phydev = connect_local_phy(dev); } else { netdev_dbg(dev, "phy-handle found\n"); phydev = of_phy_connect(dev, phynode, @@ -819,10 +840,10 @@ static int init_phy(struct net_device *dev) /* Broken HW is sometimes missing the pull-up resistor on the * MDIO line, which results in reads to non-existent devices returning * 0 rather than 0xffff. Catch this here and treat 0 as a non-existent - * device as well. + * device as well. If a fixed-link is used the phy_id is always 0. * Note: phydev->phy_id is the result of reading the UID PHY registers. */ - if (phydev->phy_id == 0) { + if ((phydev->phy_id == 0) && !fixed_link) { netdev_err(dev, "Bad PHY UID 0x%08x\n", phydev->phy_id); phy_disconnect(phydev); return -ENODEV; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 6f7dc81581ff..3558a36b1c2d 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -2485,8 +2485,10 @@ static void bnx2x_bz_fp(struct bnx2x *bp, int index) else if (bp->flags & GRO_ENABLE_FLAG) fp->mode = TPA_MODE_GRO; - /* We don't want TPA on an FCoE L2 ring */ - if (IS_FCOE_FP(fp)) + /* We don't want TPA if it's disabled in bp + * or if this is an FCoE L2 ring. + */ + if (bp->disable_tpa || IS_FCOE_FP(fp)) fp->disable_tpa = 1; } diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1270b189a9a2..069952fa5d64 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -18129,7 +18129,9 @@ static pci_ers_result_t tg3_io_error_detected(struct pci_dev *pdev, rtnl_lock(); - tp->pcierr_recovery = true; + /* We needn't recover from permanent error */ + if (state == pci_channel_io_frozen) + tp->pcierr_recovery = true; /* We probably don't have netdev yet */ if (!netdev || !netif_running(netdev)) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index f0285bcbe598..371f75e782e5 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -538,7 +538,7 @@ static ssize_t tp_la_write(struct file *file, const char __user *buf, char s[32]; unsigned long val; size_t size = min(sizeof(s) - 1, count); - struct adapter *adap = FILE_DATA(file)->i_private; + struct adapter *adap = file_inode(file)->i_private; if (copy_from_user(s, buf, size)) return -EFAULT; @@ -647,7 +647,7 @@ static int pm_stats_open(struct inode *inode, struct file *file) static ssize_t pm_stats_clear(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct adapter *adap = FILE_DATA(file)->i_private; + struct adapter *adap = file_inode(file)->i_private; t4_write_reg(adap, PM_RX_STAT_CONFIG_A, 0); t4_write_reg(adap, PM_TX_STAT_CONFIG_A, 0); @@ -1005,7 +1005,7 @@ static ssize_t mbox_write(struct file *file, const char __user *buf, &data[7], &c) < 8 || c != '\n') return -EINVAL; - ino = FILE_DATA(file); + ino = file_inode(file); mbox = (uintptr_t)ino->i_private & 7; adap = ino->i_private - mbox; addr = adap->regs + PF_REG(mbox, 
CIM_PF_MAILBOX_DATA_A); @@ -1034,7 +1034,7 @@ static ssize_t flash_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { loff_t pos = *ppos; - loff_t avail = FILE_DATA(file)->i_size; + loff_t avail = file_inode(file)->i_size; struct adapter *adap = file->private_data; if (pos < 0) @@ -1479,7 +1479,7 @@ static ssize_t rss_key_write(struct file *file, const char __user *buf, int i, j; u32 key[10]; char s[100], *p; - struct adapter *adap = FILE_DATA(file)->i_private; + struct adapter *adap = file_inode(file)->i_private; if (count > sizeof(s) - 1) return -EINVAL; @@ -1951,12 +1951,6 @@ static const struct file_operations mem_debugfs_fops = { .llseek = default_llseek, }; -static void set_debugfs_file_size(struct dentry *de, loff_t size) -{ - if (!IS_ERR(de) && de->d_inode) - de->d_inode->i_size = size; -} - static void add_debugfs_mem(struct adapter *adap, const char *name, unsigned int idx, unsigned int size_mb) { @@ -2072,9 +2066,8 @@ int t4_setup_debugfs(struct adapter *adap) } } - de = debugfs_create_file("flash", S_IRUSR, adap->debugfs_root, adap, - &flash_debugfs_fops); - set_debugfs_file_size(de, adap->params.sf_size); + de = debugfs_create_file_size("flash", S_IRUSR, adap->debugfs_root, adap, + &flash_debugfs_fops, adap->params.sf_size); return 0; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h index 8f418ba868bd..23f43a0f8950 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.h @@ -37,8 +37,6 @@ #include <linux/export.h> -#define FILE_DATA(_file) ((_file)->f_path.dentry->d_inode) - #define DEFINE_SIMPLE_DEBUGFS_FILE(name) \ static int name##_open(struct inode *inode, struct file *file) \ { \ diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index 291c87036e17..2a0dc127df3f 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -3347,7 +3347,7 @@ static int ehea_register_memory_hooks(void) { int ret = 0; - if (atomic_inc_and_test(&ehea_memory_hooks_registered)) + if (atomic_inc_return(&ehea_memory_hooks_registered) > 1) return 0; ret = ehea_create_busmap(); @@ -3381,12 +3381,14 @@ out3: out2: unregister_reboot_notifier(&ehea_reboot_nb); out: + atomic_dec(&ehea_memory_hooks_registered); return ret; } static void ehea_unregister_memory_hooks(void) { - if (atomic_read(&ehea_memory_hooks_registered)) + /* Only remove the hooks if we've registered them */ + if (atomic_read(&ehea_memory_hooks_registered) == 0) return; unregister_reboot_notifier(&ehea_reboot_nb); diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index af829c578400..7ace07dad6a3 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1508,7 +1508,8 @@ static int pxa168_eth_probe(struct platform_device *pdev) np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); if (!np) { dev_err(&pdev->dev, "missing phy-handle\n"); - return -EINVAL; + err = -EINVAL; + goto err_netdev; } of_property_read_u32(np, "reg", &pep->phy_addr); pep->phy_intf = of_get_phy_mode(pdev->dev.of_node); @@ -1526,7 +1527,7 @@ static int pxa168_eth_probe(struct platform_device *pdev) pep->smi_bus = mdiobus_alloc(); if (pep->smi_bus == NULL) { err = -ENOMEM; - goto err_base; + goto err_netdev; } pep->smi_bus->priv = pep; pep->smi_bus->name = "pxa168_eth smi"; @@ -1551,13 +1552,10 @@ err_mdiobus: 
mdiobus_unregister(pep->smi_bus); err_free_mdio: mdiobus_free(pep->smi_bus); -err_base: - iounmap(pep->base); err_netdev: free_netdev(dev); err_clk: - clk_disable(clk); - clk_put(clk); + clk_disable_unprepare(clk); return err; } @@ -1574,13 +1572,9 @@ static int pxa168_eth_remove(struct platform_device *pdev) if (pep->phy) phy_disconnect(pep->phy); if (pep->clk) { - clk_disable(pep->clk); - clk_put(pep->clk); - pep->clk = NULL; + clk_disable_unprepare(pep->clk); } - iounmap(pep->base); - pep->base = NULL; mdiobus_unregister(pep->smi_bus); mdiobus_free(pep->smi_bus); unregister_netdev(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index f0fbb4ade85d..4f7dc044601e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -939,21 +939,34 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, return err; } if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) { - /* compute slave's gid block */ - smp->attr_mod = cpu_to_be32(slave / 8); - /* execute cmd */ - err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, opcode_modifier, - vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); - if (!err) { - /* if needed, move slave gid to index 0 */ - if (slave % 8) - memcpy(outsmp->data, - outsmp->data + (slave % 8) * 8, 8); - /* delete all other gids */ - memset(outsmp->data + 8, 0, 56); + __be64 guid = mlx4_get_admin_guid(dev, slave, + port); + + /* set the PF admin guid to the FW/HW burned + * GUID, if it wasn't yet set + */ + if (slave == 0 && guid == 0) { + smp->attr_mod = 0; + err = mlx4_cmd_box(dev, + inbox->dma, + outbox->dma, + vhcr->in_modifier, + opcode_modifier, + vhcr->op, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) + return err; + mlx4_set_admin_guid(dev, + *(__be64 *)outsmp-> + data, slave, port); + } else { + memcpy(outsmp->data, &guid, 8); } - return err; + + /* clean all other gids */ + memset(outsmp->data + 8, 0, 56); + return 0; } if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, @@ -2350,6 +2363,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) oper_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; vf_oper->vport[port].vlan_idx = NO_INDX; vf_oper->vport[port].mac_idx = NO_INDX; + mlx4_set_random_admin_guid(dev, i, port); } spin_lock_init(&s_state->lock); } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 3f44e2bbb982..a2ddf3d75ff8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1102,20 +1102,21 @@ static int mlx4_en_check_rxfh_func(struct net_device *dev, u8 hfunc) struct mlx4_en_priv *priv = netdev_priv(dev); /* check if requested function is supported by the device */ - if ((hfunc == ETH_RSS_HASH_TOP && - !(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) || - (hfunc == ETH_RSS_HASH_XOR && - !(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR))) - return -EINVAL; + if (hfunc == ETH_RSS_HASH_TOP) { + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) + return -EINVAL; + if (!(dev->features & NETIF_F_RXHASH)) + en_warn(priv, "Toeplitz hash function should be used in conjunction with RX hashing for optimal performance\n"); + return 0; + } else if (hfunc == ETH_RSS_HASH_XOR) { + if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR)) + return -EINVAL; + if (dev->features & NETIF_F_RXHASH) + en_warn(priv, "Enabling both XOR Hash function 
and RX Hashing can limit RPS functionality\n"); + return 0; + } - priv->rss_hash_fn = hfunc; - if (hfunc == ETH_RSS_HASH_TOP && !(dev->features & NETIF_F_RXHASH)) - en_warn(priv, - "Toeplitz hash function should be used in conjunction with RX hashing for optimal performance\n"); - if (hfunc == ETH_RSS_HASH_XOR && (dev->features & NETIF_F_RXHASH)) - en_warn(priv, - "Enabling both XOR Hash function and RX Hashing can limit RPS functionality\n"); - return 0; + return -EINVAL; } static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key, @@ -1189,6 +1190,8 @@ static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index, priv->prof->rss_rings = rss_rings; if (key) memcpy(priv->rss_key, key, MLX4_EN_RSS_KEY_SIZE); + if (hfunc != ETH_RSS_HASH_NO_CHANGE) + priv->rss_hash_fn = hfunc; if (port_up) { err = mlx4_en_start_port(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 190fd624bdfe..2619c9fbf42d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -702,6 +702,8 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; } spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, + flr_slave); queue_work(priv->mfunc.master.comm_wq, &priv->mfunc.master.slave_flr_event_work); break; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index acceb75e8c44..ced5ecab5aa7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2260,6 +2260,37 @@ void mlx4_counter_free(struct mlx4_dev *dev, u32 idx) } EXPORT_SYMBOL_GPL(mlx4_counter_free); +void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; +} +EXPORT_SYMBOL_GPL(mlx4_set_admin_guid); + +__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->mfunc.master.vf_admin[entry].vport[port].guid; +} +EXPORT_SYMBOL_GPL(mlx4_get_admin_guid); + +void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + __be64 guid; + + /* hw GUID */ + if (entry == 0) + return; + + get_random_bytes((char *)&guid, sizeof(guid)); + guid &= ~(cpu_to_be64(1ULL << 56)); + guid |= cpu_to_be64(1ULL << 57); + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; +} + static int mlx4_setup_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index f30eeb730a86..502d3dd2c888 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -499,6 +499,7 @@ struct mlx4_vport_state { bool spoofchk; u32 link_state; u8 qos_vport; + __be64 guid; }; struct mlx4_vf_admin_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index df2238372ea7..8a64542abc16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -211,26 +211,28 @@ static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr) return 0; } +#define MLX5_U64_4K_PAGE_MASK ((~(u64)0U) << PAGE_SHIFT) 
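The dedicated 64-bit mask above exists because free_4k() (next hunk) masks a 64-bit DMA address: on 32-bit kernels PAGE_MASK is only an unsigned long, so addr & PAGE_MASK would zero the upper 32 address bits along with the page offset, and find_fw_page()/dma_unmap_page() would be handed a truncated address. A minimal userspace sketch of that failure mode, emulating the 32-bit unsigned long with uint32_t — hypothetical address, 4K pages assumed, not driver code:

#include <stdint.h>
#include <stdio.h>

#define SHIFT 12
#define MASK32 (~(uint32_t)((1u << SHIFT) - 1))  /* 32-bit PAGE_MASK: 0xfffff000 */
#define MASK64 ((~(uint64_t)0U) << SHIFT)        /* as in the patch above */

int main(void)
{
	uint64_t dma = 0x0000000123456000ULL;	/* DMA address above 4 GiB */

	/* zero-extended 32-bit mask clears the high half: prints 0x23456000 */
	printf("32-bit mask: 0x%llx\n", (unsigned long long)(dma & MASK32));
	/* 64-bit mask keeps it: prints 0x123456000 */
	printf("64-bit mask: 0x%llx\n", (unsigned long long)(dma & MASK64));
	return 0;
}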
+ static void free_4k(struct mlx5_core_dev *dev, u64 addr) { struct fw_page *fwp; int n; - fwp = find_fw_page(dev, addr & PAGE_MASK); + fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK); if (!fwp) { mlx5_core_warn(dev, "page not found\n"); return; } - n = (addr & ~PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; + n = (addr & ~MLX5_U64_4K_PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; fwp->free_count++; set_bit(n, &fwp->bitmask); if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { rb_erase(&fwp->rb_node, &dev->priv.page_root); if (fwp->free_count != 1) list_del(&fwp->list); - dma_unmap_page(&dev->pdev->dev, addr & PAGE_MASK, PAGE_SIZE, - DMA_BIDIRECTIONAL); + dma_unmap_page(&dev->pdev->dev, addr & MLX5_U64_4K_PAGE_MASK, + PAGE_SIZE, DMA_BIDIRECTIONAL); __free_page(fwp->page); kfree(fwp); } else if (fwp->free_count == 1) { diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c index 2bef655279f3..9b7e0a34c98b 100644 --- a/drivers/net/ethernet/ti/netcp_ethss.c +++ b/drivers/net/ethernet/ti/netcp_ethss.c @@ -1765,7 +1765,9 @@ static void netcp_ethss_link_state_action(struct gbe_priv *gbe_dev, ALE_PORT_STATE, ALE_PORT_STATE_FORWARD); - if (ndev && slave->open) + if (ndev && slave->open && + slave->link_interface != SGMII_LINK_MAC_PHY && + slave->link_interface != XGMII_LINK_MAC_PHY) netif_carrier_on(ndev); } else { writel(mac_control, GBE_REG_ADDR(slave, emac_regs, @@ -1773,7 +1775,9 @@ static void netcp_ethss_link_state_action(struct gbe_priv *gbe_dev, cpsw_ale_control_set(gbe_dev->ale, slave->port_num, ALE_PORT_STATE, ALE_PORT_STATE_DISABLE); - if (ndev) + if (ndev && + slave->link_interface != SGMII_LINK_MAC_PHY && + slave->link_interface != XGMII_LINK_MAC_PHY) netif_carrier_off(ndev); } diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c index 49ce7ece5af3..c9cb486c753d 100644 --- a/drivers/net/phy/mdio-gpio.c +++ b/drivers/net/phy/mdio-gpio.c @@ -80,7 +80,8 @@ static void mdio_dir(struct mdiobb_ctrl *ctrl, int dir) * assume the pin serves as pull-up. If direction is * output, the default value is high. 
*/ - gpio_set_value(bitbang->mdo, 1 ^ bitbang->mdo_active_low); + gpio_set_value_cansleep(bitbang->mdo, + 1 ^ bitbang->mdo_active_low); return; } @@ -96,7 +97,8 @@ static int mdio_get(struct mdiobb_ctrl *ctrl) struct mdio_gpio_info *bitbang = container_of(ctrl, struct mdio_gpio_info, ctrl); - return gpio_get_value(bitbang->mdio) ^ bitbang->mdio_active_low; + return gpio_get_value_cansleep(bitbang->mdio) ^ + bitbang->mdio_active_low; } static void mdio_set(struct mdiobb_ctrl *ctrl, int what) @@ -105,9 +107,11 @@ static void mdio_set(struct mdiobb_ctrl *ctrl, int what) container_of(ctrl, struct mdio_gpio_info, ctrl); if (bitbang->mdo) - gpio_set_value(bitbang->mdo, what ^ bitbang->mdo_active_low); + gpio_set_value_cansleep(bitbang->mdo, + what ^ bitbang->mdo_active_low); else - gpio_set_value(bitbang->mdio, what ^ bitbang->mdio_active_low); + gpio_set_value_cansleep(bitbang->mdio, + what ^ bitbang->mdio_active_low); } static void mdc_set(struct mdiobb_ctrl *ctrl, int what) @@ -115,7 +119,7 @@ static void mdc_set(struct mdiobb_ctrl *ctrl, int what) struct mdio_gpio_info *bitbang = container_of(ctrl, struct mdio_gpio_info, ctrl); - gpio_set_value(bitbang->mdc, what ^ bitbang->mdc_active_low); + gpio_set_value_cansleep(bitbang->mdc, what ^ bitbang->mdc_active_low); } static struct mdiobb_ops mdio_gpio_ops = { diff --git a/drivers/net/phy/mdio-mux-gpio.c b/drivers/net/phy/mdio-mux-gpio.c index 1a87a585e74d..66edd99bc302 100644 --- a/drivers/net/phy/mdio-mux-gpio.c +++ b/drivers/net/phy/mdio-mux-gpio.c @@ -12,33 +12,30 @@ #include <linux/module.h> #include <linux/phy.h> #include <linux/mdio-mux.h> -#include <linux/of_gpio.h> +#include <linux/gpio/consumer.h> #define DRV_VERSION "1.1" #define DRV_DESCRIPTION "GPIO controlled MDIO bus multiplexer driver" -#define MDIO_MUX_GPIO_MAX_BITS 8 - struct mdio_mux_gpio_state { - struct gpio_desc *gpio[MDIO_MUX_GPIO_MAX_BITS]; - unsigned int num_gpios; + struct gpio_descs *gpios; void *mux_handle; }; static int mdio_mux_gpio_switch_fn(int current_child, int desired_child, void *data) { - int values[MDIO_MUX_GPIO_MAX_BITS]; - unsigned int n; struct mdio_mux_gpio_state *s = data; + int values[s->gpios->ndescs]; + unsigned int n; if (current_child == desired_child) return 0; - for (n = 0; n < s->num_gpios; n++) { + for (n = 0; n < s->gpios->ndescs; n++) values[n] = (desired_child >> n) & 1; - } - gpiod_set_array_cansleep(s->num_gpios, s->gpio, values); + + gpiod_set_array_cansleep(s->gpios->ndescs, s->gpios->desc, values); return 0; } @@ -46,56 +43,33 @@ static int mdio_mux_gpio_switch_fn(int current_child, int desired_child, static int mdio_mux_gpio_probe(struct platform_device *pdev) { struct mdio_mux_gpio_state *s; - int num_gpios; - unsigned int n; int r; - if (!pdev->dev.of_node) - return -ENODEV; - - num_gpios = of_gpio_count(pdev->dev.of_node); - if (num_gpios <= 0 || num_gpios > MDIO_MUX_GPIO_MAX_BITS) - return -ENODEV; - s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; - s->num_gpios = num_gpios; - - for (n = 0; n < num_gpios; ) { - struct gpio_desc *gpio = gpiod_get_index(&pdev->dev, NULL, n, - GPIOD_OUT_LOW); - if (IS_ERR(gpio)) { - r = PTR_ERR(gpio); - goto err; - } - s->gpio[n] = gpio; - n++; - } + s->gpios = gpiod_get_array(&pdev->dev, NULL, GPIOD_OUT_LOW); + if (IS_ERR(s->gpios)) + return PTR_ERR(s->gpios); r = mdio_mux_init(&pdev->dev, mdio_mux_gpio_switch_fn, &s->mux_handle, s); - if (r == 0) { - pdev->dev.platform_data = s; - return 0; - } -err: - while (n) { - n--; - gpiod_put(s->gpio[n]); + if (r != 0) { + 
gpiod_put_array(s->gpios); + return r; } - return r; + + pdev->dev.platform_data = s; + return 0; } static int mdio_mux_gpio_remove(struct platform_device *pdev) { - unsigned int n; struct mdio_mux_gpio_state *s = dev_get_platdata(&pdev->dev); mdio_mux_uninit(s->mux_handle); - for (n = 0; n < s->num_gpios; n++) - gpiod_put(s->gpio[n]); + gpiod_put_array(s->gpios); return 0; } diff --git a/drivers/net/ppp/ppp_mppe.c b/drivers/net/ppp/ppp_mppe.c index 911b21602ff2..05005c660d4d 100644 --- a/drivers/net/ppp/ppp_mppe.c +++ b/drivers/net/ppp/ppp_mppe.c @@ -478,7 +478,6 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, struct blkcipher_desc desc = { .tfm = state->arc4 }; unsigned ccount; int flushed = MPPE_BITS(ibuf) & MPPE_BIT_FLUSHED; - int sanity = 0; struct scatterlist sg_in[1], sg_out[1]; if (isize <= PPP_HDRLEN + MPPE_OVHD) { @@ -514,31 +513,19 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, "mppe_decompress[%d]: ENCRYPTED bit not set!\n", state->unit); state->sanity_errors += 100; - sanity = 1; + goto sanity_error; } if (!state->stateful && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set in " "stateless mode!\n", state->unit); state->sanity_errors += 100; - sanity = 1; + goto sanity_error; } if (state->stateful && ((ccount & 0xff) == 0xff) && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set on " "flag packet!\n", state->unit); state->sanity_errors += 100; - sanity = 1; - } - - if (sanity) { - if (state->sanity_errors < SANITY_MAX) - return DECOMP_ERROR; - else - /* - * Take LCP down if the peer is sending too many bogons. - * We don't want to do this for a single or just a few - * instances since it could just be due to packet corruption. - */ - return DECOMP_FATALERROR; + goto sanity_error; } /* @@ -546,6 +533,13 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, */ if (!state->stateful) { + /* Discard late packet */ + if ((ccount - state->ccount) % MPPE_CCOUNT_SPACE + > MPPE_CCOUNT_SPACE / 2) { + state->sanity_errors++; + goto sanity_error; + } + /* RFC 3078, sec 8.1. Rekey for every packet. */ while (state->ccount != ccount) { mppe_rekey(state, 0); @@ -649,6 +643,16 @@ mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, state->sanity_errors >>= 1; return osize; + +sanity_error: + if (state->sanity_errors < SANITY_MAX) + return DECOMP_ERROR; + else + /* Take LCP down if the peer is sending too many bogons. + * We don't want to do this for a single or just a few + * instances since it could just be due to packet corruption. + */ + return DECOMP_FATALERROR; } /* diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 1470b5227834..07bb3c8f191b 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -50,7 +50,7 @@ config OF_ADDRESS_PCI config OF_IRQ def_bool y - depends on !SPARC + depends on !SPARC && IRQ_DOMAIN config OF_NET depends on NETDEVICES diff --git a/drivers/of/base.c b/drivers/of/base.c index a1aa0c7dee50..99764db0875a 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -568,6 +568,29 @@ bool of_device_is_available(const struct device_node *device) EXPORT_SYMBOL(of_device_is_available); /** + * of_device_is_big_endian - check if a device has BE registers + * + * @device: Node to check for endianness + * + * Returns true if the device has a "big-endian" property, or if the kernel + * was compiled for BE *and* the device has a "native-endian" property. + * Returns false otherwise. 
+ * + * Callers would nominally use ioread32be/iowrite32be if + * of_device_is_big_endian() == true, or readl/writel otherwise. + */ +bool of_device_is_big_endian(const struct device_node *device) +{ + if (of_property_read_bool(device, "big-endian")) + return true; + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && + of_property_read_bool(device, "native-endian")) + return true; + return false; +} +EXPORT_SYMBOL(of_device_is_big_endian); + +/** * of_get_parent - Get a node's parent if any * @node: Node to get parent * @@ -640,8 +663,9 @@ static struct device_node *__of_get_next_child(const struct device_node *node, * @node: parent node * @prev: previous child of the parent node, or NULL to get first * - * Returns a node pointer with refcount incremented, use - * of_node_put() on it when done. + * Returns a node pointer with refcount incremented, use of_node_put() on + * it when done. Returns NULL when prev is the last child. Decrements the + * refcount of prev. */ struct device_node *of_get_next_child(const struct device_node *node, struct device_node *prev) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 3a896c9aeb74..cde35c5d0191 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -109,6 +109,25 @@ int of_fdt_is_compatible(const void *blob, } /** + * of_fdt_is_big_endian - Return true if given node needs BE MMIO accesses + * @blob: A device tree blob + * @node: node to test + * + * Returns true if the node has a "big-endian" property, or if the kernel + * was compiled for BE *and* the node has a "native-endian" property. + * Returns false otherwise. + */ +bool of_fdt_is_big_endian(const void *blob, unsigned long node) +{ + if (fdt_getprop(blob, node, "big-endian", NULL)) + return true; + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && + fdt_getprop(blob, node, "native-endian", NULL)) + return true; + return false; +} + +/** * of_fdt_match - Return true if node matches a list of compatible values */ int of_fdt_match(const void *blob, unsigned long node, @@ -172,7 +191,7 @@ static void * unflatten_dt_node(void *blob, if (!pathp) return mem; - allocl = l++; + allocl = ++l; /* version 0x10 has a more compact unit name here instead of the full * path. 
we accumulate the full path size using "fpsize", we'll rebuild @@ -879,8 +898,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, endp = reg + (l / sizeof(__be32)); - pr_debug("memory scan node %s, reg size %d, data: %x %x %x %x,\n", - uname, l, reg[0], reg[1], reg[2], reg[3]); + pr_debug("memory scan node %s, reg size %d,\n", uname, l); while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) { u64 base, size; diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index e844907c9efa..18016341d5a9 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -23,6 +23,8 @@ #include <linux/i2c.h> #include <linux/i2c-mux.h> +#include <linux/bitops.h> + #include "of_private.h" static struct unittest_results { @@ -1109,6 +1111,59 @@ static const char *overlay_path(int nr) static const char *bus_path = "/testcase-data/overlay-node/test-bus"; +/* it is guaranteed that overlay ids are assigned in sequence */ +#define MAX_UNITTEST_OVERLAYS 256 +static unsigned long overlay_id_bits[BITS_TO_LONGS(MAX_UNITTEST_OVERLAYS)]; +static int overlay_first_id = -1; + +static void of_unittest_track_overlay(int id) +{ + if (overlay_first_id < 0) + overlay_first_id = id; + id -= overlay_first_id; + + /* we shouldn't need that many */ + BUG_ON(id >= MAX_UNITTEST_OVERLAYS); + overlay_id_bits[BIT_WORD(id)] |= BIT_MASK(id); +} + +static void of_unittest_untrack_overlay(int id) +{ + if (overlay_first_id < 0) + return; + id -= overlay_first_id; + BUG_ON(id >= MAX_UNITTEST_OVERLAYS); + overlay_id_bits[BIT_WORD(id)] &= ~BIT_MASK(id); +} + +static void of_unittest_destroy_tracked_overlays(void) +{ + int id, ret, defers; + + if (overlay_first_id < 0) + return; + + /* try until no defers */ + do { + defers = 0; + /* remove in reverse order */ + for (id = MAX_UNITTEST_OVERLAYS - 1; id >= 0; id--) { + if (!(overlay_id_bits[BIT_WORD(id)] & BIT_MASK(id))) + continue; + + ret = of_overlay_destroy(id + overlay_first_id); + if (ret != 0) { + defers++; + pr_warn("%s: overlay destroy failed for #%d\n", + __func__, id + overlay_first_id); + continue; + } + + overlay_id_bits[BIT_WORD(id)] &= ~BIT_MASK(id); + } + } while (defers > 0); +} + static int of_unittest_apply_overlay(int unittest_nr, int overlay_nr, int *overlay_id) { @@ -1130,6 +1185,7 @@ static int of_unittest_apply_overlay(int unittest_nr, int overlay_nr, goto out; } id = ret; + of_unittest_track_overlay(id); ret = 0; @@ -1343,6 +1399,7 @@ static void of_unittest_overlay_6(void) return; } ov_id[i] = ret; + of_unittest_track_overlay(ov_id[i]); } for (i = 0; i < 2; i++) { @@ -1367,6 +1424,7 @@ static void of_unittest_overlay_6(void) PDEV_OVERLAY)); return; } + of_unittest_untrack_overlay(ov_id[i]); } for (i = 0; i < 2; i++) { @@ -1411,6 +1469,7 @@ static void of_unittest_overlay_8(void) return; } ov_id[i] = ret; + of_unittest_track_overlay(ov_id[i]); } /* now try to remove first overlay (it should fail) */ @@ -1433,6 +1492,7 @@ static void of_unittest_overlay_8(void) PDEV_OVERLAY)); return; } + of_unittest_untrack_overlay(ov_id[i]); } unittest(1, "overlay test %d passed\n", 8); @@ -1855,6 +1915,8 @@ static void __init of_unittest_overlay(void) of_unittest_overlay_i2c_cleanup(); #endif + of_unittest_destroy_tracked_overlays(); + out: of_node_put(bus_np); } diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 3f493459378f..dd92c5edf219 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -138,22 +138,22 @@ static int __oprofilefs_create_file(struct dentry *root, char const 
*name, struct dentry *dentry; struct inode *inode; - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_alloc_name(root, name); if (!dentry) { - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return -ENOMEM; } inode = oprofilefs_get_inode(root->d_sb, S_IFREG | perm); if (!inode) { dput(dentry); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return -ENOMEM; } inode->i_fop = fops; inode->i_private = priv; d_add(dentry, inode); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return 0; } @@ -215,22 +215,22 @@ struct dentry *oprofilefs_mkdir(struct dentry *parent, char const *name) struct dentry *dentry; struct inode *inode; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); dentry = d_alloc_name(parent, name); if (!dentry) { - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return NULL; } inode = oprofilefs_get_inode(parent->d_sb, S_IFDIR | 0755); if (!inode) { dput(dentry); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return NULL; } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; d_add(dentry, inode); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return dentry; } diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig index 440ed776efd4..2a6531a5fde8 100644 --- a/drivers/platform/chrome/Kconfig +++ b/drivers/platform/chrome/Kconfig @@ -4,7 +4,7 @@ menuconfig CHROME_PLATFORMS bool "Platform support for Chrome hardware" - depends on X86 + depends on X86 || ARM ---help--- Say Y here to get to see options for platform support for various Chromebooks and Chromeboxes. This option alone does @@ -16,8 +16,7 @@ if CHROME_PLATFORMS config CHROMEOS_LAPTOP tristate "Chrome OS Laptop" - depends on I2C - depends on DMI + depends on I2C && DMI && X86 ---help--- This driver instantiates i2c and smbus devices such as light sensors and touchpads. @@ -27,6 +26,7 @@ config CHROMEOS_LAPTOP config CHROMEOS_PSTORE tristate "Chrome OS pstore support" + depends on X86 ---help--- This module instantiates the persistent storage on x86 ChromeOS devices. It can be used to store away console logs and crash @@ -38,5 +38,25 @@ config CHROMEOS_PSTORE If you have a supported Chromebook, choose Y or M here. The module will be called chromeos_pstore. +config CROS_EC_CHARDEV + tristate "Chrome OS Embedded Controller userspace device interface" + depends on MFD_CROS_EC + ---help--- + This driver adds support to talk with the ChromeOS EC from userspace. + + If you have a supported Chromebook, choose Y or M here. + The module will be called cros_ec_dev. + +config CROS_EC_LPC + tristate "ChromeOS Embedded Controller (LPC)" + depends on MFD_CROS_EC && (X86 || COMPILE_TEST) + help + If you say Y here, you get support for talking to the ChromeOS EC + over an LPC bus. This uses a simple byte-level protocol with a + checksum. This is used for userspace access only. The kernel + typically has its own communication methods. + + To compile this driver as a module, choose M here: the + module will be called cros_ec_lpc. 
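For reference, a hypothetical .config fragment enabling the two new entries above as modules; the symbol names come from this Kconfig, and the MFD_CROS_EC dependency is assumed to be satisfied elsewhere in the tree:

# hypothetical .config fragment, not part of this patch
CONFIG_CHROME_PLATFORMS=y
CONFIG_MFD_CROS_EC=y
# userspace character-device interface (cros_ec_dev.c below)
CONFIG_CROS_EC_CHARDEV=m
# LPC transport, X86 or COMPILE_TEST only
CONFIG_CROS_EC_LPC=m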
endif # CHROMEOS_PLATFORMS diff --git a/drivers/platform/chrome/Makefile b/drivers/platform/chrome/Makefile index 2b860ca7450f..bd8d8601e875 100644 --- a/drivers/platform/chrome/Makefile +++ b/drivers/platform/chrome/Makefile @@ -1,3 +1,6 @@ obj-$(CONFIG_CHROMEOS_LAPTOP) += chromeos_laptop.o obj-$(CONFIG_CHROMEOS_PSTORE) += chromeos_pstore.o +cros_ec_devs-objs := cros_ec_dev.o cros_ec_sysfs.o cros_ec_lightbar.o +obj-$(CONFIG_CROS_EC_CHARDEV) += cros_ec_devs.o +obj-$(CONFIG_CROS_EC_LPC) += cros_ec_lpc.o diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c index b84fdd6b629b..a04019ab9feb 100644 --- a/drivers/platform/chrome/chromeos_laptop.c +++ b/drivers/platform/chrome/chromeos_laptop.c @@ -133,12 +133,13 @@ static struct i2c_client *__add_probed_i2c_device( const char *name, int bus, struct i2c_board_info *info, - const unsigned short *addrs) + const unsigned short *alt_addr_list) { const struct dmi_device *dmi_dev; const struct dmi_dev_onboard *dev_data; struct i2c_adapter *adapter; - struct i2c_client *client; + struct i2c_client *client = NULL; + const unsigned short addr_list[] = { info->addr, I2C_CLIENT_END }; if (bus < 0) return NULL; @@ -169,8 +170,28 @@ static struct i2c_client *__add_probed_i2c_device( return NULL; } - /* add the i2c device */ - client = i2c_new_probed_device(adapter, info, addrs, NULL); + /* + * Add the i2c device. If we can't detect it at the primary + * address we scan secondary addresses. In any case the client + * structure gets assigned primary address. + */ + client = i2c_new_probed_device(adapter, info, addr_list, NULL); + if (!client && alt_addr_list) { + struct i2c_board_info dummy_info = { + I2C_BOARD_INFO("dummy", info->addr), + }; + struct i2c_client *dummy; + + dummy = i2c_new_probed_device(adapter, &dummy_info, + alt_addr_list, NULL); + if (dummy) { + pr_debug("%s %d-%02x is probed at %02x\n", + __func__, bus, info->addr, dummy->addr); + i2c_unregister_device(dummy); + client = i2c_new_device(adapter, info); + } + } + if (!client) pr_notice("%s failed to register device %d-%02x\n", __func__, bus, info->addr); @@ -254,12 +275,10 @@ static struct i2c_client *add_i2c_device(const char *name, enum i2c_adapter_type type, struct i2c_board_info *info) { - const unsigned short addr_list[] = { info->addr, I2C_CLIENT_END }; - return __add_probed_i2c_device(name, find_i2c_adapter_num(type), info, - addr_list); + NULL); } static int setup_cyapa_tp(enum i2c_adapter_type type) @@ -275,7 +294,6 @@ static int setup_cyapa_tp(enum i2c_adapter_type type) static int setup_atmel_224s_tp(enum i2c_adapter_type type) { const unsigned short addr_list[] = { ATMEL_TP_I2C_BL_ADDR, - ATMEL_TP_I2C_ADDR, I2C_CLIENT_END }; if (tp) return 0; @@ -289,7 +307,6 @@ static int setup_atmel_224s_tp(enum i2c_adapter_type type) static int setup_atmel_1664s_ts(enum i2c_adapter_type type) { const unsigned short addr_list[] = { ATMEL_TS_I2C_BL_ADDR, - ATMEL_TS_I2C_ADDR, I2C_CLIENT_END }; if (ts) return 0; diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c new file mode 100644 index 000000000000..6090d0b2826f --- /dev/null +++ b/drivers/platform/chrome/cros_ec_dev.c @@ -0,0 +1,274 @@ +/* + * cros_ec_dev - expose the Chrome OS Embedded Controller to user-space + * + * Copyright (C) 2014 Google, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/uaccess.h> + +#include "cros_ec_dev.h" + +/* Device variables */ +#define CROS_MAX_DEV 128 +static struct class *cros_class; +static int ec_major; + +/* Basic communication */ +static int ec_get_version(struct cros_ec_device *ec, char *str, int maxlen) +{ + struct ec_response_get_version *resp; + static const char * const current_image_name[] = { + "unknown", "read-only", "read-write", "invalid", + }; + struct cros_ec_command msg = { + .version = 0, + .command = EC_CMD_GET_VERSION, + .outdata = { 0 }, + .outsize = 0, + .indata = { 0 }, + .insize = sizeof(*resp), + }; + int ret; + + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + return ret; + + if (msg.result != EC_RES_SUCCESS) { + snprintf(str, maxlen, + "%s\nUnknown EC version: EC returned %d\n", + CROS_EC_DEV_VERSION, msg.result); + return 0; + } + + resp = (struct ec_response_get_version *)msg.indata; + if (resp->current_image >= ARRAY_SIZE(current_image_name)) + resp->current_image = 3; /* invalid */ + + snprintf(str, maxlen, "%s\n%s\n%s\n%s\n", CROS_EC_DEV_VERSION, + resp->version_string_ro, resp->version_string_rw, + current_image_name[resp->current_image]); + + return 0; +} + +/* Device file ops */ +static int ec_device_open(struct inode *inode, struct file *filp) +{ + filp->private_data = container_of(inode->i_cdev, + struct cros_ec_device, cdev); + return 0; +} + +static int ec_device_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static ssize_t ec_device_read(struct file *filp, char __user *buffer, + size_t length, loff_t *offset) +{ + struct cros_ec_device *ec = filp->private_data; + char msg[sizeof(struct ec_response_get_version) + + sizeof(CROS_EC_DEV_VERSION)]; + size_t count; + int ret; + + if (*offset != 0) + return 0; + + ret = ec_get_version(ec, msg, sizeof(msg)); + if (ret) + return ret; + + count = min(length, strlen(msg)); + + if (copy_to_user(buffer, msg, count)) + return -EFAULT; + + *offset = count; + return count; +} + +/* Ioctls */ +static long ec_device_ioctl_xcmd(struct cros_ec_device *ec, void __user *arg) +{ + long ret; + struct cros_ec_command s_cmd = { }; + + if (copy_from_user(&s_cmd, arg, sizeof(s_cmd))) + return -EFAULT; + + ret = cros_ec_cmd_xfer(ec, &s_cmd); + /* Only copy data to userland if data was received. 
*/ + if (ret < 0) + return ret; + + if (copy_to_user(arg, &s_cmd, sizeof(s_cmd))) + return -EFAULT; + + return 0; +} + +static long ec_device_ioctl_readmem(struct cros_ec_device *ec, void __user *arg) +{ + struct cros_ec_readmem s_mem = { }; + long num; + + /* Not every platform supports direct reads */ + if (!ec->cmd_readmem) + return -ENOTTY; + + if (copy_from_user(&s_mem, arg, sizeof(s_mem))) + return -EFAULT; + + num = ec->cmd_readmem(ec, s_mem.offset, s_mem.bytes, s_mem.buffer); + if (num <= 0) + return num; + + if (copy_to_user((void __user *)arg, &s_mem, sizeof(s_mem))) + return -EFAULT; + + return 0; +} + +static long ec_device_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct cros_ec_device *ec = filp->private_data; + + if (_IOC_TYPE(cmd) != CROS_EC_DEV_IOC) + return -ENOTTY; + + switch (cmd) { + case CROS_EC_DEV_IOCXCMD: + return ec_device_ioctl_xcmd(ec, (void __user *)arg); + case CROS_EC_DEV_IOCRDMEM: + return ec_device_ioctl_readmem(ec, (void __user *)arg); + } + + return -ENOTTY; +} + +/* Module initialization */ +static const struct file_operations fops = { + .open = ec_device_open, + .release = ec_device_release, + .read = ec_device_read, + .unlocked_ioctl = ec_device_ioctl, +}; + +static int ec_device_probe(struct platform_device *pdev) +{ + struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent); + int retval = -ENOTTY; + dev_t devno = MKDEV(ec_major, 0); + + /* Instantiate it (and remember the EC) */ + cdev_init(&ec->cdev, &fops); + + retval = cdev_add(&ec->cdev, devno, 1); + if (retval) { + dev_err(&pdev->dev, ": failed to add character device\n"); + return retval; + } + + ec->vdev = device_create(cros_class, NULL, devno, ec, + CROS_EC_DEV_NAME); + if (IS_ERR(ec->vdev)) { + retval = PTR_ERR(ec->vdev); + dev_err(&pdev->dev, ": failed to create device\n"); + cdev_del(&ec->cdev); + return retval; + } + + /* Initialize extra interfaces */ + ec_dev_sysfs_init(ec); + ec_dev_lightbar_init(ec); + + return 0; +} + +static int ec_device_remove(struct platform_device *pdev) +{ + struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent); + + ec_dev_lightbar_remove(ec); + ec_dev_sysfs_remove(ec); + device_destroy(cros_class, MKDEV(ec_major, 0)); + cdev_del(&ec->cdev); + return 0; +} + +static struct platform_driver cros_ec_dev_driver = { + .driver = { + .name = "cros-ec-ctl", + }, + .probe = ec_device_probe, + .remove = ec_device_remove, +}; + +static int __init cros_ec_dev_init(void) +{ + int ret; + dev_t dev = 0; + + cros_class = class_create(THIS_MODULE, "chromeos"); + if (IS_ERR(cros_class)) { + pr_err(CROS_EC_DEV_NAME ": failed to register device class\n"); + return PTR_ERR(cros_class); + } + + /* Get a range of minor numbers (starting with 0) to work with */ + ret = alloc_chrdev_region(&dev, 0, CROS_MAX_DEV, CROS_EC_DEV_NAME); + if (ret < 0) { + pr_err(CROS_EC_DEV_NAME ": alloc_chrdev_region() failed\n"); + goto failed_chrdevreg; + } + ec_major = MAJOR(dev); + + /* Register the driver */ + ret = platform_driver_register(&cros_ec_dev_driver); + if (ret < 0) { + pr_warn(CROS_EC_DEV_NAME ": can't register driver: %d\n", ret); + goto failed_devreg; + } + return 0; + +failed_devreg: + unregister_chrdev_region(MKDEV(ec_major, 0), CROS_MAX_DEV); +failed_chrdevreg: + class_destroy(cros_class); + return ret; +} + +static void __exit cros_ec_dev_exit(void) +{ + platform_driver_unregister(&cros_ec_dev_driver); + unregister_chrdev(ec_major, CROS_EC_DEV_NAME); + class_destroy(cros_class); +} + +module_init(cros_ec_dev_init); 
+module_exit(cros_ec_dev_exit); + +MODULE_AUTHOR("Bill Richardson <wfrichar@chromium.org>"); +MODULE_DESCRIPTION("Userspace interface to the Chrome OS Embedded Controller"); +MODULE_VERSION("1.0"); +MODULE_LICENSE("GPL"); diff --git a/drivers/platform/chrome/cros_ec_dev.h b/drivers/platform/chrome/cros_ec_dev.h new file mode 100644 index 000000000000..45d67f7e518c --- /dev/null +++ b/drivers/platform/chrome/cros_ec_dev.h @@ -0,0 +1,53 @@ +/* + * cros_ec_dev - expose the Chrome OS Embedded Controller to userspace + * + * Copyright (C) 2014 Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _CROS_EC_DEV_H_ +#define _CROS_EC_DEV_H_ + +#include <linux/ioctl.h> +#include <linux/types.h> +#include <linux/mfd/cros_ec.h> + +#define CROS_EC_DEV_NAME "cros_ec" +#define CROS_EC_DEV_VERSION "1.0.0" + +/* + * @offset: within EC_LPC_ADDR_MEMMAP region + * @bytes: number of bytes to read. zero means "read a string" (including '\0') + * (at most only EC_MEMMAP_SIZE bytes can be read) + * @buffer: where to store the result + * ioctl returns the number of bytes read, negative on error + */ +struct cros_ec_readmem { + uint32_t offset; + uint32_t bytes; + uint8_t buffer[EC_MEMMAP_SIZE]; +}; + +#define CROS_EC_DEV_IOC 0xEC +#define CROS_EC_DEV_IOCXCMD _IOWR(CROS_EC_DEV_IOC, 0, struct cros_ec_command) +#define CROS_EC_DEV_IOCRDMEM _IOWR(CROS_EC_DEV_IOC, 1, struct cros_ec_readmem) + +void ec_dev_sysfs_init(struct cros_ec_device *); +void ec_dev_sysfs_remove(struct cros_ec_device *); + +void ec_dev_lightbar_init(struct cros_ec_device *); +void ec_dev_lightbar_remove(struct cros_ec_device *); + +#endif /* _CROS_EC_DEV_H_ */ diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c new file mode 100644 index 000000000000..b4ff47a9069a --- /dev/null +++ b/drivers/platform/chrome/cros_ec_lightbar.c @@ -0,0 +1,367 @@ +/* + * cros_ec_lightbar - expose the Chromebook Pixel lightbar to userspace + * + * Copyright (C) 2014 Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#define pr_fmt(fmt) "cros_ec_lightbar: " fmt + +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/mfd/cros_ec.h> +#include <linux/mfd/cros_ec_commands.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/uaccess.h> + +#include "cros_ec_dev.h" + +/* Rate-limit the lightbar interface to prevent DoS. */ +static unsigned long lb_interval_jiffies = 50 * HZ / 1000; + +static ssize_t interval_msec_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned long msec = lb_interval_jiffies * 1000 / HZ; + + return scnprintf(buf, PAGE_SIZE, "%lu\n", msec); +} + +static ssize_t interval_msec_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msec; + + if (kstrtoul(buf, 0, &msec)) + return -EINVAL; + + lb_interval_jiffies = msec * HZ / 1000; + + return count; +} + +static DEFINE_MUTEX(lb_mutex); +/* Return 0 if able to throttle correctly, error otherwise */ +static int lb_throttle(void) +{ + static unsigned long last_access; + unsigned long now, next_timeslot; + long delay; + int ret = 0; + + mutex_lock(&lb_mutex); + + now = jiffies; + next_timeslot = last_access + lb_interval_jiffies; + + if (time_before(now, next_timeslot)) { + delay = (long)(next_timeslot) - (long)now; + set_current_state(TASK_INTERRUPTIBLE); + if (schedule_timeout(delay) > 0) { + /* interrupted - just abort */ + ret = -EINTR; + goto out; + } + now = jiffies; + } + + last_access = now; +out: + mutex_unlock(&lb_mutex); + + return ret; +} + +#define INIT_MSG(P, R) { \ + .command = EC_CMD_LIGHTBAR_CMD, \ + .outsize = sizeof(*P), \ + .insize = sizeof(*R), \ + } + +static int get_lightbar_version(struct cros_ec_device *ec, + uint32_t *ver_ptr, uint32_t *flg_ptr) +{ + struct ec_params_lightbar *param; + struct ec_response_lightbar *resp; + struct cros_ec_command msg = INIT_MSG(param, resp); + int ret; + + param = (struct ec_params_lightbar *)msg.outdata; + param->cmd = LIGHTBAR_CMD_VERSION; + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + return 0; + + switch (msg.result) { + case EC_RES_INVALID_PARAM: + /* Pixel had no version command. */ + if (ver_ptr) + *ver_ptr = 0; + if (flg_ptr) + *flg_ptr = 0; + return 1; + + case EC_RES_SUCCESS: + resp = (struct ec_response_lightbar *)msg.indata; + + /* Future devices w/lightbars should implement this command */ + if (ver_ptr) + *ver_ptr = resp->version.num; + if (flg_ptr) + *flg_ptr = resp->version.flags; + return 1; + } + + /* Anything else (ie, EC_RES_INVALID_COMMAND) - no lightbar */ + return 0; +} + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + uint32_t version, flags; + struct cros_ec_device *ec = dev_get_drvdata(dev); + int ret; + + ret = lb_throttle(); + if (ret) + return ret; + + /* This should always succeed, because we check during init. 
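+	 * The lightbar attribute group is only created after
+	 * get_lightbar_version() succeeded in ec_dev_lightbar_init(), so a
+	 * failure here most likely means the EC or its transport went away.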
*/ + if (!get_lightbar_version(ec, &version, &flags)) + return -EIO; + + return scnprintf(buf, PAGE_SIZE, "%d %d\n", version, flags); +} + +static ssize_t brightness_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ec_params_lightbar *param; + struct ec_response_lightbar *resp; + struct cros_ec_command msg = INIT_MSG(param, resp); + int ret; + unsigned int val; + struct cros_ec_device *ec = dev_get_drvdata(dev); + + if (kstrtouint(buf, 0, &val)) + return -EINVAL; + + param = (struct ec_params_lightbar *)msg.outdata; + param->cmd = LIGHTBAR_CMD_BRIGHTNESS; + param->brightness.num = val; + ret = lb_throttle(); + if (ret) + return ret; + + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + return ret; + + if (msg.result != EC_RES_SUCCESS) + return -EINVAL; + + return count; +} + + +/* + * We expect numbers, and we'll keep reading until we find them, skipping over + * any whitespace (sysfs guarantees that the input is null-terminated). Every + * four numbers are sent to the lightbar as <LED,R,G,B>. We fail at the first + * parsing error, if we don't parse any numbers, or if we have numbers left + * over. + */ +static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ec_params_lightbar *param; + struct ec_response_lightbar *resp; + struct cros_ec_command msg = INIT_MSG(param, resp); + struct cros_ec_device *ec = dev_get_drvdata(dev); + unsigned int val[4]; + int ret, i = 0, j = 0, ok = 0; + + do { + /* Skip any whitespace */ + while (*buf && isspace(*buf)) + buf++; + + if (!*buf) + break; + + ret = sscanf(buf, "%i", &val[i++]); + if (ret == 0) + return -EINVAL; + + if (i == 4) { + param = (struct ec_params_lightbar *)msg.outdata; + param->cmd = LIGHTBAR_CMD_RGB; + param->rgb.led = val[0]; + param->rgb.red = val[1]; + param->rgb.green = val[2]; + param->rgb.blue = val[3]; + /* + * Throttle only the first of every four transactions, + * so that the user can update all four LEDs at once. + */ + if ((j++ % 4) == 0) { + ret = lb_throttle(); + if (ret) + return ret; + } + + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + return ret; + + if (msg.result != EC_RES_SUCCESS) + return -EINVAL; + + i = 0; + ok = 1; + } + + /* Skip over the number we just read */ + while (*buf && !isspace(*buf)) + buf++; + + } while (*buf); + + return (ok && i == 0) ? 
count : -EINVAL; +} + +static char const *seqname[] = { + "ERROR", "S5", "S3", "S0", "S5S3", "S3S0", + "S0S3", "S3S5", "STOP", "RUN", "PULSE", "TEST", "KONAMI", +}; + +static ssize_t sequence_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ec_params_lightbar *param; + struct ec_response_lightbar *resp; + struct cros_ec_command msg = INIT_MSG(param, resp); + int ret; + struct cros_ec_device *ec = dev_get_drvdata(dev); + + param = (struct ec_params_lightbar *)msg.outdata; + param->cmd = LIGHTBAR_CMD_GET_SEQ; + ret = lb_throttle(); + if (ret) + return ret; + + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + return ret; + + if (msg.result != EC_RES_SUCCESS) + return scnprintf(buf, PAGE_SIZE, + "ERROR: EC returned %d\n", msg.result); + + resp = (struct ec_response_lightbar *)msg.indata; + if (resp->get_seq.num >= ARRAY_SIZE(seqname)) + return scnprintf(buf, PAGE_SIZE, "%d\n", resp->get_seq.num); + else + return scnprintf(buf, PAGE_SIZE, "%s\n", + seqname[resp->get_seq.num]); +} + +static ssize_t sequence_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ec_params_lightbar *param; + struct ec_response_lightbar *resp; + struct cros_ec_command msg = INIT_MSG(param, resp); + unsigned int num; + int ret, len; + struct cros_ec_device *ec = dev_get_drvdata(dev); + + for (len = 0; len < count; len++) + if (!isalnum(buf[len])) + break; + + for (num = 0; num < ARRAY_SIZE(seqname); num++) + if (!strncasecmp(seqname[num], buf, len)) + break; + + if (num >= ARRAY_SIZE(seqname)) { + ret = kstrtouint(buf, 0, &num); + if (ret) + return ret; + } + + param = (struct ec_params_lightbar *)msg.outdata; + param->cmd = LIGHTBAR_CMD_SEQ; + param->seq.num = num; + ret = lb_throttle(); + if (ret) + return ret; + + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + return ret; + + if (msg.result != EC_RES_SUCCESS) + return -EINVAL; + + return count; +} + +/* Module initialization */ + +static DEVICE_ATTR_RW(interval_msec); +static DEVICE_ATTR_RO(version); +static DEVICE_ATTR_WO(brightness); +static DEVICE_ATTR_WO(led_rgb); +static DEVICE_ATTR_RW(sequence); +static struct attribute *__lb_cmds_attrs[] = { + &dev_attr_interval_msec.attr, + &dev_attr_version.attr, + &dev_attr_brightness.attr, + &dev_attr_led_rgb.attr, + &dev_attr_sequence.attr, + NULL, +}; +static struct attribute_group lb_cmds_attr_group = { + .name = "lightbar", + .attrs = __lb_cmds_attrs, +}; + +void ec_dev_lightbar_init(struct cros_ec_device *ec) +{ + int ret = 0; + + /* Only instantiate this stuff if the EC has a lightbar */ + if (!get_lightbar_version(ec, NULL, NULL)) + return; + + ret = sysfs_create_group(&ec->vdev->kobj, &lb_cmds_attr_group); + if (ret) + pr_warn("sysfs_create_group() failed: %d\n", ret); +} + +void ec_dev_lightbar_remove(struct cros_ec_device *ec) +{ + sysfs_remove_group(&ec->vdev->kobj, &lb_cmds_attr_group); +} diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c new file mode 100644 index 000000000000..8f9ac4d7bbd0 --- /dev/null +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -0,0 +1,319 @@ +/* + * cros_ec_lpc - LPC access to the Chrome OS Embedded Controller + * + * Copyright (C) 2012-2015 Google, Inc + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * This driver implements the LPC transport for the Chrome OS EC's
+ * byte-level, message-based host-command protocol: command parameters
+ * are written to the EC's I/O ports, the command byte is issued, and
+ * the result and any response data are read back through the same ports.
+ */
+
+#include <linux/dmi.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/mfd/cros_ec.h>
+#include <linux/mfd/cros_ec_commands.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+
+#define DRV_NAME "cros_ec_lpc"
+
+static int ec_response_timed_out(void)
+{
+	unsigned long one_second = jiffies + HZ;
+
+	usleep_range(200, 300);
+	do {
+		if (!(inb(EC_LPC_ADDR_HOST_CMD) & EC_LPC_STATUS_BUSY_MASK))
+			return 0;
+		usleep_range(100, 200);
+	} while (time_before(jiffies, one_second));
+
+	return 1;
+}
+
+static int cros_ec_cmd_xfer_lpc(struct cros_ec_device *ec,
+				struct cros_ec_command *msg)
+{
+	struct ec_lpc_host_args args;
+	int csum;
+	int i;
+	int ret = 0;
+
+	if (msg->outsize > EC_PROTO2_MAX_PARAM_SIZE ||
+	    msg->insize > EC_PROTO2_MAX_PARAM_SIZE) {
+		dev_err(ec->dev,
+			"invalid buffer sizes (out %d, in %d)\n",
+			msg->outsize, msg->insize);
+		return -EINVAL;
+	}
+
+	/* Now actually send the command to the EC and get the result */
+	args.flags = EC_HOST_ARGS_FLAG_FROM_HOST;
+	args.command_version = msg->version;
+	args.data_size = msg->outsize;
+
+	/* Initialize checksum */
+	csum = msg->command + args.flags +
+		args.command_version + args.data_size;
+
+	/* Copy data and update checksum */
+	for (i = 0; i < msg->outsize; i++) {
+		outb(msg->outdata[i], EC_LPC_ADDR_HOST_PARAM + i);
+		csum += msg->outdata[i];
+	}
+
+	/* Finalize checksum and write args */
+	args.checksum = csum & 0xFF;
+	outb(args.flags, EC_LPC_ADDR_HOST_ARGS);
+	outb(args.command_version, EC_LPC_ADDR_HOST_ARGS + 1);
+	outb(args.data_size, EC_LPC_ADDR_HOST_ARGS + 2);
+	outb(args.checksum, EC_LPC_ADDR_HOST_ARGS + 3);
+
+	/* Here we go */
+	outb(msg->command, EC_LPC_ADDR_HOST_CMD);
+
+	if (ec_response_timed_out()) {
+		dev_warn(ec->dev, "EC response timed out\n");
+		ret = -EIO;
+		goto done;
+	}
+
+	/* Check result */
+	msg->result = inb(EC_LPC_ADDR_HOST_DATA);
+
+	switch (msg->result) {
+	case EC_RES_SUCCESS:
+		break;
+	case EC_RES_IN_PROGRESS:
+		ret = -EAGAIN;
+		dev_dbg(ec->dev, "command 0x%02x in progress\n",
+			msg->command);
+		goto done;
+	default:
+		dev_dbg(ec->dev, "command 0x%02x returned %d\n",
+			msg->command, msg->result);
+	}
+
+	/* Read back args */
+	args.flags = inb(EC_LPC_ADDR_HOST_ARGS);
+	args.command_version = inb(EC_LPC_ADDR_HOST_ARGS + 1);
+	args.data_size = inb(EC_LPC_ADDR_HOST_ARGS + 2);
+	args.checksum = inb(EC_LPC_ADDR_HOST_ARGS + 3);
+
+	if (args.data_size > msg->insize) {
+		dev_err(ec->dev,
+			"packet too long (%d bytes, expected %d)\n",
+			args.data_size, msg->insize);
+		ret = -ENOSPC;
+		goto done;
+	}
+
+	/* Start calculating response checksum */
+	csum = msg->command + args.flags +
+		args.command_version + args.data_size;
+
+	/* Read response and update checksum */
+	for (i = 0; i < args.data_size; i++) {
+		msg->indata[i] =
inb(EC_LPC_ADDR_HOST_PARAM + i); + csum += msg->indata[i]; + } + + /* Verify checksum */ + if (args.checksum != (csum & 0xFF)) { + dev_err(ec->dev, + "bad packet checksum, expected %02x, got %02x\n", + args.checksum, csum & 0xFF); + ret = -EBADMSG; + goto done; + } + + /* Return actual amount of data received */ + ret = args.data_size; +done: + return ret; +} + +/* Returns num bytes read, or negative on error. Doesn't need locking. */ +static int cros_ec_lpc_readmem(struct cros_ec_device *ec, unsigned int offset, + unsigned int bytes, void *dest) +{ + int i = offset; + char *s = dest; + int cnt = 0; + + if (offset >= EC_MEMMAP_SIZE - bytes) + return -EINVAL; + + /* fixed length */ + if (bytes) { + for (; cnt < bytes; i++, s++, cnt++) + *s = inb(EC_LPC_ADDR_MEMMAP + i); + return cnt; + } + + /* string */ + for (; i < EC_MEMMAP_SIZE; i++, s++) { + *s = inb(EC_LPC_ADDR_MEMMAP + i); + cnt++; + if (!*s) + break; + } + + return cnt; +} + +static int cros_ec_lpc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct cros_ec_device *ec_dev; + int ret; + + if (!devm_request_region(dev, EC_LPC_ADDR_MEMMAP, EC_MEMMAP_SIZE, + dev_name(dev))) { + dev_err(dev, "couldn't reserve memmap region\n"); + return -EBUSY; + } + + if ((inb(EC_LPC_ADDR_MEMMAP + EC_MEMMAP_ID) != 'E') || + (inb(EC_LPC_ADDR_MEMMAP + EC_MEMMAP_ID + 1) != 'C')) { + dev_err(dev, "EC ID not detected\n"); + return -ENODEV; + } + + if (!devm_request_region(dev, EC_HOST_CMD_REGION0, + EC_HOST_CMD_REGION_SIZE, dev_name(dev))) { + dev_err(dev, "couldn't reserve region0\n"); + return -EBUSY; + } + if (!devm_request_region(dev, EC_HOST_CMD_REGION1, + EC_HOST_CMD_REGION_SIZE, dev_name(dev))) { + dev_err(dev, "couldn't reserve region1\n"); + return -EBUSY; + } + + ec_dev = devm_kzalloc(dev, sizeof(*ec_dev), GFP_KERNEL); + if (!ec_dev) + return -ENOMEM; + + platform_set_drvdata(pdev, ec_dev); + ec_dev->dev = dev; + ec_dev->ec_name = pdev->name; + ec_dev->phys_name = dev_name(dev); + ec_dev->parent = dev; + ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc; + ec_dev->cmd_readmem = cros_ec_lpc_readmem; + + ret = cros_ec_register(ec_dev); + if (ret) { + dev_err(dev, "couldn't register ec_dev (%d)\n", ret); + return ret; + } + + return 0; +} + +static int cros_ec_lpc_remove(struct platform_device *pdev) +{ + struct cros_ec_device *ec_dev; + + ec_dev = platform_get_drvdata(pdev); + cros_ec_remove(ec_dev); + + return 0; +} + +static struct dmi_system_id cros_ec_lpc_dmi_table[] __initdata = { + { + /* + * Today all Chromebooks/boxes ship with Google_* as version and + * coreboot as bios vendor. No other systems with this + * combination are known to date. + */ + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "coreboot"), + DMI_MATCH(DMI_BIOS_VERSION, "Google_"), + }, + }, + { + /* x86-link, the Chromebook Pixel. */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"), + DMI_MATCH(DMI_PRODUCT_NAME, "Link"), + }, + }, + { + /* x86-peppy, the Acer C720 Chromebook. 
*/ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Peppy"), + }, + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(dmi, cros_ec_lpc_dmi_table); + +static struct platform_driver cros_ec_lpc_driver = { + .driver = { + .name = DRV_NAME, + }, + .probe = cros_ec_lpc_probe, + .remove = cros_ec_lpc_remove, +}; + +static struct platform_device cros_ec_lpc_device = { + .name = DRV_NAME +}; + +static int __init cros_ec_lpc_init(void) +{ + int ret; + + if (!dmi_check_system(cros_ec_lpc_dmi_table)) { + pr_err(DRV_NAME ": unsupported system.\n"); + return -ENODEV; + } + + /* Register the driver */ + ret = platform_driver_register(&cros_ec_lpc_driver); + if (ret) { + pr_err(DRV_NAME ": can't register driver: %d\n", ret); + return ret; + } + + /* Register the device, and it'll get hooked up automatically */ + ret = platform_device_register(&cros_ec_lpc_device); + if (ret) { + pr_err(DRV_NAME ": can't register device: %d\n", ret); + platform_driver_unregister(&cros_ec_lpc_driver); + return ret; + } + + return 0; +} + +static void __exit cros_ec_lpc_exit(void) +{ + platform_device_unregister(&cros_ec_lpc_device); + platform_driver_unregister(&cros_ec_lpc_driver); +} + +module_init(cros_ec_lpc_init); +module_exit(cros_ec_lpc_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ChromeOS EC LPC driver"); diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c new file mode 100644 index 000000000000..fb62ab6cc659 --- /dev/null +++ b/drivers/platform/chrome/cros_ec_sysfs.c @@ -0,0 +1,271 @@ +/* + * cros_ec_sysfs - expose the Chrome OS EC through sysfs + * + * Copyright (C) 2014 Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */
+
+#define pr_fmt(fmt) "cros_ec_sysfs: " fmt
+
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/mfd/cros_ec.h>
+#include <linux/mfd/cros_ec_commands.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include "cros_ec_dev.h"
+
+/* Accessor functions */
+
+static ssize_t show_ec_reboot(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	int count = 0;
+
+	count += scnprintf(buf + count, PAGE_SIZE - count,
+			   "ro|rw|cancel|cold|disable-jump|hibernate");
+	count += scnprintf(buf + count, PAGE_SIZE - count,
+			   " [at-shutdown]\n");
+	return count;
+}
+
+static ssize_t store_ec_reboot(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	static const struct {
+		const char * const str;
+		uint8_t cmd;
+		uint8_t flags;
+	} words[] = {
+		{"cancel", EC_REBOOT_CANCEL, 0},
+		{"ro", EC_REBOOT_JUMP_RO, 0},
+		{"rw", EC_REBOOT_JUMP_RW, 0},
+		{"cold", EC_REBOOT_COLD, 0},
+		{"disable-jump", EC_REBOOT_DISABLE_JUMP, 0},
+		{"hibernate", EC_REBOOT_HIBERNATE, 0},
+		{"at-shutdown", -1, EC_REBOOT_FLAG_ON_AP_SHUTDOWN},
+	};
+	struct cros_ec_command msg = { 0 };
+	struct ec_params_reboot_ec *param =
+		(struct ec_params_reboot_ec *)msg.outdata;
+	int got_cmd = 0, offset = 0;
+	int i;
+	int ret;
+	struct cros_ec_device *ec = dev_get_drvdata(dev);
+
+	param->flags = 0;
+	while (1) {
+		/* Find word to start scanning */
+		while (buf[offset] && isspace(buf[offset]))
+			offset++;
+		if (!buf[offset])
+			break;
+
+		for (i = 0; i < ARRAY_SIZE(words); i++) {
+			if (!strncasecmp(words[i].str, buf+offset,
+					 strlen(words[i].str))) {
+				if (words[i].flags) {
+					param->flags |= words[i].flags;
+				} else {
+					param->cmd = words[i].cmd;
+					got_cmd = 1;
+				}
+				break;
+			}
+		}
+
+		/* On to the next word, if any */
+		while (buf[offset] && !isspace(buf[offset]))
+			offset++;
+	}
+
+	if (!got_cmd)
+		return -EINVAL;
+
+	msg.command = EC_CMD_REBOOT_EC;
+	/* sizeof(*param), not sizeof(param): param is a pointer */
+	msg.outsize = sizeof(*param);
+	ret = cros_ec_cmd_xfer(ec, &msg);
+	if (ret < 0)
+		return ret;
+	if (msg.result != EC_RES_SUCCESS) {
+		dev_dbg(ec->dev, "EC result %d\n", msg.result);
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+static ssize_t show_ec_version(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	static const char * const image_names[] = {"unknown", "RO", "RW"};
+	struct ec_response_get_version *r_ver;
+	struct ec_response_get_chip_info *r_chip;
+	struct ec_response_board_version *r_board;
+	struct cros_ec_command msg = { 0 };
+	int ret;
+	int count = 0;
+	struct cros_ec_device *ec = dev_get_drvdata(dev);
+
+	/* Get versions. RW may change. */
+	msg.command = EC_CMD_GET_VERSION;
+	msg.insize = sizeof(*r_ver);
+	ret = cros_ec_cmd_xfer(ec, &msg);
+	if (ret < 0)
+		return ret;
+	if (msg.result != EC_RES_SUCCESS)
+		return scnprintf(buf, PAGE_SIZE,
+				 "ERROR: EC returned %d\n", msg.result);
+
+	r_ver = (struct ec_response_get_version *)msg.indata;
+	/* Strings should be null-terminated, but let's be sure.
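+	 * The EC returns these as fixed-size char arrays and nothing here
+	 * guarantees a trailing NUL, so terminate them before printing.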
*/ + r_ver->version_string_ro[sizeof(r_ver->version_string_ro) - 1] = '\0'; + r_ver->version_string_rw[sizeof(r_ver->version_string_rw) - 1] = '\0'; + count += scnprintf(buf + count, PAGE_SIZE - count, + "RO version: %s\n", r_ver->version_string_ro); + count += scnprintf(buf + count, PAGE_SIZE - count, + "RW version: %s\n", r_ver->version_string_rw); + count += scnprintf(buf + count, PAGE_SIZE - count, + "Firmware copy: %s\n", + (r_ver->current_image < ARRAY_SIZE(image_names) ? + image_names[r_ver->current_image] : "?")); + + /* Get build info. */ + msg.command = EC_CMD_GET_BUILD_INFO; + msg.insize = sizeof(msg.indata); + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + count += scnprintf(buf + count, PAGE_SIZE - count, + "Build info: XFER ERROR %d\n", ret); + else if (msg.result != EC_RES_SUCCESS) + count += scnprintf(buf + count, PAGE_SIZE - count, + "Build info: EC error %d\n", msg.result); + else { + msg.indata[sizeof(msg.indata) - 1] = '\0'; + count += scnprintf(buf + count, PAGE_SIZE - count, + "Build info: %s\n", msg.indata); + } + + /* Get chip info. */ + msg.command = EC_CMD_GET_CHIP_INFO; + msg.insize = sizeof(*r_chip); + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + count += scnprintf(buf + count, PAGE_SIZE - count, + "Chip info: XFER ERROR %d\n", ret); + else if (msg.result != EC_RES_SUCCESS) + count += scnprintf(buf + count, PAGE_SIZE - count, + "Chip info: EC error %d\n", msg.result); + else { + r_chip = (struct ec_response_get_chip_info *)msg.indata; + + r_chip->vendor[sizeof(r_chip->vendor) - 1] = '\0'; + r_chip->name[sizeof(r_chip->name) - 1] = '\0'; + r_chip->revision[sizeof(r_chip->revision) - 1] = '\0'; + count += scnprintf(buf + count, PAGE_SIZE - count, + "Chip vendor: %s\n", r_chip->vendor); + count += scnprintf(buf + count, PAGE_SIZE - count, + "Chip name: %s\n", r_chip->name); + count += scnprintf(buf + count, PAGE_SIZE - count, + "Chip revision: %s\n", r_chip->revision); + } + + /* Get board version */ + msg.command = EC_CMD_GET_BOARD_VERSION; + msg.insize = sizeof(*r_board); + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + count += scnprintf(buf + count, PAGE_SIZE - count, + "Board version: XFER ERROR %d\n", ret); + else if (msg.result != EC_RES_SUCCESS) + count += scnprintf(buf + count, PAGE_SIZE - count, + "Board version: EC error %d\n", msg.result); + else { + r_board = (struct ec_response_board_version *)msg.indata; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "Board version: %d\n", + r_board->board_version); + } + + return count; +} + +static ssize_t show_ec_flashinfo(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ec_response_flash_info *resp; + struct cros_ec_command msg = { 0 }; + int ret; + struct cros_ec_device *ec = dev_get_drvdata(dev); + + /* The flash info shouldn't ever change, but ask each time anyway. 
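+	 * Querying it fresh is a single cheap host command and keeps this
+	 * attribute stateless.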
*/ + msg.command = EC_CMD_FLASH_INFO; + msg.insize = sizeof(*resp); + ret = cros_ec_cmd_xfer(ec, &msg); + if (ret < 0) + return ret; + if (msg.result != EC_RES_SUCCESS) + return scnprintf(buf, PAGE_SIZE, + "ERROR: EC returned %d\n", msg.result); + + resp = (struct ec_response_flash_info *)msg.indata; + + return scnprintf(buf, PAGE_SIZE, + "FlashSize %d\nWriteSize %d\n" + "EraseSize %d\nProtectSize %d\n", + resp->flash_size, resp->write_block_size, + resp->erase_block_size, resp->protect_block_size); +} + +/* Module initialization */ + +static DEVICE_ATTR(reboot, S_IWUSR | S_IRUGO, show_ec_reboot, store_ec_reboot); +static DEVICE_ATTR(version, S_IRUGO, show_ec_version, NULL); +static DEVICE_ATTR(flashinfo, S_IRUGO, show_ec_flashinfo, NULL); + +static struct attribute *__ec_attrs[] = { + &dev_attr_reboot.attr, + &dev_attr_version.attr, + &dev_attr_flashinfo.attr, + NULL, +}; + +static struct attribute_group ec_attr_group = { + .attrs = __ec_attrs, +}; + +void ec_dev_sysfs_init(struct cros_ec_device *ec) +{ + int error; + + error = sysfs_create_group(&ec->vdev->kobj, &ec_attr_group); + if (error) + pr_warn("failed to create group: %d\n", error); +} + +void ec_dev_sysfs_remove(struct cros_ec_device *ec) +{ + sysfs_remove_group(&ec->vdev->kobj, &ec_attr_group); +} diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 97527614141b..f9f205cb1f11 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -614,6 +614,7 @@ config ACPI_TOSHIBA depends on INPUT depends on RFKILL || RFKILL = n depends on SERIO_I8042 || SERIO_I8042 = n + depends on ACPI_VIDEO || ACPI_VIDEO = n select INPUT_POLLDEV select INPUT_SPARSEKMAP ---help--- diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c index 66d6d22c239c..6808715003f6 100644 --- a/drivers/platform/x86/apple-gmux.c +++ b/drivers/platform/x86/apple-gmux.c @@ -22,6 +22,7 @@ #include <linux/delay.h> #include <linux/pci.h> #include <linux/vga_switcheroo.h> +#include <linux/vgaarb.h> #include <acpi/video.h> #include <asm/io.h> @@ -31,6 +32,7 @@ struct apple_gmux_data { bool indexed; struct mutex index_lock; + struct pci_dev *pdev; struct backlight_device *bdev; /* switcheroo data */ @@ -415,6 +417,23 @@ static int gmux_resume(struct device *dev) return 0; } +static struct pci_dev *gmux_get_io_pdev(void) +{ + struct pci_dev *pdev = NULL; + + while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev))) { + u16 cmd; + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + if (!(cmd & PCI_COMMAND_IO)) + continue; + + return pdev; + } + + return NULL; +} + static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) { struct apple_gmux_data *gmux_data; @@ -425,6 +444,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) int ret = -ENXIO; acpi_status status; unsigned long long gpe; + struct pci_dev *pdev = NULL; if (apple_gmux_data) return -EBUSY; @@ -475,7 +495,7 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) ver_minor = (version >> 16) & 0xff; ver_release = (version >> 8) & 0xff; } else { - pr_info("gmux device not present\n"); + pr_info("gmux device not present or IO disabled\n"); ret = -ENODEV; goto err_release; } @@ -483,6 +503,23 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id) pr_info("Found gmux version %d.%d.%d [%s]\n", ver_major, ver_minor, ver_release, (gmux_data->indexed ? "indexed" : "classic")); + /* + * Apple systems with gmux are EFI based and normally don't use + * VGA. 
In addition, changing IO+MEM ownership between IGP and dGPU
+	 * disables IO/MEM used for backlight control on some systems.
+	 * Lock IO+MEM to the GPU with active IO to prevent the switch.
+	 */
+	pdev = gmux_get_io_pdev();
+	if (pdev && vga_tryget(pdev,
+			       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM)) {
+		pr_err("IO+MEM vgaarb-locking for PCI:%s failed\n",
+			pci_name(pdev));
+		ret = -EBUSY;
+		goto err_release;
+	} else if (pdev)
+		pr_info("locked IO for PCI:%s\n", pci_name(pdev));
+	gmux_data->pdev = pdev;
+
 	memset(&props, 0, sizeof(props));
 	props.type = BACKLIGHT_PLATFORM;
 	props.max_brightness = gmux_read32(gmux_data, GMUX_PORT_MAX_BRIGHTNESS);
@@ -574,6 +611,10 @@ err_enable_gpe:
 err_notify:
 	backlight_device_unregister(bdev);
 err_release:
+	if (gmux_data->pdev)
+		vga_put(gmux_data->pdev,
+			VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM);
+	pci_dev_put(pdev);
 	release_region(gmux_data->iostart, gmux_data->iolen);
 err_free:
 	kfree(gmux_data);
@@ -593,6 +634,11 @@ static void gmux_remove(struct pnp_dev *pnp)
 			       &gmux_notify_handler);
 	}
 
+	if (gmux_data->pdev) {
+		vga_put(gmux_data->pdev,
+			VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM);
+		pci_dev_put(gmux_data->pdev);
+	}
 	backlight_device_unregister(gmux_data->bdev);
 
 	release_region(gmux_data->iostart, gmux_data->iolen);
diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c
index 3d21efe11d7b..d688d806a8a5 100644
--- a/drivers/platform/x86/dell-laptop.c
+++ b/drivers/platform/x86/dell-laptop.c
@@ -2,9 +2,11 @@
  * Driver for Dell laptop extras
  *
  * Copyright (c) Red Hat <mjg@redhat.com>
+ * Copyright (c) 2014 Gabriele Mazzotta <gabriele.mzt@gmail.com>
+ * Copyright (c) 2014 Pali Rohár <pali.rohar@gmail.com>
  *
- * Based on documentation in the libsmbios package, Copyright (C) 2005 Dell
- * Inc.
+ * Based on documentation in the libsmbios package:
+ * Copyright (C) 2005-2014 Dell Inc.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -32,6 +34,13 @@
 #include "../../firmware/dcdbas.h"
 
 #define BRIGHTNESS_TOKEN	0x7d
+#define KBD_LED_OFF_TOKEN	0x01E1
+#define KBD_LED_ON_TOKEN	0x01E2
+#define KBD_LED_AUTO_TOKEN	0x01E3
+#define KBD_LED_AUTO_25_TOKEN	0x02EA
+#define KBD_LED_AUTO_50_TOKEN	0x02EB
+#define KBD_LED_AUTO_75_TOKEN	0x02EC
+#define KBD_LED_AUTO_100_TOKEN	0x02F6
 
 /* This structure will be modified by the firmware when we enter
  * system management mode, hence the volatiles */
@@ -62,6 +71,13 @@ struct calling_interface_structure {
 
 struct quirk_entry {
 	u8 touchpad_led;
+
+	int needs_kbd_timeouts;
+	/*
+	 * Ordered list of timeouts expressed in seconds.
+	 * The list must end with -1
+	 */
+	int kbd_timeouts[];
 };
 
 static struct quirk_entry *quirks;
@@ -76,6 +92,15 @@ static int __init dmi_matched(const struct dmi_system_id *dmi)
 	return 1;
 }
 
+/*
+ * These values come from a Windows utility provided by Dell. If any other
+ * value is used, the BIOS silently sets the timeout to 0 without any error
+ * message.
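+ * For example, with the XPS13 9333 list below, writing "10s" to the
+ * stop_timeout attribute is rounded up to the next supported value,
+ * 15 seconds (see kbd_led_timeout_store()).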
+ */ +static struct quirk_entry quirk_dell_xps13_9333 = { + .needs_kbd_timeouts = 1, + .kbd_timeouts = { 0, 5, 15, 60, 5 * 60, 15 * 60, -1 }, +}; + static int da_command_address; static int da_command_code; static int da_num_tokens; @@ -267,6 +292,15 @@ static const struct dmi_system_id dell_quirks[] __initconst = { }, .driver_data = &quirk_dell_vostro_v130, }, + { + .callback = dmi_matched, + .ident = "Dell XPS13 9333", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS13 9333"), + }, + .driver_data = &quirk_dell_xps13_9333, + }, { } }; @@ -331,17 +365,29 @@ static void __init find_tokens(const struct dmi_header *dm, void *dummy) } } -static int find_token_location(int tokenid) +static int find_token_id(int tokenid) { int i; + for (i = 0; i < da_num_tokens; i++) { if (da_tokens[i].tokenID == tokenid) - return da_tokens[i].location; + return i; } return -1; } +static int find_token_location(int tokenid) +{ + int id; + + id = find_token_id(tokenid); + if (id == -1) + return -1; + + return da_tokens[id].location; +} + static struct calling_interface_buffer * dell_send_request(struct calling_interface_buffer *buffer, int class, int select) @@ -362,6 +408,20 @@ dell_send_request(struct calling_interface_buffer *buffer, int class, return buffer; } +static inline int dell_smi_error(int value) +{ + switch (value) { + case 0: /* Completed successfully */ + return 0; + case -1: /* Completed with error */ + return -EIO; + case -2: /* Function not supported */ + return -ENXIO; + default: /* Unknown error */ + return -EINVAL; + } +} + /* Derived from information in DellWirelessCtl.cpp: Class 17, select 11 is radio control. It returns an array of 32-bit values. @@ -716,7 +776,7 @@ static int dell_send_intensity(struct backlight_device *bd) else dell_send_request(buffer, 1, 1); -out: + out: release_buffer(); return ret; } @@ -740,7 +800,7 @@ static int dell_get_intensity(struct backlight_device *bd) ret = buffer->output[1]; -out: + out: release_buffer(); return ret; } @@ -789,6 +849,1018 @@ static void touchpad_led_exit(void) led_classdev_unregister(&touchpad_led); } +/* + * Derived from information in smbios-keyboard-ctl: + * + * cbClass 4 + * cbSelect 11 + * Keyboard illumination + * cbArg1 determines the function to be performed + * + * cbArg1 0x0 = Get Feature Information + * cbRES1 Standard return codes (0, -1, -2) + * cbRES2, word0 Bitmap of user-selectable modes + * bit 0 Always off (All systems) + * bit 1 Always on (Travis ATG, Siberia) + * bit 2 Auto: ALS-based On; ALS-based Off (Travis ATG) + * bit 3 Auto: ALS- and input-activity-based On; input-activity based Off + * bit 4 Auto: Input-activity-based On; input-activity based Off + * bit 5 Auto: Input-activity-based On (illumination level 25%); input-activity based Off + * bit 6 Auto: Input-activity-based On (illumination level 50%); input-activity based Off + * bit 7 Auto: Input-activity-based On (illumination level 75%); input-activity based Off + * bit 8 Auto: Input-activity-based On (illumination level 100%); input-activity based Off + * bits 9-15 Reserved for future use + * cbRES2, byte2 Reserved for future use + * cbRES2, byte3 Keyboard illumination type + * 0 Reserved + * 1 Tasklight + * 2 Backlight + * 3-255 Reserved for future use + * cbRES3, byte0 Supported auto keyboard illumination trigger bitmap. 
+ * bit 0 Any keystroke + * bit 1 Touchpad activity + * bit 2 Pointing stick + * bit 3 Any mouse + * bits 4-7 Reserved for future use + * cbRES3, byte1 Supported timeout unit bitmap + * bit 0 Seconds + * bit 1 Minutes + * bit 2 Hours + * bit 3 Days + * bits 4-7 Reserved for future use + * cbRES3, byte2 Number of keyboard light brightness levels + * cbRES4, byte0 Maximum acceptable seconds value (0 if seconds not supported). + * cbRES4, byte1 Maximum acceptable minutes value (0 if minutes not supported). + * cbRES4, byte2 Maximum acceptable hours value (0 if hours not supported). + * cbRES4, byte3 Maximum acceptable days value (0 if days not supported) + * + * cbArg1 0x1 = Get Current State + * cbRES1 Standard return codes (0, -1, -2) + * cbRES2, word0 Bitmap of current mode state + * bit 0 Always off (All systems) + * bit 1 Always on (Travis ATG, Siberia) + * bit 2 Auto: ALS-based On; ALS-based Off (Travis ATG) + * bit 3 Auto: ALS- and input-activity-based On; input-activity based Off + * bit 4 Auto: Input-activity-based On; input-activity based Off + * bit 5 Auto: Input-activity-based On (illumination level 25%); input-activity based Off + * bit 6 Auto: Input-activity-based On (illumination level 50%); input-activity based Off + * bit 7 Auto: Input-activity-based On (illumination level 75%); input-activity based Off + * bit 8 Auto: Input-activity-based On (illumination level 100%); input-activity based Off + * bits 9-15 Reserved for future use + * Note: Only One bit can be set + * cbRES2, byte2 Currently active auto keyboard illumination triggers. + * bit 0 Any keystroke + * bit 1 Touchpad activity + * bit 2 Pointing stick + * bit 3 Any mouse + * bits 4-7 Reserved for future use + * cbRES2, byte3 Current Timeout + * bits 7:6 Timeout units indicator: + * 00b Seconds + * 01b Minutes + * 10b Hours + * 11b Days + * bits 5:0 Timeout value (0-63) in sec/min/hr/day + * NOTE: A value of 0 means always on (no timeout) if any bits of RES3 byte + * are set upon return from the [Get feature information] call. + * cbRES3, byte0 Current setting of ALS value that turns the light on or off. + * cbRES3, byte1 Current ALS reading + * cbRES3, byte2 Current keyboard light level. + * + * cbArg1 0x2 = Set New State + * cbRES1 Standard return codes (0, -1, -2) + * cbArg2, word0 Bitmap of current mode state + * bit 0 Always off (All systems) + * bit 1 Always on (Travis ATG, Siberia) + * bit 2 Auto: ALS-based On; ALS-based Off (Travis ATG) + * bit 3 Auto: ALS- and input-activity-based On; input-activity based Off + * bit 4 Auto: Input-activity-based On; input-activity based Off + * bit 5 Auto: Input-activity-based On (illumination level 25%); input-activity based Off + * bit 6 Auto: Input-activity-based On (illumination level 50%); input-activity based Off + * bit 7 Auto: Input-activity-based On (illumination level 75%); input-activity based Off + * bit 8 Auto: Input-activity-based On (illumination level 100%); input-activity based Off + * bits 9-15 Reserved for future use + * Note: Only One bit can be set + * cbArg2, byte2 Desired auto keyboard illumination triggers. Must remain inactive to allow + * keyboard to turn off automatically. 
+ * bit 0 Any keystroke + * bit 1 Touchpad activity + * bit 2 Pointing stick + * bit 3 Any mouse + * bits 4-7 Reserved for future use + * cbArg2, byte3 Desired Timeout + * bits 7:6 Timeout units indicator: + * 00b Seconds + * 01b Minutes + * 10b Hours + * 11b Days + * bits 5:0 Timeout value (0-63) in sec/min/hr/day + * cbArg3, byte0 Desired setting of ALS value that turns the light on or off. + * cbArg3, byte2 Desired keyboard light level. + */ + + +enum kbd_timeout_unit { + KBD_TIMEOUT_SECONDS = 0, + KBD_TIMEOUT_MINUTES, + KBD_TIMEOUT_HOURS, + KBD_TIMEOUT_DAYS, +}; + +enum kbd_mode_bit { + KBD_MODE_BIT_OFF = 0, + KBD_MODE_BIT_ON, + KBD_MODE_BIT_ALS, + KBD_MODE_BIT_TRIGGER_ALS, + KBD_MODE_BIT_TRIGGER, + KBD_MODE_BIT_TRIGGER_25, + KBD_MODE_BIT_TRIGGER_50, + KBD_MODE_BIT_TRIGGER_75, + KBD_MODE_BIT_TRIGGER_100, +}; + +#define kbd_is_als_mode_bit(bit) \ + ((bit) == KBD_MODE_BIT_ALS || (bit) == KBD_MODE_BIT_TRIGGER_ALS) +#define kbd_is_trigger_mode_bit(bit) \ + ((bit) >= KBD_MODE_BIT_TRIGGER_ALS && (bit) <= KBD_MODE_BIT_TRIGGER_100) +#define kbd_is_level_mode_bit(bit) \ + ((bit) >= KBD_MODE_BIT_TRIGGER_25 && (bit) <= KBD_MODE_BIT_TRIGGER_100) + +struct kbd_info { + u16 modes; + u8 type; + u8 triggers; + u8 levels; + u8 seconds; + u8 minutes; + u8 hours; + u8 days; +}; + +struct kbd_state { + u8 mode_bit; + u8 triggers; + u8 timeout_value; + u8 timeout_unit; + u8 als_setting; + u8 als_value; + u8 level; +}; + +static const int kbd_tokens[] = { + KBD_LED_OFF_TOKEN, + KBD_LED_AUTO_25_TOKEN, + KBD_LED_AUTO_50_TOKEN, + KBD_LED_AUTO_75_TOKEN, + KBD_LED_AUTO_100_TOKEN, + KBD_LED_ON_TOKEN, +}; + +static u16 kbd_token_bits; + +static struct kbd_info kbd_info; +static bool kbd_als_supported; +static bool kbd_triggers_supported; + +static u8 kbd_mode_levels[16]; +static int kbd_mode_levels_count; + +static u8 kbd_previous_level; +static u8 kbd_previous_mode_bit; + +static bool kbd_led_present; + +/* + * NOTE: there are three ways to set the keyboard backlight level. + * First, via kbd_state.mode_bit (assigning KBD_MODE_BIT_TRIGGER_* value). + * Second, via kbd_state.level (assigning numerical value <= kbd_info.levels). + * Third, via SMBIOS tokens (KBD_LED_* in kbd_tokens) + * + * There are laptops which support only one of these methods. If we want to + * support as many machines as possible we need to implement all three methods. + * The first two methods use the kbd_state structure. The third uses SMBIOS + * tokens. If kbd_info.levels == 0, the machine does not support setting the + * keyboard backlight level via kbd_state.level. 
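+ *
+ * The led_classdev callbacks below pick a method in roughly this order:
+ *
+ *	if (kbd_get_max_level())		use kbd_state (methods 1 and 2)
+ *	else if (kbd_get_valid_token_counts())	use SMBIOS tokens (method 3)
+ *	else					level control is unsupported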
+ */ + +static int kbd_get_info(struct kbd_info *info) +{ + u8 units; + int ret; + + get_buffer(); + + buffer->input[0] = 0x0; + dell_send_request(buffer, 4, 11); + ret = buffer->output[0]; + + if (ret) { + ret = dell_smi_error(ret); + goto out; + } + + info->modes = buffer->output[1] & 0xFFFF; + info->type = (buffer->output[1] >> 24) & 0xFF; + info->triggers = buffer->output[2] & 0xFF; + units = (buffer->output[2] >> 8) & 0xFF; + info->levels = (buffer->output[2] >> 16) & 0xFF; + + if (units & BIT(0)) + info->seconds = (buffer->output[3] >> 0) & 0xFF; + if (units & BIT(1)) + info->minutes = (buffer->output[3] >> 8) & 0xFF; + if (units & BIT(2)) + info->hours = (buffer->output[3] >> 16) & 0xFF; + if (units & BIT(3)) + info->days = (buffer->output[3] >> 24) & 0xFF; + + out: + release_buffer(); + return ret; +} + +static unsigned int kbd_get_max_level(void) +{ + if (kbd_info.levels != 0) + return kbd_info.levels; + if (kbd_mode_levels_count > 0) + return kbd_mode_levels_count - 1; + return 0; +} + +static int kbd_get_level(struct kbd_state *state) +{ + int i; + + if (kbd_info.levels != 0) + return state->level; + + if (kbd_mode_levels_count > 0) { + for (i = 0; i < kbd_mode_levels_count; ++i) + if (kbd_mode_levels[i] == state->mode_bit) + return i; + return 0; + } + + return -EINVAL; +} + +static int kbd_set_level(struct kbd_state *state, u8 level) +{ + if (kbd_info.levels != 0) { + if (level != 0) + kbd_previous_level = level; + if (state->level == level) + return 0; + state->level = level; + if (level != 0 && state->mode_bit == KBD_MODE_BIT_OFF) + state->mode_bit = kbd_previous_mode_bit; + else if (level == 0 && state->mode_bit != KBD_MODE_BIT_OFF) { + kbd_previous_mode_bit = state->mode_bit; + state->mode_bit = KBD_MODE_BIT_OFF; + } + return 0; + } + + if (kbd_mode_levels_count > 0 && level < kbd_mode_levels_count) { + if (level != 0) + kbd_previous_level = level; + state->mode_bit = kbd_mode_levels[level]; + return 0; + } + + return -EINVAL; +} + +static int kbd_get_state(struct kbd_state *state) +{ + int ret; + + get_buffer(); + + buffer->input[0] = 0x1; + dell_send_request(buffer, 4, 11); + ret = buffer->output[0]; + + if (ret) { + ret = dell_smi_error(ret); + goto out; + } + + state->mode_bit = ffs(buffer->output[1] & 0xFFFF); + if (state->mode_bit != 0) + state->mode_bit--; + + state->triggers = (buffer->output[1] >> 16) & 0xFF; + state->timeout_value = (buffer->output[1] >> 24) & 0x3F; + state->timeout_unit = (buffer->output[1] >> 30) & 0x3; + state->als_setting = buffer->output[2] & 0xFF; + state->als_value = (buffer->output[2] >> 8) & 0xFF; + state->level = (buffer->output[2] >> 16) & 0xFF; + + out: + release_buffer(); + return ret; +} + +static int kbd_set_state(struct kbd_state *state) +{ + int ret; + + get_buffer(); + buffer->input[0] = 0x2; + buffer->input[1] = BIT(state->mode_bit) & 0xFFFF; + buffer->input[1] |= (state->triggers & 0xFF) << 16; + buffer->input[1] |= (state->timeout_value & 0x3F) << 24; + buffer->input[1] |= (state->timeout_unit & 0x3) << 30; + buffer->input[2] = state->als_setting & 0xFF; + buffer->input[2] |= (state->level & 0xFF) << 16; + dell_send_request(buffer, 4, 11); + ret = buffer->output[0]; + release_buffer(); + + return dell_smi_error(ret); +} + +static int kbd_set_state_safe(struct kbd_state *state, struct kbd_state *old) +{ + int ret; + + ret = kbd_set_state(state); + if (ret == 0) + return 0; + + /* + * When setting the new state fails,try to restore the previous one. 
+	 * This is needed on some machines where the BIOS sets a default state
+	 * when setting a new state fails. This default state could be all off.
+	 */
+
+	if (kbd_set_state(old))
+		pr_err("Restoring previous keyboard state failed\n");
+
+	return ret;
+}
+
+static int kbd_set_token_bit(u8 bit)
+{
+	int id;
+	int ret;
+
+	if (bit >= ARRAY_SIZE(kbd_tokens))
+		return -EINVAL;
+
+	id = find_token_id(kbd_tokens[bit]);
+	if (id == -1)
+		return -EINVAL;
+
+	get_buffer();
+	buffer->input[0] = da_tokens[id].location;
+	buffer->input[1] = da_tokens[id].value;
+	dell_send_request(buffer, 1, 0);
+	ret = buffer->output[0];
+	release_buffer();
+
+	return dell_smi_error(ret);
+}
+
+static int kbd_get_token_bit(u8 bit)
+{
+	int id;
+	int ret;
+	int val;
+
+	if (bit >= ARRAY_SIZE(kbd_tokens))
+		return -EINVAL;
+
+	id = find_token_id(kbd_tokens[bit]);
+	if (id == -1)
+		return -EINVAL;
+
+	get_buffer();
+	buffer->input[0] = da_tokens[id].location;
+	dell_send_request(buffer, 0, 0);
+	ret = buffer->output[0];
+	val = buffer->output[1];
+	release_buffer();
+
+	if (ret)
+		return dell_smi_error(ret);
+
+	return (val == da_tokens[id].value);
+}
+
+static int kbd_get_first_active_token_bit(void)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(kbd_tokens); ++i) {
+		ret = kbd_get_token_bit(i);
+		if (ret == 1)
+			return i;
+	}
+
+	return ret;
+}
+
+static int kbd_get_valid_token_counts(void)
+{
+	return hweight16(kbd_token_bits);
+}
+
+static inline int kbd_init_info(void)
+{
+	struct kbd_state state;
+	int ret;
+	int i;
+
+	ret = kbd_get_info(&kbd_info);
+	if (ret)
+		return ret;
+
+	/* Don't use 'state' if we could not read it */
+	ret = kbd_get_state(&state);
+	if (ret)
+		return ret;
+
+	/* NOTE: timeout value is stored in 6 bits so max value is 63 */
+	if (kbd_info.seconds > 63)
+		kbd_info.seconds = 63;
+	if (kbd_info.minutes > 63)
+		kbd_info.minutes = 63;
+	if (kbd_info.hours > 63)
+		kbd_info.hours = 63;
+	if (kbd_info.days > 63)
+		kbd_info.days = 63;
+
+	/* NOTE: On tested machines ON mode did not work and caused
+	 * problems (it turned the backlight off), so do not use it
+	 */
+	kbd_info.modes &= ~BIT(KBD_MODE_BIT_ON);
+
+	kbd_previous_level = kbd_get_level(&state);
+	kbd_previous_mode_bit = state.mode_bit;
+
+	if (kbd_previous_level == 0 && kbd_get_max_level() != 0)
+		kbd_previous_level = 1;
+
+	if (kbd_previous_mode_bit == KBD_MODE_BIT_OFF) {
+		kbd_previous_mode_bit =
+			ffs(kbd_info.modes & ~BIT(KBD_MODE_BIT_OFF));
+		if (kbd_previous_mode_bit != 0)
+			kbd_previous_mode_bit--;
+	}
+
+	if (kbd_info.modes & (BIT(KBD_MODE_BIT_ALS) |
+			      BIT(KBD_MODE_BIT_TRIGGER_ALS)))
+		kbd_als_supported = true;
+
+	if (kbd_info.modes & (
+	    BIT(KBD_MODE_BIT_TRIGGER_ALS) | BIT(KBD_MODE_BIT_TRIGGER) |
+	    BIT(KBD_MODE_BIT_TRIGGER_25) | BIT(KBD_MODE_BIT_TRIGGER_50) |
+	    BIT(KBD_MODE_BIT_TRIGGER_75) | BIT(KBD_MODE_BIT_TRIGGER_100)
+	    ))
+		kbd_triggers_supported = true;
+
+	/* kbd_mode_levels[0] is reserved, see below */
+	for (i = 0; i < 16; ++i)
+		if (kbd_is_level_mode_bit(i) && (BIT(i) & kbd_info.modes))
+			kbd_mode_levels[1 + kbd_mode_levels_count++] = i;
+
+	/*
+	 * Find the first supported mode and assign to kbd_mode_levels[0].
+	 * This should be 0 (off), but we cannot depend on the BIOS to
+	 * support 0.
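+	 * With that in place, kbd_mode_levels[] maps brightness values
+	 * straight to mode bits; e.g. a machine reporting OFF plus the
+	 * 25/50/75/100% trigger modes ends up with
+	 * { OFF, TRIGGER_25, TRIGGER_50, TRIGGER_75, TRIGGER_100 }.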
+ */ + if (kbd_mode_levels_count > 0) { + for (i = 0; i < 16; ++i) { + if (BIT(i) & kbd_info.modes) { + kbd_mode_levels[0] = i; + break; + } + } + kbd_mode_levels_count++; + } + + return 0; + +} + +static inline void kbd_init_tokens(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(kbd_tokens); ++i) + if (find_token_id(kbd_tokens[i]) != -1) + kbd_token_bits |= BIT(i); +} + +static void kbd_init(void) +{ + int ret; + + ret = kbd_init_info(); + kbd_init_tokens(); + + if (kbd_token_bits != 0 || ret == 0) + kbd_led_present = true; +} + +static ssize_t kbd_led_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kbd_state new_state; + struct kbd_state state; + bool convert; + int value; + int ret; + char ch; + u8 unit; + int i; + + ret = sscanf(buf, "%d %c", &value, &ch); + if (ret < 1) + return -EINVAL; + else if (ret == 1) + ch = 's'; + + if (value < 0) + return -EINVAL; + + convert = false; + + switch (ch) { + case 's': + if (value > kbd_info.seconds) + convert = true; + unit = KBD_TIMEOUT_SECONDS; + break; + case 'm': + if (value > kbd_info.minutes) + convert = true; + unit = KBD_TIMEOUT_MINUTES; + break; + case 'h': + if (value > kbd_info.hours) + convert = true; + unit = KBD_TIMEOUT_HOURS; + break; + case 'd': + if (value > kbd_info.days) + convert = true; + unit = KBD_TIMEOUT_DAYS; + break; + default: + return -EINVAL; + } + + if (quirks && quirks->needs_kbd_timeouts) + convert = true; + + if (convert) { + /* Convert value from current units to seconds */ + switch (unit) { + case KBD_TIMEOUT_DAYS: + value *= 24; + case KBD_TIMEOUT_HOURS: + value *= 60; + case KBD_TIMEOUT_MINUTES: + value *= 60; + unit = KBD_TIMEOUT_SECONDS; + } + + if (quirks && quirks->needs_kbd_timeouts) { + for (i = 0; quirks->kbd_timeouts[i] != -1; i++) { + if (value <= quirks->kbd_timeouts[i]) { + value = quirks->kbd_timeouts[i]; + break; + } + } + } + + if (value <= kbd_info.seconds && kbd_info.seconds) { + unit = KBD_TIMEOUT_SECONDS; + } else if (value / 60 <= kbd_info.minutes && kbd_info.minutes) { + value /= 60; + unit = KBD_TIMEOUT_MINUTES; + } else if (value / (60 * 60) <= kbd_info.hours && kbd_info.hours) { + value /= (60 * 60); + unit = KBD_TIMEOUT_HOURS; + } else if (value / (60 * 60 * 24) <= kbd_info.days && kbd_info.days) { + value /= (60 * 60 * 24); + unit = KBD_TIMEOUT_DAYS; + } else { + return -EINVAL; + } + } + + ret = kbd_get_state(&state); + if (ret) + return ret; + + new_state = state; + new_state.timeout_value = value; + new_state.timeout_unit = unit; + + ret = kbd_set_state_safe(&new_state, &state); + if (ret) + return ret; + + return count; +} + +static ssize_t kbd_led_timeout_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct kbd_state state; + int ret; + int len; + + ret = kbd_get_state(&state); + if (ret) + return ret; + + len = sprintf(buf, "%d", state.timeout_value); + + switch (state.timeout_unit) { + case KBD_TIMEOUT_SECONDS: + return len + sprintf(buf+len, "s\n"); + case KBD_TIMEOUT_MINUTES: + return len + sprintf(buf+len, "m\n"); + case KBD_TIMEOUT_HOURS: + return len + sprintf(buf+len, "h\n"); + case KBD_TIMEOUT_DAYS: + return len + sprintf(buf+len, "d\n"); + default: + return -EINVAL; + } + + return len; +} + +static DEVICE_ATTR(stop_timeout, S_IRUGO | S_IWUSR, + kbd_led_timeout_show, kbd_led_timeout_store); + +static const char * const kbd_led_triggers[] = { + "keyboard", + "touchpad", + /*"trackstick"*/ NULL, /* NOTE: trackstick is just alias for touchpad */ + "mouse", +}; + +static ssize_t 
kbd_led_triggers_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kbd_state new_state; + struct kbd_state state; + bool triggers_enabled = false; + int trigger_bit = -1; + char trigger[21]; + int i, ret; + + ret = sscanf(buf, "%20s", trigger); + if (ret != 1) + return -EINVAL; + + if (trigger[0] != '+' && trigger[0] != '-') + return -EINVAL; + + ret = kbd_get_state(&state); + if (ret) + return ret; + + if (kbd_triggers_supported) + triggers_enabled = kbd_is_trigger_mode_bit(state.mode_bit); + + if (kbd_triggers_supported) { + for (i = 0; i < ARRAY_SIZE(kbd_led_triggers); ++i) { + if (!(kbd_info.triggers & BIT(i))) + continue; + if (!kbd_led_triggers[i]) + continue; + if (strcmp(trigger+1, kbd_led_triggers[i]) != 0) + continue; + if (trigger[0] == '+' && + triggers_enabled && (state.triggers & BIT(i))) + return count; + if (trigger[0] == '-' && + (!triggers_enabled || !(state.triggers & BIT(i)))) + return count; + trigger_bit = i; + break; + } + } + + if (trigger_bit != -1) { + new_state = state; + if (trigger[0] == '+') + new_state.triggers |= BIT(trigger_bit); + else { + new_state.triggers &= ~BIT(trigger_bit); + /* NOTE: trackstick bit (2) must be disabled when + * disabling touchpad bit (1), otherwise touchpad + * bit (1) will not be disabled */ + if (trigger_bit == 1) + new_state.triggers &= ~BIT(2); + } + if ((kbd_info.triggers & new_state.triggers) != + new_state.triggers) + return -EINVAL; + if (new_state.triggers && !triggers_enabled) { + new_state.mode_bit = KBD_MODE_BIT_TRIGGER; + kbd_set_level(&new_state, kbd_previous_level); + } else if (new_state.triggers == 0) { + kbd_set_level(&new_state, 0); + } + if (!(kbd_info.modes & BIT(new_state.mode_bit))) + return -EINVAL; + ret = kbd_set_state_safe(&new_state, &state); + if (ret) + return ret; + if (new_state.mode_bit != KBD_MODE_BIT_OFF) + kbd_previous_mode_bit = new_state.mode_bit; + return count; + } + + return -EINVAL; +} + +static ssize_t kbd_led_triggers_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct kbd_state state; + bool triggers_enabled; + int level, i, ret; + int len = 0; + + ret = kbd_get_state(&state); + if (ret) + return ret; + + len = 0; + + if (kbd_triggers_supported) { + triggers_enabled = kbd_is_trigger_mode_bit(state.mode_bit); + level = kbd_get_level(&state); + for (i = 0; i < ARRAY_SIZE(kbd_led_triggers); ++i) { + if (!(kbd_info.triggers & BIT(i))) + continue; + if (!kbd_led_triggers[i]) + continue; + if ((triggers_enabled || level <= 0) && + (state.triggers & BIT(i))) + buf[len++] = '+'; + else + buf[len++] = '-'; + len += sprintf(buf+len, "%s ", kbd_led_triggers[i]); + } + } + + if (len) + buf[len - 1] = '\n'; + + return len; +} + +static DEVICE_ATTR(start_triggers, S_IRUGO | S_IWUSR, + kbd_led_triggers_show, kbd_led_triggers_store); + +static ssize_t kbd_led_als_enabled_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kbd_state new_state; + struct kbd_state state; + bool triggers_enabled = false; + int enable; + int ret; + + ret = kstrtoint(buf, 0, &enable); + if (ret) + return ret; + + ret = kbd_get_state(&state); + if (ret) + return ret; + + if (enable == kbd_is_als_mode_bit(state.mode_bit)) + return count; + + new_state = state; + + if (kbd_triggers_supported) + triggers_enabled = kbd_is_trigger_mode_bit(state.mode_bit); + + if (enable) { + if (triggers_enabled) + new_state.mode_bit = KBD_MODE_BIT_TRIGGER_ALS; + else + new_state.mode_bit = KBD_MODE_BIT_ALS; + } 
else { + if (triggers_enabled) { + new_state.mode_bit = KBD_MODE_BIT_TRIGGER; + kbd_set_level(&new_state, kbd_previous_level); + } else { + new_state.mode_bit = KBD_MODE_BIT_ON; + } + } + if (!(kbd_info.modes & BIT(new_state.mode_bit))) + return -EINVAL; + + ret = kbd_set_state_safe(&new_state, &state); + if (ret) + return ret; + kbd_previous_mode_bit = new_state.mode_bit; + + return count; +} + +static ssize_t kbd_led_als_enabled_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kbd_state state; + bool enabled = false; + int ret; + + ret = kbd_get_state(&state); + if (ret) + return ret; + enabled = kbd_is_als_mode_bit(state.mode_bit); + + return sprintf(buf, "%d\n", enabled ? 1 : 0); +} + +static DEVICE_ATTR(als_enabled, S_IRUGO | S_IWUSR, + kbd_led_als_enabled_show, kbd_led_als_enabled_store); + +static ssize_t kbd_led_als_setting_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct kbd_state state; + struct kbd_state new_state; + u8 setting; + int ret; + + ret = kstrtou8(buf, 10, &setting); + if (ret) + return ret; + + ret = kbd_get_state(&state); + if (ret) + return ret; + + new_state = state; + new_state.als_setting = setting; + + ret = kbd_set_state_safe(&new_state, &state); + if (ret) + return ret; + + return count; +} + +static ssize_t kbd_led_als_setting_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct kbd_state state; + int ret; + + ret = kbd_get_state(&state); + if (ret) + return ret; + + return sprintf(buf, "%d\n", state.als_setting); +} + +static DEVICE_ATTR(als_setting, S_IRUGO | S_IWUSR, + kbd_led_als_setting_show, kbd_led_als_setting_store); + +static struct attribute *kbd_led_attrs[] = { + &dev_attr_stop_timeout.attr, + &dev_attr_start_triggers.attr, + NULL, +}; + +static const struct attribute_group kbd_led_group = { + .attrs = kbd_led_attrs, +}; + +static struct attribute *kbd_led_als_attrs[] = { + &dev_attr_als_enabled.attr, + &dev_attr_als_setting.attr, + NULL, +}; + +static const struct attribute_group kbd_led_als_group = { + .attrs = kbd_led_als_attrs, +}; + +static const struct attribute_group *kbd_led_groups[] = { + &kbd_led_group, + &kbd_led_als_group, + NULL, +}; + +static enum led_brightness kbd_led_level_get(struct led_classdev *led_cdev) +{ + int ret; + u16 num; + struct kbd_state state; + + if (kbd_get_max_level()) { + ret = kbd_get_state(&state); + if (ret) + return 0; + ret = kbd_get_level(&state); + if (ret < 0) + return 0; + return ret; + } + + if (kbd_get_valid_token_counts()) { + ret = kbd_get_first_active_token_bit(); + if (ret < 0) + return 0; + for (num = kbd_token_bits; num != 0 && ret > 0; --ret) + num &= num - 1; /* clear the first bit set */ + if (num == 0) + return 0; + return ffs(num) - 1; + } + + pr_warn("Keyboard brightness level control not supported\n"); + return 0; +} + +static void kbd_led_level_set(struct led_classdev *led_cdev, + enum led_brightness value) +{ + struct kbd_state state; + struct kbd_state new_state; + u16 num; + + if (kbd_get_max_level()) { + if (kbd_get_state(&state)) + return; + new_state = state; + if (kbd_set_level(&new_state, value)) + return; + kbd_set_state_safe(&new_state, &state); + return; + } + + if (kbd_get_valid_token_counts()) { + for (num = kbd_token_bits; num != 0 && value > 0; --value) + num &= num - 1; /* clear the first bit set */ + if (num == 0) + return; + kbd_set_token_bit(ffs(num) - 1); + return; + } + + pr_warn("Keyboard brightness level control not supported\n"); +} + +static 
struct led_classdev kbd_led = { + .name = "dell::kbd_backlight", + .brightness_set = kbd_led_level_set, + .brightness_get = kbd_led_level_get, + .groups = kbd_led_groups, +}; + +static int __init kbd_led_init(struct device *dev) +{ + kbd_init(); + if (!kbd_led_present) + return -ENODEV; + if (!kbd_als_supported) + kbd_led_groups[1] = NULL; + kbd_led.max_brightness = kbd_get_max_level(); + if (!kbd_led.max_brightness) { + kbd_led.max_brightness = kbd_get_valid_token_counts(); + if (kbd_led.max_brightness) + kbd_led.max_brightness--; + } + return led_classdev_register(dev, &kbd_led); +} + +static void brightness_set_exit(struct led_classdev *led_cdev, + enum led_brightness value) +{ + /* Don't change backlight level on exit */ +} + +static void kbd_led_exit(void) +{ + if (!kbd_led_present) + return; + kbd_led.brightness_set = brightness_set_exit; + led_classdev_unregister(&kbd_led); +} + static int __init dell_init(void) { int max_intensity = 0; @@ -841,6 +1913,8 @@ static int __init dell_init(void) if (quirks && quirks->touchpad_led) touchpad_led_init(&platform_device->dev); + kbd_led_init(&platform_device->dev); + dell_laptop_dir = debugfs_create_dir("dell_laptop", NULL); if (dell_laptop_dir != NULL) debugfs_create_file("rfkill", 0444, dell_laptop_dir, NULL, @@ -908,6 +1982,7 @@ static void __exit dell_exit(void) debugfs_remove_recursive(dell_laptop_dir); if (quirks && quirks->touchpad_led) touchpad_led_exit(); + kbd_led_exit(); i8042_remove_filter(dell_laptop_i8042_filter); cancel_delayed_work_sync(&dell_rfkill_work); backlight_device_unregister(dell_backlight_device); @@ -924,5 +1999,7 @@ module_init(dell_init); module_exit(dell_exit); MODULE_AUTHOR("Matthew Garrett <mjg@redhat.com>"); +MODULE_AUTHOR("Gabriele Mazzotta <gabriele.mzt@gmail.com>"); +MODULE_AUTHOR("Pali Rohár <pali.rohar@gmail.com>"); MODULE_DESCRIPTION("Dell laptop driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/intel_oaktrail.c b/drivers/platform/x86/intel_oaktrail.c index a4a4258f6134..8037c8b46241 100644 --- a/drivers/platform/x86/intel_oaktrail.c +++ b/drivers/platform/x86/intel_oaktrail.c @@ -62,7 +62,7 @@ * (1 << 1): Bluetooth enable/disable, RW. * (1 << 2): GPS enable/disable, RW. * (1 << 3): WiFi enable/disable, RW. - * (1 << 4): WWAN (3G) enable/disalbe, RW. + * (1 << 4): WWAN (3G) enable/disable, RW. * (1 << 5): Touchscreen enable/disable, Read Only. 
*/ #define OT_EC_DEVICE_STATE_ADDRESS 0xD6 diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 3b8ceee7c5cb..7769575345d8 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -319,6 +319,7 @@ static struct { u32 sensors_pdrv_attrs_registered:1; u32 sensors_pdev_attrs_registered:1; u32 hotkey_poll_active:1; + u32 has_adaptive_kbd:1; } tp_features; static struct { @@ -1911,6 +1912,27 @@ enum { /* hot key scan codes (derived from ACPI DSDT) */ TP_ACPI_HOTKEYSCAN_UNK7, TP_ACPI_HOTKEYSCAN_UNK8, + TP_ACPI_HOTKEYSCAN_MUTE2, + TP_ACPI_HOTKEYSCAN_BRIGHTNESS_ZERO, + TP_ACPI_HOTKEYSCAN_CLIPPING_TOOL, + TP_ACPI_HOTKEYSCAN_CLOUD, + TP_ACPI_HOTKEYSCAN_UNK9, + TP_ACPI_HOTKEYSCAN_VOICE, + TP_ACPI_HOTKEYSCAN_UNK10, + TP_ACPI_HOTKEYSCAN_GESTURES, + TP_ACPI_HOTKEYSCAN_UNK11, + TP_ACPI_HOTKEYSCAN_UNK12, + TP_ACPI_HOTKEYSCAN_UNK13, + TP_ACPI_HOTKEYSCAN_CONFIG, + TP_ACPI_HOTKEYSCAN_NEW_TAB, + TP_ACPI_HOTKEYSCAN_RELOAD, + TP_ACPI_HOTKEYSCAN_BACK, + TP_ACPI_HOTKEYSCAN_MIC_DOWN, + TP_ACPI_HOTKEYSCAN_MIC_UP, + TP_ACPI_HOTKEYSCAN_MIC_CANCELLATION, + TP_ACPI_HOTKEYSCAN_CAMERA_MODE, + TP_ACPI_HOTKEYSCAN_ROTATE_DISPLAY, + /* Hotkey keymap size */ TPACPI_HOTKEY_MAP_LEN }; @@ -2647,9 +2669,7 @@ static ssize_t hotkey_enable_store(struct device *dev, return count; } -static struct device_attribute dev_attr_hotkey_enable = - __ATTR(hotkey_enable, S_IWUSR | S_IRUGO, - hotkey_enable_show, hotkey_enable_store); +static DEVICE_ATTR_RW(hotkey_enable); /* sysfs hotkey mask --------------------------------------------------- */ static ssize_t hotkey_mask_show(struct device *dev, @@ -2685,9 +2705,7 @@ static ssize_t hotkey_mask_store(struct device *dev, return (res) ? res : count; } -static struct device_attribute dev_attr_hotkey_mask = - __ATTR(hotkey_mask, S_IWUSR | S_IRUGO, - hotkey_mask_show, hotkey_mask_store); +static DEVICE_ATTR_RW(hotkey_mask); /* sysfs hotkey bios_enabled ------------------------------------------- */ static ssize_t hotkey_bios_enabled_show(struct device *dev, @@ -2697,8 +2715,7 @@ static ssize_t hotkey_bios_enabled_show(struct device *dev, return sprintf(buf, "0\n"); } -static struct device_attribute dev_attr_hotkey_bios_enabled = - __ATTR(hotkey_bios_enabled, S_IRUGO, hotkey_bios_enabled_show, NULL); +static DEVICE_ATTR_RO(hotkey_bios_enabled); /* sysfs hotkey bios_mask ---------------------------------------------- */ static ssize_t hotkey_bios_mask_show(struct device *dev, @@ -2710,8 +2727,7 @@ static ssize_t hotkey_bios_mask_show(struct device *dev, return snprintf(buf, PAGE_SIZE, "0x%08x\n", hotkey_orig_mask); } -static struct device_attribute dev_attr_hotkey_bios_mask = - __ATTR(hotkey_bios_mask, S_IRUGO, hotkey_bios_mask_show, NULL); +static DEVICE_ATTR_RO(hotkey_bios_mask); /* sysfs hotkey all_mask ----------------------------------------------- */ static ssize_t hotkey_all_mask_show(struct device *dev, @@ -2722,8 +2738,7 @@ static ssize_t hotkey_all_mask_show(struct device *dev, hotkey_all_mask | hotkey_source_mask); } -static struct device_attribute dev_attr_hotkey_all_mask = - __ATTR(hotkey_all_mask, S_IRUGO, hotkey_all_mask_show, NULL); +static DEVICE_ATTR_RO(hotkey_all_mask); /* sysfs hotkey recommended_mask --------------------------------------- */ static ssize_t hotkey_recommended_mask_show(struct device *dev, @@ -2735,9 +2750,7 @@ static ssize_t hotkey_recommended_mask_show(struct device *dev, & ~hotkey_reserved_mask); } -static struct device_attribute dev_attr_hotkey_recommended_mask = - 
__ATTR(hotkey_recommended_mask, S_IRUGO, - hotkey_recommended_mask_show, NULL); +static DEVICE_ATTR_RO(hotkey_recommended_mask); #ifdef CONFIG_THINKPAD_ACPI_HOTKEY_POLL @@ -2792,9 +2805,7 @@ static ssize_t hotkey_source_mask_store(struct device *dev, return (rc < 0) ? rc : count; } -static struct device_attribute dev_attr_hotkey_source_mask = - __ATTR(hotkey_source_mask, S_IWUSR | S_IRUGO, - hotkey_source_mask_show, hotkey_source_mask_store); +static DEVICE_ATTR_RW(hotkey_source_mask); /* sysfs hotkey hotkey_poll_freq --------------------------------------- */ static ssize_t hotkey_poll_freq_show(struct device *dev, @@ -2826,9 +2837,7 @@ static ssize_t hotkey_poll_freq_store(struct device *dev, return count; } -static struct device_attribute dev_attr_hotkey_poll_freq = - __ATTR(hotkey_poll_freq, S_IWUSR | S_IRUGO, - hotkey_poll_freq_show, hotkey_poll_freq_store); +static DEVICE_ATTR_RW(hotkey_poll_freq); #endif /* CONFIG_THINKPAD_ACPI_HOTKEY_POLL */ @@ -2849,8 +2858,7 @@ static ssize_t hotkey_radio_sw_show(struct device *dev, (res == TPACPI_RFK_RADIO_OFF) ? 0 : 1); } -static struct device_attribute dev_attr_hotkey_radio_sw = - __ATTR(hotkey_radio_sw, S_IRUGO, hotkey_radio_sw_show, NULL); +static DEVICE_ATTR_RO(hotkey_radio_sw); static void hotkey_radio_sw_notify_change(void) { @@ -2872,8 +2880,7 @@ static ssize_t hotkey_tablet_mode_show(struct device *dev, return snprintf(buf, PAGE_SIZE, "%d\n", !!s); } -static struct device_attribute dev_attr_hotkey_tablet_mode = - __ATTR(hotkey_tablet_mode, S_IRUGO, hotkey_tablet_mode_show, NULL); +static DEVICE_ATTR_RO(hotkey_tablet_mode); static void hotkey_tablet_mode_notify_change(void) { @@ -2890,8 +2897,7 @@ static ssize_t hotkey_wakeup_reason_show(struct device *dev, return snprintf(buf, PAGE_SIZE, "%d\n", hotkey_wakeup_reason); } -static struct device_attribute dev_attr_hotkey_wakeup_reason = - __ATTR(wakeup_reason, S_IRUGO, hotkey_wakeup_reason_show, NULL); +static DEVICE_ATTR_RO(hotkey_wakeup_reason); static void hotkey_wakeup_reason_notify_change(void) { @@ -2907,9 +2913,7 @@ static ssize_t hotkey_wakeup_hotunplug_complete_show(struct device *dev, return snprintf(buf, PAGE_SIZE, "%d\n", hotkey_autosleep_ack); } -static struct device_attribute dev_attr_hotkey_wakeup_hotunplug_complete = - __ATTR(wakeup_hotunplug_complete, S_IRUGO, - hotkey_wakeup_hotunplug_complete_show, NULL); +static DEVICE_ATTR_RO(hotkey_wakeup_hotunplug_complete); static void hotkey_wakeup_hotunplug_complete_notify_change(void) { @@ -2917,6 +2921,57 @@ static void hotkey_wakeup_hotunplug_complete_notify_change(void) "wakeup_hotunplug_complete"); } +/* sysfs adaptive kbd mode --------------------------------------------- */ + +static int adaptive_keyboard_get_mode(void); +static int adaptive_keyboard_set_mode(int new_mode); + +enum ADAPTIVE_KEY_MODE { + HOME_MODE, + WEB_BROWSER_MODE, + WEB_CONFERENCE_MODE, + FUNCTION_MODE, + LAYFLAT_MODE +}; + +static ssize_t adaptive_kbd_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int current_mode; + + current_mode = adaptive_keyboard_get_mode(); + if (current_mode < 0) + return current_mode; + + return snprintf(buf, PAGE_SIZE, "%d\n", current_mode); +} + +static ssize_t adaptive_kbd_mode_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long t; + int res; + + if (parse_strtoul(buf, LAYFLAT_MODE, &t)) + return -EINVAL; + + res = adaptive_keyboard_set_mode(t); + return (res < 0) ? 
res : count; +} + +static DEVICE_ATTR_RW(adaptive_kbd_mode); + +static struct attribute *adaptive_kbd_attributes[] = { + &dev_attr_adaptive_kbd_mode.attr, + NULL +}; + +static const struct attribute_group adaptive_kbd_attr_group = { + .attrs = adaptive_kbd_attributes, +}; + /* --------------------------------------------------------------------- */ static struct attribute *hotkey_attributes[] __initdata = { @@ -3118,6 +3173,13 @@ static int __init hotkey_init(struct ibm_init_struct *iibm) /* (assignments unknown, please report if found) */ KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, + + /* No assignments, only used for Adaptive keyboards. */ + KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, + KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, + KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, + KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, + KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, }, /* Generic keymap for Lenovo ThinkPads */ @@ -3174,6 +3236,35 @@ static int __init hotkey_init(struct ibm_init_struct *iibm) /* Extra keys in use since the X240 / T440 / T540 */ KEY_CONFIG, KEY_SEARCH, KEY_SCALE, KEY_FILE, + + /* + * These are the adaptive keyboard keycodes for Carbon X1 2014. + * The first item in this list is the Mute button which is + * emitted with 0x103 through + * adaptive_keyboard_hotkey_notify_hotkey() when the sound + * symbol is held. + * We'll need to offset those by 0x20. + */ + KEY_RESERVED, /* Mute held, 0x103 */ + KEY_BRIGHTNESS_MIN, /* Backlight off */ + KEY_RESERVED, /* Clipping tool */ + KEY_RESERVED, /* Cloud */ + KEY_RESERVED, + KEY_VOICECOMMAND, /* Voice */ + KEY_RESERVED, + KEY_RESERVED, /* Gestures */ + KEY_RESERVED, + KEY_RESERVED, + KEY_RESERVED, + KEY_CONFIG, /* Settings */ + KEY_RESERVED, /* New tab */ + KEY_REFRESH, /* Reload */ + KEY_BACK, /* Back */ + KEY_RESERVED, /* Microphone down */ + KEY_RESERVED, /* Microphone up */ + KEY_RESERVED, /* Microphone cancellation */ + KEY_RESERVED, /* Camera mode */ + KEY_RESERVED, /* Rotate display, 0x116 */ }, }; @@ -3227,6 +3318,20 @@ static int __init hotkey_init(struct ibm_init_struct *iibm) if (!tp_features.hotkey) return 1; + /* + * Check if we have an adaptive keyboard, like on the + * Lenovo Carbon X1 2014 (2nd Gen). + */ + if (acpi_evalf(hkey_handle, &hkeyv, "MHKV", "qd")) { + if ((hkeyv >> 8) == 2) { + tp_features.has_adaptive_kbd = true; + res = sysfs_create_group(&tpacpi_pdev->dev.kobj, + &adaptive_kbd_attr_group); + if (res) + goto err_exit; + } + } + quirks = tpacpi_check_quirks(tpacpi_hotkey_qtable, ARRAY_SIZE(tpacpi_hotkey_qtable)); @@ -3437,6 +3542,9 @@ static int __init hotkey_init(struct ibm_init_struct *iibm) err_exit: delete_attr_set(hotkey_dev_attributes, &tpacpi_pdev->dev.kobj); + sysfs_remove_group(&tpacpi_pdev->dev.kobj, + &adaptive_kbd_attr_group); + hotkey_dev_attributes = NULL; return (res < 0) ? res : 1; @@ -3449,14 +3557,6 @@ err_exit: * Will consider supporting the rest of the modes in the future. * */ -enum ADAPTIVE_KEY_MODE { - HOME_MODE, - WEB_BROWSER_MODE, - WEB_CONFERENCE_MODE, - FUNCTION_MODE, - LAYFLAT_MODE -}; - static const int adaptive_keyboard_modes[] = { HOME_MODE, /* WEB_BROWSER_MODE = 2, @@ -3466,6 +3566,8 @@ static const int adaptive_keyboard_modes[] = { #define DFR_CHANGE_ROW 0x101 #define DFR_SHOW_QUICKVIEW_ROW 0x102 +#define FIRST_ADAPTIVE_KEY 0x103 +#define ADAPTIVE_KEY_OFFSET 0x020 /* Press the Fn key and hold it for a second to switch to Function Mode; * releasing the Fn key restores the previous mode. 
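 *
 * Illustrative use (editor's note, not part of this patch): once the
 * adaptive_kbd_mode attribute group is registered, the current mode can
 * be read and changed from userspace, e.g.:
 *   cat /sys/devices/platform/thinkpad_acpi/adaptive_kbd_mode
 *   echo 3 > /sys/devices/platform/thinkpad_acpi/adaptive_kbd_mode
 * where 3 = FUNCTION_MODE in the ADAPTIVE_KEY_MODE enum above; the path
 * assumes the platform device is named "thinkpad_acpi". Writes above
 * LAYFLAT_MODE are rejected with -EINVAL.
 */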
@@ -3473,6 +3575,32 @@ static const int adaptive_keyboard_modes[] = { static bool adaptive_keyboard_mode_is_saved; static int adaptive_keyboard_prev_mode; +static int adaptive_keyboard_get_mode(void) +{ + int mode = 0; + + if (!acpi_evalf(hkey_handle, &mode, "GTRW", "dd", 0)) { + pr_err("Cannot read adaptive keyboard mode\n"); + return -EIO; + } + + return mode; +} + +static int adaptive_keyboard_set_mode(int new_mode) +{ + if (new_mode < 0 || + new_mode > LAYFLAT_MODE) + return -EINVAL; + + if (!acpi_evalf(hkey_handle, NULL, "STRW", "vd", new_mode)) { + pr_err("Cannot set adaptive keyboard mode\n"); + return -EIO; + } + + return 0; +} + static int adaptive_keyboard_get_next_mode(int mode) { size_t i; @@ -3493,8 +3621,9 @@ static int adaptive_keyboard_get_next_mode(int mode) static bool adaptive_keyboard_hotkey_notify_hotkey(unsigned int scancode) { - u32 current_mode = 0; + int current_mode = 0; int new_mode = 0; + int keycode; switch (scancode) { case DFR_CHANGE_ROW: @@ -3502,43 +3631,51 @@ static bool adaptive_keyboard_hotkey_notify_hotkey(unsigned int scancode) new_mode = adaptive_keyboard_prev_mode; adaptive_keyboard_mode_is_saved = false; } else { - if (!acpi_evalf( - hkey_handle, &current_mode, - "GTRW", "dd", 0)) { - pr_err("Cannot read adaptive keyboard mode\n"); + current_mode = adaptive_keyboard_get_mode(); + if (current_mode < 0) return false; - } else { - new_mode = adaptive_keyboard_get_next_mode( - current_mode); - } + new_mode = adaptive_keyboard_get_next_mode( + current_mode); } - if (!acpi_evalf(hkey_handle, NULL, "STRW", "vd", new_mode)) { - pr_err("Cannot set adaptive keyboard mode\n"); + if (adaptive_keyboard_set_mode(new_mode) < 0) return false; - } return true; case DFR_SHOW_QUICKVIEW_ROW: - if (!acpi_evalf(hkey_handle, - &adaptive_keyboard_prev_mode, - "GTRW", "dd", 0)) { - pr_err("Cannot read adaptive keyboard mode\n"); + current_mode = adaptive_keyboard_get_mode(); + if (current_mode < 0) return false; - } else { - adaptive_keyboard_mode_is_saved = true; - if (!acpi_evalf(hkey_handle, - NULL, "STRW", "vd", FUNCTION_MODE)) { - pr_err("Cannot set adaptive keyboard mode\n"); - return false; - } - } + adaptive_keyboard_prev_mode = current_mode; + adaptive_keyboard_mode_is_saved = true; + + if (adaptive_keyboard_set_mode(FUNCTION_MODE) < 0) + return false; return true; default: - return false; + if (scancode < FIRST_ADAPTIVE_KEY || + scancode >= FIRST_ADAPTIVE_KEY + TPACPI_HOTKEY_MAP_LEN - + ADAPTIVE_KEY_OFFSET) { + pr_info("Unhandled adaptive keyboard key: 0x%x\n", + scancode); + return false; + } + keycode = hotkey_keycode_map[scancode - FIRST_ADAPTIVE_KEY + ADAPTIVE_KEY_OFFSET]; + if (keycode != KEY_RESERVED) { + mutex_lock(&tpacpi_inputdev_send_mutex); + + input_report_key(tpacpi_inputdev, keycode, 1); + input_sync(tpacpi_inputdev); + + input_report_key(tpacpi_inputdev, keycode, 0); + input_sync(tpacpi_inputdev); + + mutex_unlock(&tpacpi_inputdev_send_mutex); + } + return true; } } @@ -3836,28 +3973,21 @@ static void hotkey_notify(struct ibm_struct *ibm, u32 event) static void hotkey_suspend(void) { - int hkeyv; - /* Do these on suspend, we get the events on early resume! 
*/ hotkey_wakeup_reason = TP_ACPI_WAKEUP_NONE; hotkey_autosleep_ack = 0; /* save previous mode of adaptive keyboard of X1 Carbon */ - if (acpi_evalf(hkey_handle, &hkeyv, "MHKV", "qd")) { - if ((hkeyv >> 8) == 2) { - if (!acpi_evalf(hkey_handle, - &adaptive_keyboard_prev_mode, - "GTRW", "dd", 0)) { - pr_err("Cannot read adaptive keyboard mode.\n"); - } + if (tp_features.has_adaptive_kbd) { + if (!acpi_evalf(hkey_handle, &adaptive_keyboard_prev_mode, + "GTRW", "dd", 0)) { + pr_err("Cannot read adaptive keyboard mode.\n"); } } } static void hotkey_resume(void) { - int hkeyv; - tpacpi_disable_brightness_delay(); if (hotkey_status_set(true) < 0 || @@ -3872,14 +4002,10 @@ static void hotkey_resume(void) hotkey_poll_setup_safe(false); /* restore previous mode of adaptive keyboard of X1 Carbon */ - if (acpi_evalf(hkey_handle, &hkeyv, "MHKV", "qd")) { - if ((hkeyv >> 8) == 2) { - if (!acpi_evalf(hkey_handle, - NULL, - "STRW", "vd", - adaptive_keyboard_prev_mode)) { - pr_err("Cannot set adaptive keyboard mode.\n"); - } + if (tp_features.has_adaptive_kbd) { + if (!acpi_evalf(hkey_handle, NULL, "STRW", "vd", + adaptive_keyboard_prev_mode)) { + pr_err("Cannot set adaptive keyboard mode.\n"); } } } @@ -4079,9 +4205,7 @@ static ssize_t bluetooth_enable_store(struct device *dev, attr, buf, count); } -static struct device_attribute dev_attr_bluetooth_enable = - __ATTR(bluetooth_enable, S_IWUSR | S_IRUGO, - bluetooth_enable_show, bluetooth_enable_store); +static DEVICE_ATTR_RW(bluetooth_enable); /* --------------------------------------------------------------------- */ @@ -4269,9 +4393,7 @@ static ssize_t wan_enable_store(struct device *dev, attr, buf, count); } -static struct device_attribute dev_attr_wan_enable = - __ATTR(wwan_enable, S_IWUSR | S_IRUGO, - wan_enable_show, wan_enable_store); +static DEVICE_ATTR_RW(wan_enable); /* --------------------------------------------------------------------- */ @@ -5048,8 +5170,7 @@ static ssize_t cmos_command_store(struct device *dev, return (res) ? res : count; } -static struct device_attribute dev_attr_cmos_command = - __ATTR(cmos_command, S_IWUSR, NULL, cmos_command_store); +static DEVICE_ATTR_WO(cmos_command); /* --------------------------------------------------------------------- */ @@ -8017,9 +8138,7 @@ static ssize_t fan_pwm1_enable_store(struct device *dev, return count; } -static struct device_attribute dev_attr_fan_pwm1_enable = - __ATTR(pwm1_enable, S_IWUSR | S_IRUGO, - fan_pwm1_enable_show, fan_pwm1_enable_store); +static DEVICE_ATTR_RW(fan_pwm1_enable); /* sysfs fan pwm1 ------------------------------------------------------ */ static ssize_t fan_pwm1_show(struct device *dev, @@ -8079,9 +8198,7 @@ static ssize_t fan_pwm1_store(struct device *dev, return (rc) ? 
rc : count; } -static struct device_attribute dev_attr_fan_pwm1 = - __ATTR(pwm1, S_IWUSR | S_IRUGO, - fan_pwm1_show, fan_pwm1_store); +static DEVICE_ATTR_RW(fan_pwm1); /* sysfs fan fan1_input ------------------------------------------------ */ static ssize_t fan_fan1_input_show(struct device *dev, @@ -8098,9 +8215,7 @@ static ssize_t fan_fan1_input_show(struct device *dev, return snprintf(buf, PAGE_SIZE, "%u\n", speed); } -static struct device_attribute dev_attr_fan_fan1_input = - __ATTR(fan1_input, S_IRUGO, - fan_fan1_input_show, NULL); +static DEVICE_ATTR_RO(fan_fan1_input); /* sysfs fan fan2_input ------------------------------------------------ */ static ssize_t fan_fan2_input_show(struct device *dev, @@ -8117,9 +8232,7 @@ static ssize_t fan_fan2_input_show(struct device *dev, return snprintf(buf, PAGE_SIZE, "%u\n", speed); } -static struct device_attribute dev_attr_fan_fan2_input = - __ATTR(fan2_input, S_IRUGO, - fan_fan2_input_show, NULL); +static DEVICE_ATTR_RO(fan_fan2_input); /* sysfs fan fan_watchdog (hwmon driver) ------------------------------- */ static ssize_t fan_fan_watchdog_show(struct device_driver *drv, @@ -8735,8 +8848,7 @@ static ssize_t thinkpad_acpi_pdev_name_show(struct device *dev, return snprintf(buf, PAGE_SIZE, "%s\n", TPACPI_NAME); } -static struct device_attribute dev_attr_thinkpad_acpi_pdev_name = - __ATTR(name, S_IRUGO, thinkpad_acpi_pdev_name_show, NULL); +static DEVICE_ATTR_RO(thinkpad_acpi_pdev_name); /* --------------------------------------------------------------------- */ diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index dbcb7a8915b8..9956b9902bb4 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -51,6 +51,7 @@ #include <linux/acpi.h> #include <linux/dmi.h> #include <linux/uaccess.h> +#include <acpi/video.h> MODULE_AUTHOR("John Belmonte"); MODULE_DESCRIPTION("Toshiba Laptop ACPI Extras Driver"); @@ -116,6 +117,7 @@ MODULE_LICENSE("GPL"); #define HCI_KBD_ILLUMINATION 0x0095 #define HCI_ECO_MODE 0x0097 #define HCI_ACCELEROMETER2 0x00a6 +#define HCI_SYSTEM_INFO 0xc000 #define SCI_PANEL_POWER_ON 0x010d #define SCI_ILLUMINATION 0x014e #define SCI_USB_SLEEP_CHARGE 0x0150 @@ -129,10 +131,13 @@ MODULE_LICENSE("GPL"); #define HCI_ACCEL_MASK 0x7fff #define HCI_HOTKEY_DISABLE 0x0b #define HCI_HOTKEY_ENABLE 0x09 +#define HCI_HOTKEY_SPECIAL_FUNCTIONS 0x10 #define HCI_LCD_BRIGHTNESS_BITS 3 #define HCI_LCD_BRIGHTNESS_SHIFT (16-HCI_LCD_BRIGHTNESS_BITS) #define HCI_LCD_BRIGHTNESS_LEVELS (1 << HCI_LCD_BRIGHTNESS_BITS) #define HCI_MISC_SHIFT 0x10 +#define HCI_SYSTEM_TYPE1 0x10 +#define HCI_SYSTEM_TYPE2 0x11 #define HCI_VIDEO_OUT_LCD 0x1 #define HCI_VIDEO_OUT_CRT 0x2 #define HCI_VIDEO_OUT_TV 0x4 @@ -147,9 +152,10 @@ MODULE_LICENSE("GPL"); #define SCI_KBD_MODE_OFF 0x10 #define SCI_KBD_TIME_MAX 0x3c001a #define SCI_USB_CHARGE_MODE_MASK 0xff -#define SCI_USB_CHARGE_DISABLED 0x30000 -#define SCI_USB_CHARGE_ALTERNATE 0x30009 -#define SCI_USB_CHARGE_AUTO 0x30021 +#define SCI_USB_CHARGE_DISABLED 0x00 +#define SCI_USB_CHARGE_ALTERNATE 0x09 +#define SCI_USB_CHARGE_TYPICAL 0x11 +#define SCI_USB_CHARGE_AUTO 0x21 #define SCI_USB_CHARGE_BAT_MASK 0x7 #define SCI_USB_CHARGE_BAT_LVL_OFF 0x1 #define SCI_USB_CHARGE_BAT_LVL_ON 0x4 @@ -174,6 +180,8 @@ struct toshiba_acpi_dev { int kbd_mode; int kbd_time; int usbsc_bat_level; + int usbsc_mode_base; + int hotkey_event_type; unsigned int illumination_supported:1; unsigned int video_supported:1; @@ -243,29 +251,6 @@ static const struct key_entry 
toshiba_acpi_keymap[] = { { KE_END, 0 }, }; -/* alternative keymap */ -static const struct dmi_system_id toshiba_alt_keymap_dmi[] = { - { - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), - DMI_MATCH(DMI_PRODUCT_NAME, "Satellite M840"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), - DMI_MATCH(DMI_PRODUCT_NAME, "Qosmio X75-A"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), - DMI_MATCH(DMI_PRODUCT_NAME, "TECRA A50-A"), - }, - }, - {} -}; - static const struct key_entry toshiba_acpi_alt_keymap[] = { { KE_KEY, 0x157, { KEY_MUTE } }, { KE_KEY, 0x102, { KEY_ZOOMOUT } }, @@ -281,6 +266,14 @@ static const struct key_entry toshiba_acpi_alt_keymap[] = { }; /* + * List of models which have a broken acpi-video backlight interface and thus + * need to use the toshiba (vendor) interface instead. + */ +static const struct dmi_system_id toshiba_vendor_backlight_dmi[] = { + {} +}; + +/* * Utility */ @@ -819,6 +812,54 @@ static int toshiba_accelerometer_get(struct toshiba_acpi_dev *dev, } /* Sleep (Charge and Music) utilities support */ +static void toshiba_usb_sleep_charge_available(struct toshiba_acpi_dev *dev) +{ + u32 in[TCI_WORDS] = { SCI_GET, SCI_USB_SLEEP_CHARGE, 0, 0, 0, 0 }; + u32 out[TCI_WORDS]; + acpi_status status; + + /* Set the feature to "not supported" in case of error */ + dev->usb_sleep_charge_supported = 0; + + if (!sci_open(dev)) + return; + + status = tci_raw(dev, in, out); + if (ACPI_FAILURE(status) || out[0] == TOS_FAILURE) { + pr_err("ACPI call to get USB Sleep and Charge mode failed\n"); + sci_close(dev); + return; + } else if (out[0] == TOS_NOT_SUPPORTED) { + pr_info("USB Sleep and Charge not supported\n"); + sci_close(dev); + return; + } else if (out[0] == TOS_SUCCESS) { + dev->usbsc_mode_base = out[4]; + } + + in[5] = SCI_USB_CHARGE_BAT_LVL; + status = tci_raw(dev, in, out); + if (ACPI_FAILURE(status) || out[0] == TOS_FAILURE) { + pr_err("ACPI call to get USB Sleep and Charge mode failed\n"); + sci_close(dev); + return; + } else if (out[0] == TOS_NOT_SUPPORTED) { + pr_info("USB Sleep and Charge not supported\n"); + sci_close(dev); + return; + } else if (out[0] == TOS_SUCCESS) { + dev->usbsc_bat_level = out[2]; + /* + * If we reach this point, it means that the laptop has support + * for this feature and all values are initialized. + * Set it as supported. 
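+ * (The base value out[4] is kept in usbsc_mode_base and is later OR-ed
+ * with the SCI_USB_CHARGE_* bits by the usb_sleep_charge sysfs store
+ * handler.)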
+ */ + dev->usb_sleep_charge_supported = 1; + } + + sci_close(dev); +} + static int toshiba_usb_sleep_charge_get(struct toshiba_acpi_dev *dev, u32 *mode) { @@ -934,11 +975,11 @@ static int toshiba_usb_rapid_charge_get(struct toshiba_acpi_dev *dev, status = tci_raw(dev, in, out); sci_close(dev); if (ACPI_FAILURE(status) || out[0] == TOS_FAILURE) { - pr_err("ACPI call to get USB S&C battery level failed\n"); + pr_err("ACPI call to get USB Rapid Charge failed\n"); return -EIO; } else if (out[0] == TOS_NOT_SUPPORTED || out[0] == TOS_INPUT_DATA_ERROR) { - pr_info("USB Sleep and Charge not supported\n"); + pr_info("USB Rapid Charge not supported\n"); return -ENODEV; } @@ -962,10 +1003,10 @@ static int toshiba_usb_rapid_charge_set(struct toshiba_acpi_dev *dev, status = tci_raw(dev, in, out); sci_close(dev); if (ACPI_FAILURE(status) || out[0] == TOS_FAILURE) { - pr_err("ACPI call to set USB S&C battery level failed\n"); + pr_err("ACPI call to set USB Rapid Charge failed\n"); return -EIO; } else if (out[0] == TOS_NOT_SUPPORTED) { - pr_info("USB Sleep and Charge not supported\n"); + pr_info("USB Rapid Charge not supported\n"); return -ENODEV; } else if (out[0] == TOS_INPUT_DATA_ERROR) { return -EIO; @@ -984,10 +1025,10 @@ static int toshiba_usb_sleep_music_get(struct toshiba_acpi_dev *dev, u32 *state) result = sci_read(dev, SCI_USB_SLEEP_MUSIC, state); sci_close(dev); if (result == TOS_FAILURE) { - pr_err("ACPI call to set USB S&C mode failed\n"); + pr_err("ACPI call to get Sleep and Music failed\n"); return -EIO; } else if (result == TOS_NOT_SUPPORTED) { - pr_info("USB Sleep and Charge not supported\n"); + pr_info("Sleep and Music not supported\n"); return -ENODEV; } else if (result == TOS_INPUT_DATA_ERROR) { return -EIO; @@ -1006,10 +1047,10 @@ static int toshiba_usb_sleep_music_set(struct toshiba_acpi_dev *dev, u32 state) result = sci_write(dev, SCI_USB_SLEEP_MUSIC, state); sci_close(dev); if (result == TOS_FAILURE) { - pr_err("ACPI call to set USB S&C mode failed\n"); + pr_err("ACPI call to set Sleep and Music failed\n"); return -EIO; } else if (result == TOS_NOT_SUPPORTED) { - pr_info("USB Sleep and Charge not supported\n"); + pr_info("Sleep and Music not supported\n"); return -ENODEV; } else if (result == TOS_INPUT_DATA_ERROR) { return -EIO; @@ -1149,6 +1190,28 @@ static int toshiba_usb_three_set(struct toshiba_acpi_dev *dev, u32 state) return 0; } +/* Hotkey Event type */ +static int toshiba_hotkey_event_type_get(struct toshiba_acpi_dev *dev, + u32 *type) +{ + u32 val1 = 0x03; + u32 val2 = 0; + u32 result; + + result = hci_read2(dev, HCI_SYSTEM_INFO, &val1, &val2); + if (result == TOS_FAILURE) { + pr_err("ACPI call to get System type failed\n"); + return -EIO; + } else if (result == TOS_NOT_SUPPORTED) { + pr_info("System type not supported\n"); + return -ENODEV; + } + + *type = val2; + + return 0; +} + /* Bluetooth rfkill handlers */ static u32 hci_get_bt_present(struct toshiba_acpi_dev *dev, bool *present) @@ -1973,17 +2036,21 @@ static ssize_t usb_sleep_charge_store(struct device *dev, * 0 - Disabled * 1 - Alternate (Non USB conformant devices that require more power) * 2 - Auto (USB conformant devices) + * 3 - Typical */ - if (state != 0 && state != 1 && state != 2) + if (state != 0 && state != 1 && state != 2 && state != 3) return -EINVAL; /* Set the USB charging mode to internal value */ + mode = toshiba->usbsc_mode_base; if (state == 0) - mode = SCI_USB_CHARGE_DISABLED; + mode |= SCI_USB_CHARGE_DISABLED; else if (state == 1) - mode = SCI_USB_CHARGE_ALTERNATE; + mode |= 
SCI_USB_CHARGE_ALTERNATE; else if (state == 2) - mode = SCI_USB_CHARGE_AUTO; + mode |= SCI_USB_CHARGE_AUTO; + else if (state == 3) + mode |= SCI_USB_CHARGE_TYPICAL; ret = toshiba_usb_sleep_charge_set(toshiba, mode); if (ret) @@ -2333,6 +2400,20 @@ static int toshiba_acpi_enable_hotkeys(struct toshiba_acpi_dev *dev) return 0; } +static void toshiba_acpi_enable_special_functions(struct toshiba_acpi_dev *dev) +{ + u32 result; + + /* + * Re-activate the hotkeys, but this time, we are using the + * "Special Functions" mode. + */ + result = hci_write1(dev, HCI_HOTKEY_EVENT, + HCI_HOTKEY_SPECIAL_FUNCTIONS); + if (result != TOS_SUCCESS) + pr_err("Could not enable the Special Function mode\n"); +} + static bool toshiba_acpi_i8042_filter(unsigned char data, unsigned char str, struct serio *port) { @@ -2434,10 +2515,22 @@ static void toshiba_acpi_process_hotkeys(struct toshiba_acpi_dev *dev) static int toshiba_acpi_setup_keyboard(struct toshiba_acpi_dev *dev) { + const struct key_entry *keymap = toshiba_acpi_keymap; acpi_handle ec_handle; - int error; + u32 events_type; u32 hci_result; - const struct key_entry *keymap = toshiba_acpi_keymap; + int error; + + error = toshiba_acpi_enable_hotkeys(dev); + if (error) + return error; + + error = toshiba_hotkey_event_type_get(dev, &events_type); + if (error) { + pr_err("Unable to query Hotkey Event Type\n"); + return error; + } + dev->hotkey_event_type = events_type; dev->hotkey_dev = input_allocate_device(); if (!dev->hotkey_dev) @@ -2447,8 +2540,14 @@ static int toshiba_acpi_setup_keyboard(struct toshiba_acpi_dev *dev) dev->hotkey_dev->phys = "toshiba_acpi/input0"; dev->hotkey_dev->id.bustype = BUS_HOST; - if (dmi_check_system(toshiba_alt_keymap_dmi)) + if (events_type == HCI_SYSTEM_TYPE1 || + !dev->kbd_function_keys_supported) + keymap = toshiba_acpi_keymap; + else if (events_type == HCI_SYSTEM_TYPE2 || + dev->kbd_function_keys_supported) keymap = toshiba_acpi_alt_keymap; + else + pr_info("Unknown event type received %x\n", events_type); error = sparse_keymap_setup(dev->hotkey_dev, keymap, NULL); if (error) goto err_free_dev; @@ -2490,12 +2589,6 @@ static int toshiba_acpi_setup_keyboard(struct toshiba_acpi_dev *dev) goto err_remove_filter; } - error = toshiba_acpi_enable_hotkeys(dev); - if (error) { - pr_info("Unable to enable hotkeys\n"); - goto err_remove_filter; - } - error = input_register_device(dev->hotkey_dev); if (error) { pr_info("Unable to register input device\n"); @@ -2541,6 +2634,20 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev) ret = get_tr_backlight_status(dev, &enabled); dev->tr_backlight_supported = !ret; + /* + * Tell acpi-video-detect code to prefer vendor backlight on all + * systems with transflective backlight and on dmi matched systems. 
+ */ + if (dev->tr_backlight_supported || + dmi_check_system(toshiba_vendor_backlight_dmi)) + acpi_video_dmi_promote_vendor(); + + if (acpi_video_backlight_support()) + return 0; + + /* acpi-video may have loaded before we called dmi_promote_vendor() */ + acpi_video_unregister_backlight(); + memset(&props, 0, sizeof(props)); props.type = BACKLIGHT_PLATFORM; props.max_brightness = HCI_LCD_BRIGHTNESS_LEVELS - 1; @@ -2624,6 +2731,7 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) { struct toshiba_acpi_dev *dev; const char *hci_method; + u32 special_functions; u32 dummy; bool bt_present; int ret = 0; @@ -2648,6 +2756,16 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) acpi_dev->driver_data = dev; dev_set_drvdata(&acpi_dev->dev, dev); + /* Query the BIOS for supported features */ + + /* + * The "Special Functions" are always supported by the laptops + * with the new keyboard layout, query for its presence to help + * determine the keymap layout to use. + */ + ret = toshiba_function_keys_get(dev, &special_functions); + dev->kbd_function_keys_supported = !ret; + if (toshiba_acpi_setup_keyboard(dev)) pr_info("Unable to activate hotkeys\n"); @@ -2716,8 +2834,7 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) ret = toshiba_accelerometer_supported(dev); dev->accelerometer_supported = !ret; - ret = toshiba_usb_sleep_charge_get(dev, &dummy); - dev->usb_sleep_charge_supported = !ret; + toshiba_usb_sleep_charge_available(dev); ret = toshiba_usb_rapid_charge_get(dev, &dummy); dev->usb_rapid_charge_supported = !ret; @@ -2725,23 +2842,25 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev) ret = toshiba_usb_sleep_music_get(dev, &dummy); dev->usb_sleep_music_supported = !ret; - ret = toshiba_function_keys_get(dev, &dummy); - dev->kbd_function_keys_supported = !ret; - ret = toshiba_panel_power_on_get(dev, &dummy); dev->panel_power_on_supported = !ret; ret = toshiba_usb_three_get(dev, &dummy); dev->usb_three_supported = !ret; - /* Determine whether or not BIOS supports fan and video interfaces */ - ret = get_video_status(dev, &dummy); dev->video_supported = !ret; ret = get_fan_status(dev, &dummy); dev->fan_supported = !ret; + /* + * Enable the "Special Functions" mode only if they are + * supported and if they are activated. 
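+ * (special_functions holds the raw value that toshiba_function_keys_get()
+ * returned during probe, a few lines above.)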
+ */ + if (dev->kbd_function_keys_supported && special_functions) + toshiba_acpi_enable_special_functions(dev); + ret = sysfs_create_group(&dev->acpi_dev->dev.kobj, &toshiba_attr_group); if (ret) { @@ -2770,6 +2889,21 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) case 0x80: /* Hotkeys and some system events */ toshiba_acpi_process_hotkeys(dev); break; + case 0x81: /* Dock events */ + case 0x82: + case 0x83: + pr_info("Dock event received %x\n", event); + break; + case 0x88: /* Thermal events */ + pr_info("Thermal event received\n"); + break; + case 0x8f: /* LID closed */ + case 0x90: /* LID is closed and Dock has been ejected */ + break; + case 0x8c: /* SATA power events */ + case 0x8b: + pr_info("SATA power event received %x\n", event); + break; case 0x92: /* Keyboard backlight mode changed */ /* Update sysfs entries */ ret = sysfs_update_group(&acpi_dev->dev.kobj, @@ -2777,17 +2911,19 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event) if (ret) pr_err("Unable to update sysfs entries\n"); break; - case 0x81: /* Unknown */ - case 0x82: /* Unknown */ - case 0x83: /* Unknown */ - case 0x8c: /* Unknown */ + case 0x85: /* Unknown */ + case 0x8d: /* Unknown */ case 0x8e: /* Unknown */ - case 0x8f: /* Unknown */ - case 0x90: /* Unknown */ + case 0x94: /* Unknown */ + case 0x95: /* Unknown */ default: pr_info("Unknown event received %x\n", event); break; } + + acpi_bus_generate_netlink_event(acpi_dev->pnp.device_class, + dev_name(&acpi_dev->dev), + event, 0); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/platform/x86/toshiba_bluetooth.c b/drivers/platform/x86/toshiba_bluetooth.c index 2cb1ea62b4a7..249800763362 100644 --- a/drivers/platform/x86/toshiba_bluetooth.c +++ b/drivers/platform/x86/toshiba_bluetooth.c @@ -2,6 +2,7 @@ * Toshiba Bluetooth Enable Driver * * Copyright (C) 2009 Jes Sorensen <Jes.Sorensen@gmail.com> + * Copyright (C) 2015 Azael Avalos <coproscefalo@gmail.com> * * Thanks to Matthew Garrett for background info on ACPI innards which * normal people aren't meant to understand :-) @@ -25,6 +26,10 @@ #include <linux/types.h> #include <linux/acpi.h> +#define BT_KILLSWITCH_MASK 0x01 +#define BT_PLUGGED_MASK 0x40 +#define BT_POWER_MASK 0x80 + MODULE_AUTHOR("Jes Sorensen <Jes.Sorensen@gmail.com>"); MODULE_DESCRIPTION("Toshiba Laptop ACPI Bluetooth Enable Driver"); MODULE_LICENSE("GPL"); @@ -57,32 +62,107 @@ static struct acpi_driver toshiba_bt_rfkill_driver = { .drv.pm = &toshiba_bt_pm, }; +static int toshiba_bluetooth_present(acpi_handle handle) +{ + acpi_status result; + u64 bt_present; + + /* + * Some Toshiba laptops may have a fake TOS6205 device in + * their ACPI BIOS, so query the _STA method to see if there + * is really anything there. 
+ */ + result = acpi_evaluate_integer(handle, "_STA", NULL, &bt_present); + if (ACPI_FAILURE(result)) { + pr_err("ACPI call to query Bluetooth presence failed\n"); + return -ENXIO; + } else if (!bt_present) { + pr_info("Bluetooth device not present\n"); + return -ENODEV; + } + + return 0; +} + +static int toshiba_bluetooth_status(acpi_handle handle) +{ + acpi_status result; + u64 status; + + result = acpi_evaluate_integer(handle, "BTST", NULL, &status); + if (ACPI_FAILURE(result)) { + pr_err("Could not get Bluetooth device status\n"); + return -ENXIO; + } + + pr_info("Bluetooth status %llu\n", status); + + return status; +} static int toshiba_bluetooth_enable(acpi_handle handle) { - acpi_status res1, res2; - u64 result; + acpi_status result; + bool killswitch; + bool powered; + bool plugged; + int status; /* * Query ACPI to verify RFKill switch is set to 'on'. * If not, we return silently, no need to report it as * an error. */ - res1 = acpi_evaluate_integer(handle, "BTST", NULL, &result); - if (ACPI_FAILURE(res1)) - return res1; - if (!(result & 0x01)) - return 0; + status = toshiba_bluetooth_status(handle); + if (status < 0) + return status; + + killswitch = (status & BT_KILLSWITCH_MASK) ? true : false; + powered = (status & BT_POWER_MASK) ? true : false; + plugged = (status & BT_PLUGGED_MASK) ? true : false; - pr_info("Re-enabling Toshiba Bluetooth\n"); - res1 = acpi_evaluate_object(handle, "AUSB", NULL, NULL); - res2 = acpi_evaluate_object(handle, "BTPO", NULL, NULL); - if (!ACPI_FAILURE(res1) || !ACPI_FAILURE(res2)) + if (!killswitch) return 0; + /* + * This check ensures we only enable the device if it is powered + * off or detached, as some recent devices somehow pass the killswitch + * test, causing a loop enabling/disabling the device, see bug 93911. + */ + if (powered || plugged) + return 0; + + result = acpi_evaluate_object(handle, "AUSB", NULL, NULL); + if (ACPI_FAILURE(result)) { + pr_err("Could not attach USB Bluetooth device\n"); + return -ENXIO; + } + + result = acpi_evaluate_object(handle, "BTPO", NULL, NULL); + if (ACPI_FAILURE(result)) { + pr_err("Could not power ON Bluetooth device\n"); + return -ENXIO; + } + + return 0; +} + +static int toshiba_bluetooth_disable(acpi_handle handle) +{ + acpi_status result; + + result = acpi_evaluate_object(handle, "BTPF", NULL, NULL); + if (ACPI_FAILURE(result)) { + pr_err("Could not power OFF Bluetooth device\n"); + return -ENXIO; + } - pr_warn("Failed to re-enable Toshiba Bluetooth\n"); + result = acpi_evaluate_object(handle, "DUSB", NULL, NULL); + if (ACPI_FAILURE(result)) { + pr_err("Could not detach USB Bluetooth device\n"); + return -ENXIO; + } - return -ENODEV; + return 0; } static void toshiba_bt_rfkill_notify(struct acpi_device *device, u32 event) @@ -99,23 +179,18 @@ static int toshiba_bt_resume(struct device *dev) static int toshiba_bt_rfkill_add(struct acpi_device *device) { - acpi_status status; - u64 bt_present; - int result = -ENODEV; + int result; - /* - * Some Toshiba laptops may have a fake TOS6205 device in - * their ACPI BIOS, so query the _STA method to see if there - * is really anything there, before trying to enable it. 
- */ - status = acpi_evaluate_integer(device->handle, "_STA", NULL, - &bt_present); + result = toshiba_bluetooth_present(device->handle); + if (result) + return result; - if (!ACPI_FAILURE(status) && bt_present) { - pr_info("Detected Toshiba ACPI Bluetooth device - " - "installing RFKill handler\n"); - result = toshiba_bluetooth_enable(device->handle); - } + pr_info("Toshiba ACPI Bluetooth device driver\n"); + + /* Enable the BT device */ + result = toshiba_bluetooth_enable(device->handle); + if (result) + return result; return result; } @@ -123,7 +198,7 @@ static int toshiba_bt_rfkill_add(struct acpi_device *device) static int toshiba_bt_rfkill_remove(struct acpi_device *device) { /* clean up */ - return 0; + return toshiba_bluetooth_disable(device->handle); } module_acpi_driver(toshiba_bt_rfkill_driver); diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 737e56d46f61..aac47573f9ed 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -45,7 +45,6 @@ MODULE_LICENSE("GPL"); #define ACPI_WMI_CLASS "wmi" -static DEFINE_MUTEX(wmi_data_lock); static LIST_HEAD(wmi_block_list); struct guid_block { @@ -240,10 +239,10 @@ static bool find_guid(const char *guid_string, struct wmi_block **out) if (memcmp(block->guid, guid_input, 16) == 0) { if (out) *out = wblock; - return 1; + return true; } } - return 0; + return false; } static acpi_status wmi_method_enable(struct wmi_block *wblock, int enable) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index e03877c4b195..fd243231620a 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1064,6 +1064,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { RAPL_CPU(0x3f, rapl_defaults_hsw_server),/* Haswell servers */ RAPL_CPU(0x4f, rapl_defaults_hsw_server),/* Broadwell servers */ RAPL_CPU(0x45, rapl_defaults_core),/* Haswell ULT */ + RAPL_CPU(0x4E, rapl_defaults_core),/* Skylake */ RAPL_CPU(0x4C, rapl_defaults_atom),/* Braswell */ RAPL_CPU(0x4A, rapl_defaults_atom),/* Tangier */ RAPL_CPU(0x56, rapl_defaults_core),/* Future Xeon */ diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 810aef3f4c3e..ba34c7d89042 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -573,7 +573,7 @@ EXPORT_SYMBOL_GPL(of_pwm_get); * @table: array of consumers to register * @num: number of consumers in table */ -void __init pwm_add_table(struct pwm_lookup *table, size_t num) +void pwm_add_table(struct pwm_lookup *table, size_t num) { mutex_lock(&pwm_lookup_lock); diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c index 522f7075bb1a..fa5feaba25a5 100644 --- a/drivers/pwm/pwm-atmel-hlcdc.c +++ b/drivers/pwm/pwm-atmel-hlcdc.c @@ -225,6 +225,10 @@ static const struct of_device_id atmel_hlcdc_dt_ids[] = { .compatible = "atmel,sama5d3-hlcdc", .data = &atmel_hlcdc_pwm_sama5d3_errata, }, + { + .compatible = "atmel,sama5d4-hlcdc", + .data = &atmel_hlcdc_pwm_sama5d3_errata, + }, { /* sentinel */ }, }; diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c index f75ecb09d97d..b430811e14f5 100644 --- a/drivers/pwm/pwm-mxs.c +++ b/drivers/pwm/pwm-mxs.c @@ -35,6 +35,10 @@ #define PERIOD_CDIV(div) (((div) & 0x7) << 20) #define PERIOD_CDIV_MAX 8 +static const unsigned int cdiv[PERIOD_CDIV_MAX] = { + 1, 2, 4, 8, 16, 64, 256, 1024 +}; + struct mxs_pwm_chip { struct pwm_chip chip; struct clk *clk; @@ -54,13 +58,13 @@ static int mxs_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, rate = clk_get_rate(mxs->clk); while (1) { - c = rate / (1 << 
div); + c = rate / cdiv[div]; c = c * period_ns; do_div(c, 1000000000); if (c < PERIOD_PERIOD_MAX) break; div++; - if (div > PERIOD_CDIV_MAX) + if (div >= PERIOD_CDIV_MAX) return -EINVAL; } diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 3fb775ded0df..34b5c275a92a 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -202,7 +202,7 @@ static const struct pwm_ops pca9685_pwm_ops = { .owner = THIS_MODULE, }; -static struct regmap_config pca9685_regmap_i2c_config = { +static const struct regmap_config pca9685_regmap_i2c_config = { .reg_bits = 8, .val_bits = 8, .max_register = PCA9685_NUMREGS, diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c index 3e9b5835a4af..ff201e1b9219 100644 --- a/drivers/pwm/pwm-samsung.c +++ b/drivers/pwm/pwm-samsung.c @@ -269,12 +269,31 @@ static void pwm_samsung_disable(struct pwm_chip *chip, struct pwm_device *pwm) spin_unlock_irqrestore(&samsung_pwm_lock, flags); } +static void pwm_samsung_manual_update(struct samsung_pwm_chip *chip, + struct pwm_device *pwm) +{ + unsigned int tcon_chan = to_tcon_channel(pwm->hwpwm); + u32 tcon; + unsigned long flags; + + spin_lock_irqsave(&samsung_pwm_lock, flags); + + tcon = readl(chip->base + REG_TCON); + tcon |= TCON_MANUALUPDATE(tcon_chan); + writel(tcon, chip->base + REG_TCON); + + tcon &= ~TCON_MANUALUPDATE(tcon_chan); + writel(tcon, chip->base + REG_TCON); + + spin_unlock_irqrestore(&samsung_pwm_lock, flags); +} + static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct samsung_pwm_chip *our_chip = to_samsung_pwm_chip(chip); struct samsung_pwm_channel *chan = pwm_get_chip_data(pwm); - u32 tin_ns = chan->tin_ns, tcnt, tcmp; + u32 tin_ns = chan->tin_ns, tcnt, tcmp, oldtcmp; /* * We currently avoid using 64bit arithmetic by using the @@ -288,6 +307,7 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; tcnt = readl(our_chip->base + REG_TCNTB(pwm->hwpwm)); + oldtcmp = readl(our_chip->base + REG_TCMPB(pwm->hwpwm)); /* We need tick count for calculation, not last tick. */ ++tcnt; @@ -335,6 +355,16 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, writel(tcnt, our_chip->base + REG_TCNTB(pwm->hwpwm)); writel(tcmp, our_chip->base + REG_TCMPB(pwm->hwpwm)); + /* + * In case the PWM is currently at 100% duty cycle, force a manual + * update to prevent the signal staying high if the PWM is disabled + * shortly after this update (before it autoreloaded the new values). 
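+ * (TCON_MANUALUPDATE latches the freshly written TCNTB/TCMPB values
+ * into the counter immediately, instead of waiting for the
+ * end-of-period auto-reload; see pwm_samsung_manual_update() above.)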
+ */ + if (oldtcmp == (u32) -1) { + dev_dbg(our_chip->chip.dev, "Forcing manual update"); + pwm_samsung_manual_update(our_chip, pwm); + } + chan->period_ns = period_ns; chan->tin_ns = tin_ns; chan->duty_ns = duty_ns; diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 71d7802aa8b4..6f1fa1773e76 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -1201,13 +1201,9 @@ static int virtio_ccw_online(struct ccw_device *cdev) vcdev->vdev.id.vendor = cdev->id.cu_type; vcdev->vdev.id.device = cdev->id.cu_model; - if (virtio_device_is_legacy_only(vcdev->vdev.id)) { - vcdev->revision = 0; - } else { - ret = virtio_ccw_set_transport_rev(vcdev); - if (ret) - goto out_free; - } + ret = virtio_ccw_set_transport_rev(vcdev); + if (ret) + goto out_free; ret = register_virtio_device(&vcdev->vdev); if (ret) { diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 57418258c101..fe8a8d157e22 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -3065,7 +3065,7 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha, uint32_t handle, { struct qla_hw_data *ha = vha->hw; struct se_cmd *se_cmd; - struct target_core_fabric_ops *tfo; + const struct target_core_fabric_ops *tfo; struct qla_tgt_cmd *cmd; if (handle & CTIO_INTERMEDIATE_HANDLE_MARK) { diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index ab4879e12ea7..68c2002e78bf 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -53,9 +53,8 @@ static struct workqueue_struct *tcm_qla2xxx_free_wq; static struct workqueue_struct *tcm_qla2xxx_cmd_wq; -/* Local pointer to allocated TCM configfs fabric module */ -static struct target_fabric_configfs *tcm_qla2xxx_fabric_configfs; -static struct target_fabric_configfs *tcm_qla2xxx_npiv_fabric_configfs; +static const struct target_core_fabric_ops tcm_qla2xxx_ops; +static const struct target_core_fabric_ops tcm_qla2xxx_npiv_ops; /* * Parse WWN. 
@@ -336,6 +335,14 @@ static int tcm_qla2xxx_check_demo_mode_login_only(struct se_portal_group *se_tpg return tpg->tpg_attrib.demo_mode_login_only; } +static int tcm_qla2xxx_check_prot_fabric_only(struct se_portal_group *se_tpg) +{ + struct tcm_qla2xxx_tpg *tpg = container_of(se_tpg, + struct tcm_qla2xxx_tpg, se_tpg); + + return tpg->tpg_attrib.fabric_prot_type; +} + static struct se_node_acl *tcm_qla2xxx_alloc_fabric_acl( struct se_portal_group *se_tpg) { @@ -1082,8 +1089,53 @@ static ssize_t tcm_qla2xxx_tpg_store_enable( TF_TPG_BASE_ATTR(tcm_qla2xxx, enable, S_IRUGO | S_IWUSR); +static ssize_t tcm_qla2xxx_tpg_show_dynamic_sessions( + struct se_portal_group *se_tpg, + char *page) +{ + return target_show_dynamic_sessions(se_tpg, page); +} + +TF_TPG_BASE_ATTR_RO(tcm_qla2xxx, dynamic_sessions); + +static ssize_t tcm_qla2xxx_tpg_store_fabric_prot_type( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct tcm_qla2xxx_tpg *tpg = container_of(se_tpg, + struct tcm_qla2xxx_tpg, se_tpg); + unsigned long val; + int ret = kstrtoul(page, 0, &val); + + if (ret) { + pr_err("kstrtoul() returned %d for fabric_prot_type\n", ret); + return ret; + } + if (val != 0 && val != 1 && val != 3) { + pr_err("Invalid qla2xxx fabric_prot_type: %lu\n", val); + return -EINVAL; + } + tpg->tpg_attrib.fabric_prot_type = val; + + return count; +} + +static ssize_t tcm_qla2xxx_tpg_show_fabric_prot_type( + struct se_portal_group *se_tpg, + char *page) +{ + struct tcm_qla2xxx_tpg *tpg = container_of(se_tpg, + struct tcm_qla2xxx_tpg, se_tpg); + + return sprintf(page, "%d\n", tpg->tpg_attrib.fabric_prot_type); +} +TF_TPG_BASE_ATTR(tcm_qla2xxx, fabric_prot_type, S_IRUGO | S_IWUSR); + static struct configfs_attribute *tcm_qla2xxx_tpg_attrs[] = { &tcm_qla2xxx_tpg_enable.attr, + &tcm_qla2xxx_tpg_dynamic_sessions.attr, + &tcm_qla2xxx_tpg_fabric_prot_type.attr, NULL, }; @@ -1124,7 +1176,7 @@ static struct se_portal_group *tcm_qla2xxx_make_tpg( tpg->tpg_attrib.cache_dynamic_acls = 1; tpg->tpg_attrib.demo_mode_login_only = 1; - ret = core_tpg_register(&tcm_qla2xxx_fabric_configfs->tf_ops, wwn, + ret = core_tpg_register(&tcm_qla2xxx_ops, wwn, &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) { kfree(tpg); @@ -1244,7 +1296,7 @@ static struct se_portal_group *tcm_qla2xxx_npiv_make_tpg( tpg->tpg_attrib.cache_dynamic_acls = 1; tpg->tpg_attrib.demo_mode_login_only = 1; - ret = core_tpg_register(&tcm_qla2xxx_npiv_fabric_configfs->tf_ops, wwn, + ret = core_tpg_register(&tcm_qla2xxx_npiv_ops, wwn, &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) { kfree(tpg); @@ -1560,7 +1612,7 @@ static int tcm_qla2xxx_check_initiator_node_acl( se_sess = transport_init_session_tags(num_tags, sizeof(struct qla_tgt_cmd), - TARGET_PROT_NORMAL); + TARGET_PROT_ALL); if (IS_ERR(se_sess)) { pr_err("Unable to initialize struct se_session\n"); return PTR_ERR(se_sess); @@ -1934,7 +1986,9 @@ static struct configfs_attribute *tcm_qla2xxx_wwn_attrs[] = { NULL, }; -static struct target_core_fabric_ops tcm_qla2xxx_ops = { +static const struct target_core_fabric_ops tcm_qla2xxx_ops = { + .module = THIS_MODULE, + .name = "qla2xxx", .get_fabric_name = tcm_qla2xxx_get_fabric_name, .get_fabric_proto_ident = tcm_qla2xxx_get_fabric_proto_ident, .tpg_get_wwn = tcm_qla2xxx_get_fabric_wwn, @@ -1949,6 +2003,7 @@ static struct target_core_fabric_ops tcm_qla2xxx_ops = { tcm_qla2xxx_check_demo_write_protect, .tpg_check_prod_mode_write_protect = tcm_qla2xxx_check_prod_write_protect, + .tpg_check_prot_fabric_only = 
tcm_qla2xxx_check_prot_fabric_only, .tpg_check_demo_mode_login_only = tcm_qla2xxx_check_demo_mode_login_only, .tpg_alloc_fabric_acl = tcm_qla2xxx_alloc_fabric_acl, .tpg_release_fabric_acl = tcm_qla2xxx_release_fabric_acl, @@ -1983,9 +2038,15 @@ static struct target_core_fabric_ops tcm_qla2xxx_ops = { .fabric_drop_np = NULL, .fabric_make_nodeacl = tcm_qla2xxx_make_nodeacl, .fabric_drop_nodeacl = tcm_qla2xxx_drop_nodeacl, + + .tfc_wwn_attrs = tcm_qla2xxx_wwn_attrs, + .tfc_tpg_base_attrs = tcm_qla2xxx_tpg_attrs, + .tfc_tpg_attrib_attrs = tcm_qla2xxx_tpg_attrib_attrs, }; -static struct target_core_fabric_ops tcm_qla2xxx_npiv_ops = { +static const struct target_core_fabric_ops tcm_qla2xxx_npiv_ops = { + .module = THIS_MODULE, + .name = "qla2xxx_npiv", .get_fabric_name = tcm_qla2xxx_npiv_get_fabric_name, .get_fabric_proto_ident = tcm_qla2xxx_get_fabric_proto_ident, .tpg_get_wwn = tcm_qla2xxx_get_fabric_wwn, @@ -2033,94 +2094,26 @@ static struct target_core_fabric_ops tcm_qla2xxx_npiv_ops = { .fabric_drop_np = NULL, .fabric_make_nodeacl = tcm_qla2xxx_make_nodeacl, .fabric_drop_nodeacl = tcm_qla2xxx_drop_nodeacl, + + .tfc_wwn_attrs = tcm_qla2xxx_wwn_attrs, + .tfc_tpg_base_attrs = tcm_qla2xxx_npiv_tpg_attrs, }; static int tcm_qla2xxx_register_configfs(void) { - struct target_fabric_configfs *fabric, *npiv_fabric; int ret; pr_debug("TCM QLOGIC QLA2XXX fabric module %s on %s/%s on " UTS_RELEASE"\n", TCM_QLA2XXX_VERSION, utsname()->sysname, utsname()->machine); - /* - * Register the top level struct config_item_type with TCM core - */ - fabric = target_fabric_configfs_init(THIS_MODULE, "qla2xxx"); - if (IS_ERR(fabric)) { - pr_err("target_fabric_configfs_init() failed\n"); - return PTR_ERR(fabric); - } - /* - * Setup fabric->tf_ops from our local tcm_qla2xxx_ops - */ - fabric->tf_ops = tcm_qla2xxx_ops; - /* - * Setup default attribute lists for various fabric->tf_cit_tmpl - */ - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = tcm_qla2xxx_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = tcm_qla2xxx_tpg_attrs; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = - tcm_qla2xxx_tpg_attrib_attrs; - fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - /* - * Register the fabric for use within TCM - */ - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - pr_err("target_fabric_configfs_register() failed for TCM_QLA2XXX\n"); + + ret = target_register_template(&tcm_qla2xxx_ops); + if (ret) return ret; - } - /* - * Setup our local pointer to *fabric - */ - tcm_qla2xxx_fabric_configfs = fabric; - pr_debug("TCM_QLA2XXX[0] - Set fabric -> tcm_qla2xxx_fabric_configfs\n"); - /* - * Register the top level struct config_item_type for NPIV with TCM core - */ - npiv_fabric = target_fabric_configfs_init(THIS_MODULE, "qla2xxx_npiv"); - if (IS_ERR(npiv_fabric)) { - pr_err("target_fabric_configfs_init() failed\n"); - ret = PTR_ERR(npiv_fabric); - goto out_fabric; - } - /* - * Setup fabric->tf_ops from our local tcm_qla2xxx_npiv_ops - */ - npiv_fabric->tf_ops = tcm_qla2xxx_npiv_ops; - /* - * Setup default attribute lists for various npiv_fabric->tf_cit_tmpl - */ - npiv_fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = tcm_qla2xxx_wwn_attrs; - npiv_fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = - 
tcm_qla2xxx_npiv_tpg_attrs; - npiv_fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; - npiv_fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - npiv_fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - npiv_fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; - npiv_fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - npiv_fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; - npiv_fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - /* - * Register the npiv_fabric for use within TCM - */ - ret = target_fabric_configfs_register(npiv_fabric); - if (ret < 0) { - pr_err("target_fabric_configfs_register() failed for TCM_QLA2XXX\n"); + ret = target_register_template(&tcm_qla2xxx_npiv_ops); + if (ret) goto out_fabric; - } - /* - * Setup our local pointer to *npiv_fabric - */ - tcm_qla2xxx_npiv_fabric_configfs = npiv_fabric; - pr_debug("TCM_QLA2XXX[0] - Set fabric -> tcm_qla2xxx_npiv_fabric_configfs\n"); tcm_qla2xxx_free_wq = alloc_workqueue("tcm_qla2xxx_free", WQ_MEM_RECLAIM, 0); @@ -2140,9 +2133,9 @@ static int tcm_qla2xxx_register_configfs(void) out_free_wq: destroy_workqueue(tcm_qla2xxx_free_wq); out_fabric_npiv: - target_fabric_configfs_deregister(tcm_qla2xxx_npiv_fabric_configfs); + target_unregister_template(&tcm_qla2xxx_npiv_ops); out_fabric: - target_fabric_configfs_deregister(tcm_qla2xxx_fabric_configfs); + target_unregister_template(&tcm_qla2xxx_ops); return ret; } @@ -2151,13 +2144,8 @@ static void tcm_qla2xxx_deregister_configfs(void) destroy_workqueue(tcm_qla2xxx_cmd_wq); destroy_workqueue(tcm_qla2xxx_free_wq); - target_fabric_configfs_deregister(tcm_qla2xxx_fabric_configfs); - tcm_qla2xxx_fabric_configfs = NULL; - pr_debug("TCM_QLA2XXX[0] - Cleared tcm_qla2xxx_fabric_configfs\n"); - - target_fabric_configfs_deregister(tcm_qla2xxx_npiv_fabric_configfs); - tcm_qla2xxx_npiv_fabric_configfs = NULL; - pr_debug("TCM_QLA2XXX[0] - Cleared tcm_qla2xxx_npiv_fabric_configfs\n"); + target_unregister_template(&tcm_qla2xxx_ops); + target_unregister_template(&tcm_qla2xxx_npiv_ops); } static int __init tcm_qla2xxx_init(void) diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.h b/drivers/scsi/qla2xxx/tcm_qla2xxx.h index 10c002145648..23295115c9fc 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.h +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.h @@ -33,6 +33,7 @@ struct tcm_qla2xxx_tpg_attrib { int demo_mode_write_protect; int prod_mode_write_protect; int demo_mode_login_only; + int fabric_prot_type; }; struct tcm_qla2xxx_tpg { diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c index 186924aa4740..f6bac9e77d06 100644 --- a/drivers/spi/spi-rspi.c +++ b/drivers/spi/spi-rspi.c @@ -1023,7 +1023,6 @@ static struct dma_chan *rspi_request_dma_chan(struct device *dev, } memset(&cfg, 0, sizeof(cfg)); - cfg.slave_id = id; cfg.direction = dir; if (dir == DMA_MEM_TO_DEV) { cfg.dst_addr = port_addr; diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index e57eec0b2f46..bcc7c635d8e7 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -1030,7 +1030,6 @@ static struct dma_chan *sh_msiof_request_dma_chan(struct device *dev, } memset(&cfg, 0, sizeof(cfg)); - cfg.slave_id = id; cfg.direction = dir; if (dir == DMA_MEM_TO_DEV) { cfg.dst_addr = port_addr; diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 0e3d8c7add24..b0b96ab31954 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -1106,6 +1106,7 @@ struct dma_buf *ion_share_dma_buf(struct ion_client *client, 
struct ion_buffer *buffer; struct dma_buf *dmabuf; bool valid_handle; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); mutex_lock(&client->lock); valid_handle = ion_handle_validate(client, handle); @@ -1118,8 +1119,12 @@ struct dma_buf *ion_share_dma_buf(struct ion_client *client, ion_buffer_get(buffer); mutex_unlock(&client->lock); - dmabuf = dma_buf_export(buffer, &dma_buf_ops, buffer->size, O_RDWR, - NULL); + exp_info.ops = &dma_buf_ops; + exp_info.size = buffer->size; + exp_info.flags = O_RDWR; + exp_info.priv = buffer; + + dmabuf = dma_buf_export(&exp_info); if (IS_ERR(dmabuf)) { ion_buffer_put(buffer); return dmabuf; diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c index fe1fd05423e9..5af01351306d 100644 --- a/drivers/staging/lustre/lustre/llite/dcache.c +++ b/drivers/staging/lustre/lustre/llite/dcache.c @@ -153,7 +153,7 @@ static int ll_ddelete(const struct dentry *de) CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n", d_lustre_invalid((struct dentry *)de) ? "deleting" : "keeping", - de, de, de->d_parent, de->d_inode, + de, de, de->d_parent, d_inode(de), d_unhashed(de) ? "" : "hashed,", list_empty(&de->d_subdirs) ? "" : "subdirs"); @@ -167,8 +167,8 @@ static int ll_ddelete(const struct dentry *de) #if 0 /* if not ldlm lock for this inode, set i_nlink to 0 so that * this inode can be recycled later b=20433 */ - if (de->d_inode && !find_cbdata(de->d_inode)) - clear_nlink(de->d_inode); + if (d_really_is_positive(de) && !find_cbdata(d_inode(de))) + clear_nlink(d_inode(de)); #endif if (d_lustre_invalid((struct dentry *)de)) @@ -181,7 +181,7 @@ int ll_d_init(struct dentry *de) LASSERT(de != NULL); CDEBUG(D_DENTRY, "ldd on dentry %pd (%p) parent %p inode %p refc %d\n", - de, de, de->d_parent, de->d_inode, + de, de, de->d_parent, d_inode(de), d_count(de)); if (de->d_fsdata == NULL) { @@ -261,7 +261,7 @@ void ll_invalidate_aliases(struct inode *inode) ll_d_hlist_for_each_entry(dentry, p, &inode->i_dentry, d_u.d_alias) { CDEBUG(D_DENTRY, "dentry in drop %pd (%p) parent %p inode %p flags %d\n", dentry, dentry, dentry->d_parent, - dentry->d_inode, dentry->d_flags); + d_inode(dentry), dentry->d_flags); d_lustre_invalidate(dentry, 0); } @@ -309,7 +309,7 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode) static int ll_revalidate_dentry(struct dentry *dentry, unsigned int lookup_flags) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); /* * if open&create is set, talk to MDS to make sure file is created if @@ -329,7 +329,7 @@ static int ll_revalidate_dentry(struct dentry *dentry, if (lookup_flags & LOOKUP_RCU) return -ECHILD; - do_statahead_enter(dir, &dentry, dentry->d_inode == NULL); + do_statahead_enter(dir, &dentry, d_inode(dentry) == NULL); ll_statahead_mark(dir, dentry); return 1; } diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index 529062ea112b..4b44c634fcc3 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -388,7 +388,7 @@ int ll_file_release(struct inode *inode, struct file *file) static int ll_intent_file_open(struct dentry *dentry, void *lmm, int lmmsize, struct lookup_intent *itp) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ll_sb_info *sbi = ll_i2sbi(inode); struct dentry *parent = dentry->d_parent; const char *name = dentry->d_name.name; @@ -413,7 +413,7 @@ static int 
ll_intent_file_open(struct dentry *dentry, void *lmm, opc = LUSTRE_OPC_CREATE; } - op_data = ll_prep_md_op_data(NULL, parent->d_inode, + op_data = ll_prep_md_op_data(NULL, d_inode(parent), inode, name, len, O_RDWR, opc, NULL); if (IS_ERR(op_data)) @@ -2896,7 +2896,7 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc) static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ptlrpc_request *req = NULL; struct obd_export *exp; int rc = 0; @@ -2948,12 +2948,12 @@ static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) do_lookup() -> ll_revalidate_it(). We cannot use d_drop here to preserve get_cwd functionality on 2.6. Bug 10503 */ - if (!dentry->d_inode->i_nlink) + if (!d_inode(dentry)->i_nlink) d_lustre_invalidate(dentry, 0); ll_lookup_finish_locks(&oit, inode); - } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) { - struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) { + struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry)); u64 valid = OBD_MD_FLGETATTR; struct md_op_data *op_data; int ealen = 0; @@ -2991,7 +2991,7 @@ out: static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; rc = __ll_inode_revalidate(dentry, ibits); @@ -3019,7 +3019,7 @@ static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) { - struct inode *inode = de->d_inode; + struct inode *inode = d_inode(de); struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); int res = 0; diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index e7422f5c9c6f..5f918e3c4683 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -1488,7 +1488,7 @@ static inline void d_lustre_invalidate(struct dentry *dentry, int nested) { CDEBUG(D_DENTRY, "invalidate dentry %pd (%p) parent %p inode %p refc %d\n", dentry, dentry, - dentry->d_parent, dentry->d_inode, d_count(dentry)); + dentry->d_parent, d_inode(dentry), d_count(dentry)); spin_lock_nested(&dentry->d_lock, nested ? 
DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL); diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index bf1ec277a1dc..a27af7882170 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -1166,7 +1166,7 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data, struct md_open_data **mod) { struct lustre_md md; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *request = NULL; int rc, ia_valid; @@ -1290,7 +1290,7 @@ static int ll_setattr_ost(struct inode *inode, struct iattr *attr) */ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ll_inode_info *lli = ll_i2info(inode); struct md_op_data *op_data = NULL; struct md_open_data *mod = NULL; @@ -1465,7 +1465,7 @@ out: int ll_setattr(struct dentry *de, struct iattr *attr) { - int mode = de->d_inode->i_mode; + int mode = d_inode(de)->i_mode; if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index 243a7840457f..db43b81386f7 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -230,11 +230,11 @@ static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, static int ll_get_name(struct dentry *dentry, char *name, struct dentry *child) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); int rc; struct ll_getname_data lgd = { .lgd_name = name, - .lgd_fid = ll_i2info(child->d_inode)->lli_fid, + .lgd_fid = ll_i2info(d_inode(child))->lli_fid, .ctx.actor = ll_nfs_get_name_filldir, }; @@ -282,7 +282,7 @@ static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, static struct dentry *ll_get_parent(struct dentry *dchild) { struct ptlrpc_request *req = NULL; - struct inode *dir = dchild->d_inode; + struct inode *dir = d_inode(dchild); struct ll_sb_info *sbi; struct dentry *result = NULL; struct mdt_body *body; diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c index 49f1cb067ea2..5a25dcd10126 100644 --- a/drivers/staging/lustre/lustre/llite/namei.c +++ b/drivers/staging/lustre/lustre/llite/namei.c @@ -155,7 +155,7 @@ static void ll_invalidate_negative_children(struct inode *dir) list_for_each_entry_safe(child, tmp_subdir, &dentry->d_subdirs, d_child) { - if (child->d_inode == NULL) + if (d_really_is_negative(child)) d_lustre_invalidate(child, 1); } } @@ -392,7 +392,7 @@ struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) iput(inode); CDEBUG(D_DENTRY, "Reuse dentry %p inode %p refc %d flags %#x\n", - new, new->d_inode, d_count(new), new->d_flags); + new, d_inode(new), d_count(new), new->d_flags); return new; } } @@ -401,7 +401,7 @@ struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) return ERR_PTR(rc); d_add(de, inode); CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", - de, de->d_inode, d_count(de), de->d_flags); + de, d_inode(de), d_count(de), de->d_flags); return de; } @@ -448,7 +448,7 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, !it_disposition(it, DISP_OPEN_CREATE)) { /* With DISP_OPEN_CREATE dentry 
will instantiated in ll_create_it. */ - LASSERT((*de)->d_inode == NULL); + LASSERT(d_inode(*de) == NULL); d_instantiate(*de, inode); } @@ -541,7 +541,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, goto out; } - inode = dentry->d_inode; + inode = d_inode(dentry); if ((it->it_op & IT_OPEN) && inode && !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) { @@ -638,9 +638,9 @@ static int ll_atomic_open(struct inode *dir, struct dentry *dentry, *opened |= FILE_CREATED; } - if (dentry->d_inode && it_disposition(it, DISP_OPEN_OPEN)) { + if (d_really_is_positive(dentry) && it_disposition(it, DISP_OPEN_OPEN)) { /* Open dentry. */ - if (S_ISFIFO(dentry->d_inode->i_mode)) { + if (S_ISFIFO(d_inode(dentry)->i_mode)) { /* We cannot call open here as it would * deadlock. */ @@ -862,8 +862,8 @@ static int ll_create_nd(struct inode *dir, struct dentry *dentry, static inline void ll_get_child_fid(struct dentry *child, struct lu_fid *fid) { - if (child->d_inode) - *fid = *ll_inode2fid(child->d_inode); + if (d_really_is_positive(child)) + *fid = *ll_inode2fid(d_inode(child)); } /** @@ -1076,7 +1076,7 @@ static int ll_symlink(struct inode *dir, struct dentry *dentry, static int ll_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - struct inode *src = old_dentry->d_inode; + struct inode *src = d_inode(old_dentry); struct ll_sb_info *sbi = ll_i2sbi(dir); struct ptlrpc_request *request = NULL; struct md_op_data *op_data; diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c index b75562c6b5de..7f8071242f23 100644 --- a/drivers/staging/lustre/lustre/llite/statahead.c +++ b/drivers/staging/lustre/lustre/llite/statahead.c @@ -880,7 +880,7 @@ static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct lookup_intent it = { .it_op = IT_GETATTR, .d.lustre.it_lock_handle = 0 }; struct md_enqueue_info *minfo; @@ -926,7 +926,7 @@ static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, static void ll_statahead_one(struct dentry *parent, const char *entry_name, int entry_name_len) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct ll_inode_info *lli = ll_i2info(dir); struct ll_statahead_info *sai = lli->lli_sai; struct dentry *dentry = NULL; @@ -944,8 +944,8 @@ static void ll_statahead_one(struct dentry *parent, const char *entry_name, rc = do_sa_lookup(dir, entry); } else { rc = do_sa_revalidate(dir, entry, dentry); - if (rc == 1 && agl_should_run(sai, dentry->d_inode)) - ll_agl_add(sai, dentry->d_inode, entry->se_index); + if (rc == 1 && agl_should_run(sai, d_inode(dentry))) + ll_agl_add(sai, d_inode(dentry), entry->se_index); } if (dentry != NULL) @@ -968,7 +968,7 @@ static void ll_statahead_one(struct dentry *parent, const char *entry_name, static int ll_agl_thread(void *arg) { struct dentry *parent = (struct dentry *)arg; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct ll_inode_info *plli = ll_i2info(dir); struct ll_inode_info *clli; struct ll_sb_info *sbi = ll_i2sbi(dir); @@ -1042,7 +1042,7 @@ static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", sai, parent); - plli = ll_i2info(parent->d_inode); + plli = 
ll_i2info(d_inode(parent)); task = kthread_run(ll_agl_thread, parent, "ll_agl_%u", plli->lli_opendir_pid); if (IS_ERR(task)) { @@ -1059,7 +1059,7 @@ static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) static int ll_statahead_thread(void *arg) { struct dentry *parent = (struct dentry *)arg; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct ll_inode_info *plli = ll_i2info(dir); struct ll_inode_info *clli; struct ll_sb_info *sbi = ll_i2sbi(dir); @@ -1604,7 +1604,7 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), &bits); if (rc == 1) { - if ((*dentryp)->d_inode == NULL) { + if (d_inode(*dentryp) == NULL) { struct dentry *alias; alias = ll_splice_alias(inode, @@ -1614,13 +1614,13 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, return PTR_ERR(alias); } *dentryp = alias; - } else if ((*dentryp)->d_inode != inode) { + } else if (d_inode(*dentryp) != inode) { /* revalidate, but inode is recreated */ CDEBUG(D_READA, "stale dentry %pd inode %lu/%u, statahead inode %lu/%u\n", *dentryp, - (*dentryp)->d_inode->i_ino, - (*dentryp)->d_inode->i_generation, + d_inode(*dentryp)->i_ino, + d_inode(*dentryp)->i_generation, inode->i_ino, inode->i_generation); ll_sai_unplug(sai, entry); @@ -1666,8 +1666,8 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, /* get parent reference count here, and put it in ll_statahead_thread */ parent = dget((*dentryp)->d_parent); - if (unlikely(sai->sai_inode != parent->d_inode)) { - struct ll_inode_info *nlli = ll_i2info(parent->d_inode); + if (unlikely(sai->sai_inode != d_inode(parent))) { + struct ll_inode_info *nlli = ll_i2info(d_inode(parent)); CWARN("Race condition, someone changed %pd just now: old parent "DFID", new parent "DFID"\n", *dentryp, @@ -1689,7 +1689,7 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, ll_sai_get(sai); lli->lli_sai = sai; - plli = ll_i2info(parent->d_inode); + plli = ll_i2info(d_inode(parent)); rc = PTR_ERR(kthread_run(ll_statahead_thread, parent, "ll_sa_%u", plli->lli_opendir_pid)); thread = &sai->sai_thread; diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index 686b6a574cc5..3711e671a4df 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -120,7 +120,7 @@ failed: static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ptlrpc_request *request = NULL; int rc; char *symname = NULL; diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c index b439936b4524..e0fcbe1395fd 100644 --- a/drivers/staging/lustre/lustre/llite/xattr.c +++ b/drivers/staging/lustre/lustre/llite/xattr.c @@ -214,7 +214,7 @@ int ll_setxattr_common(struct inode *inode, const char *name, int ll_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); LASSERT(inode); LASSERT(name); @@ -267,7 +267,7 @@ int ll_setxattr(struct dentry *dentry, const char *name, int ll_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); LASSERT(inode); LASSERT(name); @@ -457,7 +457,7 @@ out: ssize_t ll_getxattr(struct dentry 
*dentry, const char *name, void *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); LASSERT(inode); LASSERT(name); @@ -545,7 +545,7 @@ out: ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc = 0, rc2 = 0; struct lov_mds_md *lmm = NULL; struct ptlrpc_request *request = NULL; diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index 81d44c477a5b..257361280510 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -31,12 +31,13 @@ config TCM_PSCSI Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered passthrough access to Linux/SCSI device -config TCM_USER +config TCM_USER2 tristate "TCM/USER Subsystem Plugin for Linux" depends on UIO && NET help Say Y here to enable the TCM/USER subsystem plugin for a userspace - process to handle requests + process to handle requests. This is version 2 of the ABI; version 1 + is obsolete. source "drivers/target/loopback/Kconfig" source "drivers/target/tcm_fc/Kconfig" diff --git a/drivers/target/Makefile b/drivers/target/Makefile index bbb4a7d638ef..e619c0266a79 100644 --- a/drivers/target/Makefile +++ b/drivers/target/Makefile @@ -22,7 +22,7 @@ obj-$(CONFIG_TARGET_CORE) += target_core_mod.o obj-$(CONFIG_TCM_IBLOCK) += target_core_iblock.o obj-$(CONFIG_TCM_FILEIO) += target_core_file.o obj-$(CONFIG_TCM_PSCSI) += target_core_pscsi.o -obj-$(CONFIG_TCM_USER) += target_core_user.o +obj-$(CONFIG_TCM_USER2) += target_core_user.o # Fabric modules obj-$(CONFIG_LOOPBACK_TARGET) += loopback/ diff --git a/drivers/target/iscsi/Makefile b/drivers/target/iscsi/Makefile index 13a92403fe3e..0f43be9c3453 100644 --- a/drivers/target/iscsi/Makefile +++ b/drivers/target/iscsi/Makefile @@ -1,6 +1,5 @@ iscsi_target_mod-y += iscsi_target_parameters.o \ iscsi_target_seq_pdu_list.o \ - iscsi_target_tq.o \ iscsi_target_auth.o \ iscsi_target_datain_values.o \ iscsi_target_device.o \ diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 77d64251af40..34871a628b11 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -33,8 +33,6 @@ #include <target/iscsi/iscsi_target_core.h> #include "iscsi_target_parameters.h" #include "iscsi_target_seq_pdu_list.h" -#include "iscsi_target_tq.h" -#include "iscsi_target_configfs.h" #include "iscsi_target_datain_values.h" #include "iscsi_target_erl0.h" #include "iscsi_target_erl1.h" @@ -537,7 +535,7 @@ static struct iscsit_transport iscsi_target_transport = { static int __init iscsi_target_init_module(void) { - int ret = 0; + int ret = 0, size; pr_debug("iSCSI-Target "ISCSIT_VERSION"\n"); @@ -546,24 +544,21 @@ static int __init iscsi_target_init_module(void) pr_err("Unable to allocate memory for iscsit_global\n"); return -1; } + spin_lock_init(&iscsit_global->ts_bitmap_lock); mutex_init(&auth_id_lock); spin_lock_init(&sess_idr_lock); idr_init(&tiqn_idr); idr_init(&sess_idr); - ret = iscsi_target_register_configfs(); - if (ret < 0) + ret = target_register_template(&iscsi_ops); + if (ret) goto out; - ret = iscsi_thread_set_init(); - if (ret < 0) + size = BITS_TO_LONGS(ISCSIT_BITMAP_BITS) * sizeof(long); + iscsit_global->ts_bitmap = vzalloc(size); + if (!iscsit_global->ts_bitmap) { + pr_err("Unable to allocate iscsit_global->ts_bitmap\n"); goto configfs_out; - - if (iscsi_allocate_thread_sets(TARGET_THREAD_SET_COUNT) != - TARGET_THREAD_SET_COUNT) { - 
pr_err("iscsi_allocate_thread_sets() returned" - " unexpected value!\n"); - goto ts_out1; } lio_qr_cache = kmem_cache_create("lio_qr_cache", @@ -572,7 +567,7 @@ static int __init iscsi_target_init_module(void) if (!lio_qr_cache) { pr_err("nable to kmem_cache_create() for" " lio_qr_cache\n"); - goto ts_out2; + goto bitmap_out; } lio_dr_cache = kmem_cache_create("lio_dr_cache", @@ -617,12 +612,13 @@ dr_out: kmem_cache_destroy(lio_dr_cache); qr_out: kmem_cache_destroy(lio_qr_cache); -ts_out2: - iscsi_deallocate_thread_sets(); -ts_out1: - iscsi_thread_set_free(); +bitmap_out: + vfree(iscsit_global->ts_bitmap); configfs_out: - iscsi_target_deregister_configfs(); + /* XXX: this probably wants it to be it's own unwind step.. */ + if (iscsit_global->discovery_tpg) + iscsit_tpg_disable_portal_group(iscsit_global->discovery_tpg, 1); + target_unregister_template(&iscsi_ops); out: kfree(iscsit_global); return -ENOMEM; @@ -630,8 +626,6 @@ out: static void __exit iscsi_target_cleanup_module(void) { - iscsi_deallocate_thread_sets(); - iscsi_thread_set_free(); iscsit_release_discovery_tpg(); iscsit_unregister_transport(&iscsi_target_transport); kmem_cache_destroy(lio_qr_cache); @@ -639,8 +633,15 @@ static void __exit iscsi_target_cleanup_module(void) kmem_cache_destroy(lio_ooo_cache); kmem_cache_destroy(lio_r2t_cache); - iscsi_target_deregister_configfs(); + /* + * Shutdown discovery sessions and disable discovery TPG + */ + if (iscsit_global->discovery_tpg) + iscsit_tpg_disable_portal_group(iscsit_global->discovery_tpg, 1); + target_unregister_template(&iscsi_ops); + + vfree(iscsit_global->ts_bitmap); kfree(iscsit_global); } @@ -990,7 +991,7 @@ int iscsit_setup_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd, /* * Initialize struct se_cmd descriptor from target_core_mod infrastructure */ - transport_init_se_cmd(&cmd->se_cmd, &lio_target_fabric_configfs->tf_ops, + transport_init_se_cmd(&cmd->se_cmd, &iscsi_ops, conn->sess->se_sess, be32_to_cpu(hdr->data_length), cmd->data_direction, sam_task_attr, cmd->sense_buffer + 2); @@ -1805,8 +1806,7 @@ iscsit_handle_task_mgt_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd, u8 tcm_function; int ret; - transport_init_se_cmd(&cmd->se_cmd, - &lio_target_fabric_configfs->tf_ops, + transport_init_se_cmd(&cmd->se_cmd, &iscsi_ops, conn->sess->se_sess, 0, DMA_NONE, TCM_SIMPLE_TAG, cmd->sense_buffer + 2); @@ -2155,7 +2155,6 @@ reject: cmd->text_in_ptr = NULL; return iscsit_reject_cmd(cmd, ISCSI_REASON_PROTOCOL_ERROR, buf); } -EXPORT_SYMBOL(iscsit_handle_text_cmd); int iscsit_logout_closesession(struct iscsi_cmd *cmd, struct iscsi_conn *conn) { @@ -3715,17 +3714,16 @@ static int iscsit_send_reject( void iscsit_thread_get_cpumask(struct iscsi_conn *conn) { - struct iscsi_thread_set *ts = conn->thread_set; int ord, cpu; /* - * thread_id is assigned from iscsit_global->ts_bitmap from - * within iscsi_thread_set.c:iscsi_allocate_thread_sets() + * bitmap_id is assigned from iscsit_global->ts_bitmap from + * within iscsit_start_kthreads() * - * Here we use thread_id to determine which CPU that this - * iSCSI connection's iscsi_thread_set will be scheduled to + * Here we use bitmap_id to determine which CPU that this + * iSCSI connection's RX/TX threads will be scheduled to * execute upon. 
*/ - ord = ts->thread_id % cpumask_weight(cpu_online_mask); + ord = conn->bitmap_id % cpumask_weight(cpu_online_mask); for_each_online_cpu(cpu) { if (ord-- == 0) { cpumask_set_cpu(cpu, conn->conn_cpumask); @@ -3914,7 +3912,7 @@ check_rsp_state: switch (state) { case ISTATE_SEND_LOGOUTRSP: if (!iscsit_logout_post_handler(cmd, conn)) - goto restart; + return -ECONNRESET; /* fall through */ case ISTATE_SEND_STATUS: case ISTATE_SEND_ASYNCMSG: @@ -3942,8 +3940,6 @@ check_rsp_state: err: return -1; -restart: - return -EAGAIN; } static int iscsit_handle_response_queue(struct iscsi_conn *conn) @@ -3970,21 +3966,13 @@ static int iscsit_handle_response_queue(struct iscsi_conn *conn) int iscsi_target_tx_thread(void *arg) { int ret = 0; - struct iscsi_conn *conn; - struct iscsi_thread_set *ts = arg; + struct iscsi_conn *conn = arg; /* * Allow ourselves to be interrupted by SIGINT so that a * connection recovery / failure event can be triggered externally. */ allow_signal(SIGINT); -restart: - conn = iscsi_tx_thread_pre_handler(ts); - if (!conn) - goto out; - - ret = 0; - while (!kthread_should_stop()) { /* * Ensure that both TX and RX per connection kthreads @@ -3993,11 +3981,9 @@ restart: iscsit_thread_check_cpumask(conn, current, 1); wait_event_interruptible(conn->queues_wq, - !iscsit_conn_all_queues_empty(conn) || - ts->status == ISCSI_THREAD_SET_RESET); + !iscsit_conn_all_queues_empty(conn)); - if ((ts->status == ISCSI_THREAD_SET_RESET) || - signal_pending(current)) + if (signal_pending(current)) goto transport_err; get_immediate: @@ -4008,15 +3994,14 @@ get_immediate: ret = iscsit_handle_response_queue(conn); if (ret == 1) goto get_immediate; - else if (ret == -EAGAIN) - goto restart; + else if (ret == -ECONNRESET) + goto out; else if (ret < 0) goto transport_err; } transport_err: iscsit_take_action_for_connection_exit(conn); - goto restart; out: return 0; } @@ -4111,8 +4096,7 @@ int iscsi_target_rx_thread(void *arg) int ret; u8 buffer[ISCSI_HDR_LEN], opcode; u32 checksum = 0, digest = 0; - struct iscsi_conn *conn = NULL; - struct iscsi_thread_set *ts = arg; + struct iscsi_conn *conn = arg; struct kvec iov; /* * Allow ourselves to be interrupted by SIGINT so that a @@ -4120,11 +4104,6 @@ int iscsi_target_rx_thread(void *arg) */ allow_signal(SIGINT); -restart: - conn = iscsi_rx_thread_pre_handler(ts); - if (!conn) - goto out; - if (conn->conn_transport->transport_type == ISCSI_INFINIBAND) { struct completion comp; int rc; @@ -4134,7 +4113,7 @@ restart: if (rc < 0) goto transport_err; - goto out; + goto transport_err; } while (!kthread_should_stop()) { @@ -4210,8 +4189,6 @@ transport_err: if (!signal_pending(current)) atomic_set(&conn->transport_failed, 1); iscsit_take_action_for_connection_exit(conn); - goto restart; -out: return 0; } @@ -4273,7 +4250,24 @@ int iscsit_close_connection( if (conn->conn_transport->transport_type == ISCSI_TCP) complete(&conn->conn_logout_comp); - iscsi_release_thread_set(conn); + if (!strcmp(current->comm, ISCSI_RX_THREAD_NAME)) { + if (conn->tx_thread && + cmpxchg(&conn->tx_thread_active, true, false)) { + send_sig(SIGINT, conn->tx_thread, 1); + kthread_stop(conn->tx_thread); + } + } else if (!strcmp(current->comm, ISCSI_TX_THREAD_NAME)) { + if (conn->rx_thread && + cmpxchg(&conn->rx_thread_active, true, false)) { + send_sig(SIGINT, conn->rx_thread, 1); + kthread_stop(conn->rx_thread); + } + } + + spin_lock(&iscsit_global->ts_bitmap_lock); + bitmap_release_region(iscsit_global->ts_bitmap, conn->bitmap_id, + get_order(1)); + spin_unlock(&iscsit_global->ts_bitmap_lock); 
iscsit_stop_timers_for_cmds(conn); iscsit_stop_nopin_response_timer(conn); @@ -4383,8 +4377,6 @@ int iscsit_close_connection( iscsit_put_transport(conn->conn_transport); - conn->thread_set = NULL; - pr_debug("Moving to TARG_CONN_STATE_FREE.\n"); conn->conn_state = TARG_CONN_STATE_FREE; kfree(conn); @@ -4551,15 +4543,13 @@ static void iscsit_logout_post_handler_closesession( struct iscsi_conn *conn) { struct iscsi_session *sess = conn->sess; - - iscsi_set_thread_clear(conn, ISCSI_CLEAR_TX_THREAD); - iscsi_set_thread_set_signal(conn, ISCSI_SIGNAL_TX_THREAD); + int sleep = cmpxchg(&conn->tx_thread_active, true, false); atomic_set(&conn->conn_logout_remove, 0); complete(&conn->conn_logout_comp); iscsit_dec_conn_usage_count(conn); - iscsit_stop_session(sess, 1, 1); + iscsit_stop_session(sess, sleep, sleep); iscsit_dec_session_usage_count(sess); target_put_session(sess->se_sess); } @@ -4567,13 +4557,12 @@ static void iscsit_logout_post_handler_closesession( static void iscsit_logout_post_handler_samecid( struct iscsi_conn *conn) { - iscsi_set_thread_clear(conn, ISCSI_CLEAR_TX_THREAD); - iscsi_set_thread_set_signal(conn, ISCSI_SIGNAL_TX_THREAD); + int sleep = cmpxchg(&conn->tx_thread_active, true, false); atomic_set(&conn->conn_logout_remove, 0); complete(&conn->conn_logout_comp); - iscsit_cause_connection_reinstatement(conn, 1); + iscsit_cause_connection_reinstatement(conn, sleep); iscsit_dec_conn_usage_count(conn); } diff --git a/drivers/target/iscsi/iscsi_target.h b/drivers/target/iscsi/iscsi_target.h index e936d56fb523..7d0f9c00d9c2 100644 --- a/drivers/target/iscsi/iscsi_target.h +++ b/drivers/target/iscsi/iscsi_target.h @@ -35,7 +35,7 @@ extern void iscsit_stop_session(struct iscsi_session *, int, int); extern int iscsit_release_sessions_for_tpg(struct iscsi_portal_group *, int); extern struct iscsit_global *iscsit_global; -extern struct target_fabric_configfs *lio_target_fabric_configfs; +extern const struct target_core_fabric_ops iscsi_ops; extern struct kmem_cache *lio_dr_cache; extern struct kmem_cache *lio_ooo_cache; diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index 48384b675e62..469fce44ebad 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -37,9 +37,6 @@ #include "iscsi_target_util.h" #include "iscsi_target.h" #include <target/iscsi/iscsi_target_stat.h> -#include "iscsi_target_configfs.h" - -struct target_fabric_configfs *lio_target_fabric_configfs; struct lio_target_configfs_attribute { struct configfs_attribute attr; @@ -1052,6 +1049,11 @@ TPG_ATTR(default_erl, S_IRUGO | S_IWUSR); */ DEF_TPG_ATTRIB(t10_pi); TPG_ATTR(t10_pi, S_IRUGO | S_IWUSR); +/* + * Define iscsi_tpg_attrib_s_fabric_prot_type + */ +DEF_TPG_ATTRIB(fabric_prot_type); +TPG_ATTR(fabric_prot_type, S_IRUGO | S_IWUSR); static struct configfs_attribute *lio_target_tpg_attrib_attrs[] = { &iscsi_tpg_attrib_authentication.attr, @@ -1065,6 +1067,7 @@ static struct configfs_attribute *lio_target_tpg_attrib_attrs[] = { &iscsi_tpg_attrib_demo_mode_discovery.attr, &iscsi_tpg_attrib_default_erl.attr, &iscsi_tpg_attrib_t10_pi.attr, + &iscsi_tpg_attrib_fabric_prot_type.attr, NULL, }; @@ -1410,8 +1413,18 @@ out: TF_TPG_BASE_ATTR(lio_target, enable, S_IRUGO | S_IWUSR); +static ssize_t lio_target_tpg_show_dynamic_sessions( + struct se_portal_group *se_tpg, + char *page) +{ + return target_show_dynamic_sessions(se_tpg, page); +} + +TF_TPG_BASE_ATTR_RO(lio_target, dynamic_sessions); + static struct configfs_attribute 
*lio_target_tpg_attrs[] = { &lio_target_tpg_enable.attr, + &lio_target_tpg_dynamic_sessions.attr, NULL, }; @@ -1450,10 +1463,8 @@ static struct se_portal_group *lio_target_tiqn_addtpg( if (!tpg) return NULL; - ret = core_tpg_register( - &lio_target_fabric_configfs->tf_ops, - wwn, &tpg->tpg_se_tpg, tpg, - TRANSPORT_TPG_TYPE_NORMAL); + ret = core_tpg_register(&iscsi_ops, wwn, &tpg->tpg_se_tpg, + tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) return NULL; @@ -1872,6 +1883,20 @@ static int lio_tpg_check_prod_mode_write_protect( return tpg->tpg_attrib.prod_mode_write_protect; } +static int lio_tpg_check_prot_fabric_only( + struct se_portal_group *se_tpg) +{ + struct iscsi_portal_group *tpg = se_tpg->se_tpg_fabric_ptr; + /* + * Only report fabric_prot_type if t10_pi has also been enabled + * for incoming ib_isert sessions. + */ + if (!tpg->tpg_attrib.t10_pi) + return 0; + + return tpg->tpg_attrib.fabric_prot_type; +} + static void lio_tpg_release_fabric_acl( struct se_portal_group *se_tpg, struct se_node_acl *se_acl) @@ -1953,115 +1978,60 @@ static void lio_release_cmd(struct se_cmd *se_cmd) iscsit_release_cmd(cmd); } -/* End functions for target_core_fabric_ops */ - -int iscsi_target_register_configfs(void) -{ - struct target_fabric_configfs *fabric; - int ret; - - lio_target_fabric_configfs = NULL; - fabric = target_fabric_configfs_init(THIS_MODULE, "iscsi"); - if (IS_ERR(fabric)) { - pr_err("target_fabric_configfs_init() for" - " LIO-Target failed!\n"); - return PTR_ERR(fabric); - } - /* - * Setup the fabric API of function pointers used by target_core_mod.. - */ - fabric->tf_ops.get_fabric_name = &iscsi_get_fabric_name; - fabric->tf_ops.get_fabric_proto_ident = &iscsi_get_fabric_proto_ident; - fabric->tf_ops.tpg_get_wwn = &lio_tpg_get_endpoint_wwn; - fabric->tf_ops.tpg_get_tag = &lio_tpg_get_tag; - fabric->tf_ops.tpg_get_default_depth = &lio_tpg_get_default_depth; - fabric->tf_ops.tpg_get_pr_transport_id = &iscsi_get_pr_transport_id; - fabric->tf_ops.tpg_get_pr_transport_id_len = - &iscsi_get_pr_transport_id_len; - fabric->tf_ops.tpg_parse_pr_out_transport_id = - &iscsi_parse_pr_out_transport_id; - fabric->tf_ops.tpg_check_demo_mode = &lio_tpg_check_demo_mode; - fabric->tf_ops.tpg_check_demo_mode_cache = - &lio_tpg_check_demo_mode_cache; - fabric->tf_ops.tpg_check_demo_mode_write_protect = - &lio_tpg_check_demo_mode_write_protect; - fabric->tf_ops.tpg_check_prod_mode_write_protect = - &lio_tpg_check_prod_mode_write_protect; - fabric->tf_ops.tpg_alloc_fabric_acl = &lio_tpg_alloc_fabric_acl; - fabric->tf_ops.tpg_release_fabric_acl = &lio_tpg_release_fabric_acl; - fabric->tf_ops.tpg_get_inst_index = &lio_tpg_get_inst_index; - fabric->tf_ops.check_stop_free = &lio_check_stop_free, - fabric->tf_ops.release_cmd = &lio_release_cmd; - fabric->tf_ops.shutdown_session = &lio_tpg_shutdown_session; - fabric->tf_ops.close_session = &lio_tpg_close_session; - fabric->tf_ops.sess_get_index = &lio_sess_get_index; - fabric->tf_ops.sess_get_initiator_sid = &lio_sess_get_initiator_sid; - fabric->tf_ops.write_pending = &lio_write_pending; - fabric->tf_ops.write_pending_status = &lio_write_pending_status; - fabric->tf_ops.set_default_node_attributes = - &lio_set_default_node_attributes; - fabric->tf_ops.get_task_tag = &iscsi_get_task_tag; - fabric->tf_ops.get_cmd_state = &iscsi_get_cmd_state; - fabric->tf_ops.queue_data_in = &lio_queue_data_in; - fabric->tf_ops.queue_status = &lio_queue_status; - fabric->tf_ops.queue_tm_rsp = &lio_queue_tm_rsp; - fabric->tf_ops.aborted_task = &lio_aborted_task; - /* - * Setup 
function pointers for generic logic in target_core_fabric_configfs.c - */ - fabric->tf_ops.fabric_make_wwn = &lio_target_call_coreaddtiqn; - fabric->tf_ops.fabric_drop_wwn = &lio_target_call_coredeltiqn; - fabric->tf_ops.fabric_make_tpg = &lio_target_tiqn_addtpg; - fabric->tf_ops.fabric_drop_tpg = &lio_target_tiqn_deltpg; - fabric->tf_ops.fabric_post_link = NULL; - fabric->tf_ops.fabric_pre_unlink = NULL; - fabric->tf_ops.fabric_make_np = &lio_target_call_addnptotpg; - fabric->tf_ops.fabric_drop_np = &lio_target_call_delnpfromtpg; - fabric->tf_ops.fabric_make_nodeacl = &lio_target_make_nodeacl; - fabric->tf_ops.fabric_drop_nodeacl = &lio_target_drop_nodeacl; - /* - * Setup default attribute lists for various fabric->tf_cit_tmpl - * sturct config_item_type's - */ - fabric->tf_cit_tmpl.tfc_discovery_cit.ct_attrs = lio_target_discovery_auth_attrs; - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = lio_target_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = lio_target_tpg_attrs; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = lio_target_tpg_attrib_attrs; - fabric->tf_cit_tmpl.tfc_tpg_auth_cit.ct_attrs = lio_target_tpg_auth_attrs; - fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = lio_target_tpg_param_attrs; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = lio_target_portal_attrs; - fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = lio_target_initiator_attrs; - fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = lio_target_nacl_attrib_attrs; - fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = lio_target_nacl_auth_attrs; - fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = lio_target_nacl_param_attrs; - - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - pr_err("target_fabric_configfs_register() for" - " LIO-Target failed!\n"); - target_fabric_configfs_free(fabric); - return ret; - } - - lio_target_fabric_configfs = fabric; - pr_debug("LIO_TARGET[0] - Set fabric ->" - " lio_target_fabric_configfs\n"); - return 0; -} - - -void iscsi_target_deregister_configfs(void) -{ - if (!lio_target_fabric_configfs) - return; - /* - * Shutdown discovery sessions and disable discovery TPG - */ - if (iscsit_global->discovery_tpg) - iscsit_tpg_disable_portal_group(iscsit_global->discovery_tpg, 1); - - target_fabric_configfs_deregister(lio_target_fabric_configfs); - lio_target_fabric_configfs = NULL; - pr_debug("LIO_TARGET[0] - Cleared" - " lio_target_fabric_configfs\n"); -} +const struct target_core_fabric_ops iscsi_ops = { + .module = THIS_MODULE, + .name = "iscsi", + .get_fabric_name = iscsi_get_fabric_name, + .get_fabric_proto_ident = iscsi_get_fabric_proto_ident, + .tpg_get_wwn = lio_tpg_get_endpoint_wwn, + .tpg_get_tag = lio_tpg_get_tag, + .tpg_get_default_depth = lio_tpg_get_default_depth, + .tpg_get_pr_transport_id = iscsi_get_pr_transport_id, + .tpg_get_pr_transport_id_len = iscsi_get_pr_transport_id_len, + .tpg_parse_pr_out_transport_id = iscsi_parse_pr_out_transport_id, + .tpg_check_demo_mode = lio_tpg_check_demo_mode, + .tpg_check_demo_mode_cache = lio_tpg_check_demo_mode_cache, + .tpg_check_demo_mode_write_protect = + lio_tpg_check_demo_mode_write_protect, + .tpg_check_prod_mode_write_protect = + lio_tpg_check_prod_mode_write_protect, + .tpg_check_prot_fabric_only = &lio_tpg_check_prot_fabric_only, + .tpg_alloc_fabric_acl = lio_tpg_alloc_fabric_acl, + .tpg_release_fabric_acl = lio_tpg_release_fabric_acl, + .tpg_get_inst_index = lio_tpg_get_inst_index, + .check_stop_free = lio_check_stop_free, + .release_cmd = lio_release_cmd, + .shutdown_session = 
lio_tpg_shutdown_session, + .close_session = lio_tpg_close_session, + .sess_get_index = lio_sess_get_index, + .sess_get_initiator_sid = lio_sess_get_initiator_sid, + .write_pending = lio_write_pending, + .write_pending_status = lio_write_pending_status, + .set_default_node_attributes = lio_set_default_node_attributes, + .get_task_tag = iscsi_get_task_tag, + .get_cmd_state = iscsi_get_cmd_state, + .queue_data_in = lio_queue_data_in, + .queue_status = lio_queue_status, + .queue_tm_rsp = lio_queue_tm_rsp, + .aborted_task = lio_aborted_task, + .fabric_make_wwn = lio_target_call_coreaddtiqn, + .fabric_drop_wwn = lio_target_call_coredeltiqn, + .fabric_make_tpg = lio_target_tiqn_addtpg, + .fabric_drop_tpg = lio_target_tiqn_deltpg, + .fabric_make_np = lio_target_call_addnptotpg, + .fabric_drop_np = lio_target_call_delnpfromtpg, + .fabric_make_nodeacl = lio_target_make_nodeacl, + .fabric_drop_nodeacl = lio_target_drop_nodeacl, + + .tfc_discovery_attrs = lio_target_discovery_auth_attrs, + .tfc_wwn_attrs = lio_target_wwn_attrs, + .tfc_tpg_base_attrs = lio_target_tpg_attrs, + .tfc_tpg_attrib_attrs = lio_target_tpg_attrib_attrs, + .tfc_tpg_auth_attrs = lio_target_tpg_auth_attrs, + .tfc_tpg_param_attrs = lio_target_tpg_param_attrs, + .tfc_tpg_np_base_attrs = lio_target_portal_attrs, + .tfc_tpg_nacl_base_attrs = lio_target_initiator_attrs, + .tfc_tpg_nacl_attrib_attrs = lio_target_nacl_attrib_attrs, + .tfc_tpg_nacl_auth_attrs = lio_target_nacl_auth_attrs, + .tfc_tpg_nacl_param_attrs = lio_target_nacl_param_attrs, +}; diff --git a/drivers/target/iscsi/iscsi_target_configfs.h b/drivers/target/iscsi/iscsi_target_configfs.h deleted file mode 100644 index 8cd5a63c4edc..000000000000 --- a/drivers/target/iscsi/iscsi_target_configfs.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef ISCSI_TARGET_CONFIGFS_H -#define ISCSI_TARGET_CONFIGFS_H - -extern int iscsi_target_register_configfs(void); -extern void iscsi_target_deregister_configfs(void); - -#endif /* ISCSI_TARGET_CONFIGFS_H */ diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index bdd8731a4daa..959a14c9dd5d 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -23,7 +23,6 @@ #include <target/iscsi/iscsi_target_core.h> #include "iscsi_target_seq_pdu_list.h" -#include "iscsi_target_tq.h" #include "iscsi_target_erl0.h" #include "iscsi_target_erl1.h" #include "iscsi_target_erl2.h" @@ -860,7 +859,10 @@ void iscsit_connection_reinstatement_rcfr(struct iscsi_conn *conn) } spin_unlock_bh(&conn->state_lock); - iscsi_thread_set_force_reinstatement(conn); + if (conn->tx_thread && conn->tx_thread_active) + send_sig(SIGINT, conn->tx_thread, 1); + if (conn->rx_thread && conn->rx_thread_active) + send_sig(SIGINT, conn->rx_thread, 1); sleep: wait_for_completion(&conn->conn_wait_rcfr_comp); @@ -885,10 +887,10 @@ void iscsit_cause_connection_reinstatement(struct iscsi_conn *conn, int sleep) return; } - if (iscsi_thread_set_force_reinstatement(conn) < 0) { - spin_unlock_bh(&conn->state_lock); - return; - } + if (conn->tx_thread && conn->tx_thread_active) + send_sig(SIGINT, conn->tx_thread, 1); + if (conn->rx_thread && conn->rx_thread_active) + send_sig(SIGINT, conn->rx_thread, 1); atomic_set(&conn->connection_reinstatement, 1); if (!sleep) { diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 153fb66ac1b8..8ce94ff744e6 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -26,7 +26,6 @@ 
#include <target/iscsi/iscsi_target_core.h> #include <target/iscsi/iscsi_target_stat.h> -#include "iscsi_target_tq.h" #include "iscsi_target_device.h" #include "iscsi_target_nego.h" #include "iscsi_target_erl0.h" @@ -699,6 +698,51 @@ static void iscsi_post_login_start_timers(struct iscsi_conn *conn) iscsit_start_nopin_timer(conn); } +static int iscsit_start_kthreads(struct iscsi_conn *conn) +{ + int ret = 0; + + spin_lock(&iscsit_global->ts_bitmap_lock); + conn->bitmap_id = bitmap_find_free_region(iscsit_global->ts_bitmap, + ISCSIT_BITMAP_BITS, get_order(1)); + spin_unlock(&iscsit_global->ts_bitmap_lock); + + if (conn->bitmap_id < 0) { + pr_err("bitmap_find_free_region() failed for" + " iscsit_start_kthreads()\n"); + return -ENOMEM; + } + + conn->tx_thread = kthread_run(iscsi_target_tx_thread, conn, + "%s", ISCSI_TX_THREAD_NAME); + if (IS_ERR(conn->tx_thread)) { + pr_err("Unable to start iscsi_target_tx_thread\n"); + ret = PTR_ERR(conn->tx_thread); + goto out_bitmap; + } + conn->tx_thread_active = true; + + conn->rx_thread = kthread_run(iscsi_target_rx_thread, conn, + "%s", ISCSI_RX_THREAD_NAME); + if (IS_ERR(conn->rx_thread)) { + pr_err("Unable to start iscsi_target_rx_thread\n"); + ret = PTR_ERR(conn->rx_thread); + goto out_tx; + } + conn->rx_thread_active = true; + + return 0; +out_tx: + kthread_stop(conn->tx_thread); + conn->tx_thread_active = false; +out_bitmap: + spin_lock(&iscsit_global->ts_bitmap_lock); + bitmap_release_region(iscsit_global->ts_bitmap, conn->bitmap_id, + get_order(1)); + spin_unlock(&iscsit_global->ts_bitmap_lock); + return ret; +} + int iscsi_post_login_handler( struct iscsi_np *np, struct iscsi_conn *conn, @@ -709,7 +753,7 @@ int iscsi_post_login_handler( struct se_session *se_sess = sess->se_sess; struct iscsi_portal_group *tpg = sess->tpg; struct se_portal_group *se_tpg = &tpg->tpg_se_tpg; - struct iscsi_thread_set *ts; + int rc; iscsit_inc_conn_usage_count(conn); @@ -724,7 +768,6 @@ int iscsi_post_login_handler( /* * SCSI Initiator -> SCSI Target Port Mapping */ - ts = iscsi_get_thread_set(); if (!zero_tsih) { iscsi_set_session_parameters(sess->sess_ops, conn->param_list, 0); @@ -751,9 +794,11 @@ int iscsi_post_login_handler( sess->sess_ops->InitiatorName); spin_unlock_bh(&sess->conn_lock); - iscsi_post_login_start_timers(conn); + rc = iscsit_start_kthreads(conn); + if (rc) + return rc; - iscsi_activate_thread_set(conn, ts); + iscsi_post_login_start_timers(conn); /* * Determine CPU mask to ensure connection's RX and TX kthreads * are scheduled on the same CPU. @@ -810,8 +855,11 @@ int iscsi_post_login_handler( " iSCSI Target Portal Group: %hu\n", tpg->nsessions, tpg->tpgt); spin_unlock_bh(&se_tpg->session_lock); + rc = iscsit_start_kthreads(conn); + if (rc) + return rc; + iscsi_post_login_start_timers(conn); - iscsi_activate_thread_set(conn, ts); /* * Determine CPU mask to ensure connection's RX and TX kthreads * are scheduled on the same CPU. 
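The login-path change above replaces the old shared thread-set pool with a dedicated RX/TX kthread pair spawned per connection at login time: iscsit_start_kthreads() first reserves a bit from iscsit_global->ts_bitmap, giving the connection a stable bitmap_id that iscsit_thread_get_cpumask() later uses for CPU placement, then starts the two kthreads with full unwinding on failure. A minimal standalone sketch of that reserve-then-spawn-with-unwind pattern follows; the demo_* names and DEMO_BITMAP_BITS are illustrative stand-ins for this sketch, not the driver's real structures.

#include <linux/bitmap.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

#define DEMO_BITMAP_BITS	32768	/* one bit per live connection */

static DEFINE_SPINLOCK(demo_bitmap_lock);
static unsigned long *demo_bitmap;	/* assumed vzalloc'd at module init */

struct demo_conn {
	int bitmap_id;
	bool rx_thread_active, tx_thread_active;
	struct task_struct *rx_thread, *tx_thread;
};

static int demo_thread(void *arg)
{
	/* Stand-in for the real RX/TX loops: park until kthread_stop(). */
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int demo_start_kthreads(struct demo_conn *conn)
{
	int ret;

	/* Reserve a single bit (get_order(1) == 0) as this connection's id. */
	spin_lock(&demo_bitmap_lock);
	conn->bitmap_id = bitmap_find_free_region(demo_bitmap,
						  DEMO_BITMAP_BITS,
						  get_order(1));
	spin_unlock(&demo_bitmap_lock);
	if (conn->bitmap_id < 0)
		return -ENOMEM;

	conn->tx_thread = kthread_run(demo_thread, conn, "demo_ttx");
	if (IS_ERR(conn->tx_thread)) {
		ret = PTR_ERR(conn->tx_thread);
		goto out_bitmap;
	}
	conn->tx_thread_active = true;

	conn->rx_thread = kthread_run(demo_thread, conn, "demo_trx");
	if (IS_ERR(conn->rx_thread)) {
		ret = PTR_ERR(conn->rx_thread);
		goto out_tx;
	}
	conn->rx_thread_active = true;

	return 0;

out_tx:
	/* Undo in reverse order: stop the TX thread, then free the id bit. */
	kthread_stop(conn->tx_thread);
	conn->tx_thread_active = false;
out_bitmap:
	spin_lock(&demo_bitmap_lock);
	bitmap_release_region(demo_bitmap, conn->bitmap_id, get_order(1));
	spin_unlock(&demo_bitmap_lock);
	return ret;
}

Teardown mirrors this in iscsit_close_connection() above: whichever thread runs the close path uses cmpxchg() on the peer's *_thread_active flag so only one caller sends SIGINT and kthread_stop()s it, after which the bitmap bit is released under the same lock.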
diff --git a/drivers/target/iscsi/iscsi_target_tpg.c b/drivers/target/iscsi/iscsi_target_tpg.c index bdd127c0e3ae..e8a240818353 100644 --- a/drivers/target/iscsi/iscsi_target_tpg.c +++ b/drivers/target/iscsi/iscsi_target_tpg.c @@ -68,10 +68,8 @@ int iscsit_load_discovery_tpg(void) return -1; } - ret = core_tpg_register( - &lio_target_fabric_configfs->tf_ops, - NULL, &tpg->tpg_se_tpg, tpg, - TRANSPORT_TPG_TYPE_DISCOVERY); + ret = core_tpg_register(&iscsi_ops, NULL, &tpg->tpg_se_tpg, + tpg, TRANSPORT_TPG_TYPE_DISCOVERY); if (ret < 0) { kfree(tpg); return -1; @@ -228,6 +226,7 @@ static void iscsit_set_default_tpg_attribs(struct iscsi_portal_group *tpg) a->demo_mode_discovery = TA_DEMO_MODE_DISCOVERY; a->default_erl = TA_DEFAULT_ERL; a->t10_pi = TA_DEFAULT_T10_PI; + a->fabric_prot_type = TA_DEFAULT_FABRIC_PROT_TYPE; } int iscsit_tpg_add_portal_group(struct iscsi_tiqn *tiqn, struct iscsi_portal_group *tpg) @@ -878,3 +877,21 @@ int iscsit_ta_t10_pi( return 0; } + +int iscsit_ta_fabric_prot_type( + struct iscsi_portal_group *tpg, + u32 prot_type) +{ + struct iscsi_tpg_attrib *a = &tpg->tpg_attrib; + + if ((prot_type != 0) && (prot_type != 1) && (prot_type != 3)) { + pr_err("Illegal value for fabric_prot_type: %u\n", prot_type); + return -EINVAL; + } + + a->fabric_prot_type = prot_type; + pr_debug("iSCSI_TPG[%hu] - T10 Fabric Protection Type: %u\n", + tpg->tpgt, prot_type); + + return 0; +} diff --git a/drivers/target/iscsi/iscsi_target_tpg.h b/drivers/target/iscsi/iscsi_target_tpg.h index e7265337bc43..95ff5bdecd71 100644 --- a/drivers/target/iscsi/iscsi_target_tpg.h +++ b/drivers/target/iscsi/iscsi_target_tpg.h @@ -39,5 +39,6 @@ extern int iscsit_ta_prod_mode_write_protect(struct iscsi_portal_group *, u32); extern int iscsit_ta_demo_mode_discovery(struct iscsi_portal_group *, u32); extern int iscsit_ta_default_erl(struct iscsi_portal_group *, u32); extern int iscsit_ta_t10_pi(struct iscsi_portal_group *, u32); +extern int iscsit_ta_fabric_prot_type(struct iscsi_portal_group *, u32); #endif /* ISCSI_TARGET_TPG_H */ diff --git a/drivers/target/iscsi/iscsi_target_tq.c b/drivers/target/iscsi/iscsi_target_tq.c deleted file mode 100644 index 26aa50996473..000000000000 --- a/drivers/target/iscsi/iscsi_target_tq.c +++ /dev/null @@ -1,495 +0,0 @@ -/******************************************************************************* - * This file contains the iSCSI Login Thread and Thread Queue functions. - * - * (c) Copyright 2007-2013 Datera, Inc. - * - * Author: Nicholas A. Bellinger <nab@linux-iscsi.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- ******************************************************************************/ - -#include <linux/kthread.h> -#include <linux/list.h> -#include <linux/bitmap.h> - -#include <target/iscsi/iscsi_target_core.h> -#include "iscsi_target_tq.h" -#include "iscsi_target.h" - -static LIST_HEAD(inactive_ts_list); -static DEFINE_SPINLOCK(inactive_ts_lock); -static DEFINE_SPINLOCK(ts_bitmap_lock); - -static void iscsi_add_ts_to_inactive_list(struct iscsi_thread_set *ts) -{ - if (!list_empty(&ts->ts_list)) { - WARN_ON(1); - return; - } - spin_lock(&inactive_ts_lock); - list_add_tail(&ts->ts_list, &inactive_ts_list); - iscsit_global->inactive_ts++; - spin_unlock(&inactive_ts_lock); -} - -static struct iscsi_thread_set *iscsi_get_ts_from_inactive_list(void) -{ - struct iscsi_thread_set *ts; - - spin_lock(&inactive_ts_lock); - if (list_empty(&inactive_ts_list)) { - spin_unlock(&inactive_ts_lock); - return NULL; - } - - ts = list_first_entry(&inactive_ts_list, struct iscsi_thread_set, ts_list); - - list_del_init(&ts->ts_list); - iscsit_global->inactive_ts--; - spin_unlock(&inactive_ts_lock); - - return ts; -} - -int iscsi_allocate_thread_sets(u32 thread_pair_count) -{ - int allocated_thread_pair_count = 0, i, thread_id; - struct iscsi_thread_set *ts = NULL; - - for (i = 0; i < thread_pair_count; i++) { - ts = kzalloc(sizeof(struct iscsi_thread_set), GFP_KERNEL); - if (!ts) { - pr_err("Unable to allocate memory for" - " thread set.\n"); - return allocated_thread_pair_count; - } - /* - * Locate the next available regision in the thread_set_bitmap - */ - spin_lock(&ts_bitmap_lock); - thread_id = bitmap_find_free_region(iscsit_global->ts_bitmap, - iscsit_global->ts_bitmap_count, get_order(1)); - spin_unlock(&ts_bitmap_lock); - if (thread_id < 0) { - pr_err("bitmap_find_free_region() failed for" - " thread_set_bitmap\n"); - kfree(ts); - return allocated_thread_pair_count; - } - - ts->thread_id = thread_id; - ts->status = ISCSI_THREAD_SET_FREE; - INIT_LIST_HEAD(&ts->ts_list); - spin_lock_init(&ts->ts_state_lock); - init_completion(&ts->rx_restart_comp); - init_completion(&ts->tx_restart_comp); - init_completion(&ts->rx_start_comp); - init_completion(&ts->tx_start_comp); - sema_init(&ts->ts_activate_sem, 0); - - ts->create_threads = 1; - ts->tx_thread = kthread_run(iscsi_target_tx_thread, ts, "%s", - ISCSI_TX_THREAD_NAME); - if (IS_ERR(ts->tx_thread)) { - dump_stack(); - pr_err("Unable to start iscsi_target_tx_thread\n"); - break; - } - - ts->rx_thread = kthread_run(iscsi_target_rx_thread, ts, "%s", - ISCSI_RX_THREAD_NAME); - if (IS_ERR(ts->rx_thread)) { - kthread_stop(ts->tx_thread); - pr_err("Unable to start iscsi_target_rx_thread\n"); - break; - } - ts->create_threads = 0; - - iscsi_add_ts_to_inactive_list(ts); - allocated_thread_pair_count++; - } - - pr_debug("Spawned %d thread set(s) (%d total threads).\n", - allocated_thread_pair_count, allocated_thread_pair_count * 2); - return allocated_thread_pair_count; -} - -static void iscsi_deallocate_thread_one(struct iscsi_thread_set *ts) -{ - spin_lock_bh(&ts->ts_state_lock); - ts->status = ISCSI_THREAD_SET_DIE; - - if (ts->rx_thread) { - complete(&ts->rx_start_comp); - spin_unlock_bh(&ts->ts_state_lock); - kthread_stop(ts->rx_thread); - spin_lock_bh(&ts->ts_state_lock); - } - if (ts->tx_thread) { - complete(&ts->tx_start_comp); - spin_unlock_bh(&ts->ts_state_lock); - kthread_stop(ts->tx_thread); - spin_lock_bh(&ts->ts_state_lock); - } - spin_unlock_bh(&ts->ts_state_lock); - /* - * Release this thread_id in the thread_set_bitmap - */ - 
spin_lock(&ts_bitmap_lock); - bitmap_release_region(iscsit_global->ts_bitmap, - ts->thread_id, get_order(1)); - spin_unlock(&ts_bitmap_lock); - - kfree(ts); -} - -void iscsi_deallocate_thread_sets(void) -{ - struct iscsi_thread_set *ts = NULL; - u32 released_count = 0; - - while ((ts = iscsi_get_ts_from_inactive_list())) { - - iscsi_deallocate_thread_one(ts); - released_count++; - } - - if (released_count) - pr_debug("Stopped %d thread set(s) (%d total threads)." - "\n", released_count, released_count * 2); -} - -static void iscsi_deallocate_extra_thread_sets(void) -{ - u32 orig_count, released_count = 0; - struct iscsi_thread_set *ts = NULL; - - orig_count = TARGET_THREAD_SET_COUNT; - - while ((iscsit_global->inactive_ts + 1) > orig_count) { - ts = iscsi_get_ts_from_inactive_list(); - if (!ts) - break; - - iscsi_deallocate_thread_one(ts); - released_count++; - } - - if (released_count) - pr_debug("Stopped %d thread set(s) (%d total threads)." - "\n", released_count, released_count * 2); -} - -void iscsi_activate_thread_set(struct iscsi_conn *conn, struct iscsi_thread_set *ts) -{ - spin_lock_bh(&ts->ts_state_lock); - conn->thread_set = ts; - ts->conn = conn; - ts->status = ISCSI_THREAD_SET_ACTIVE; - spin_unlock_bh(&ts->ts_state_lock); - - complete(&ts->rx_start_comp); - complete(&ts->tx_start_comp); - - down(&ts->ts_activate_sem); -} - -struct iscsi_thread_set *iscsi_get_thread_set(void) -{ - struct iscsi_thread_set *ts; - -get_set: - ts = iscsi_get_ts_from_inactive_list(); - if (!ts) { - iscsi_allocate_thread_sets(1); - goto get_set; - } - - ts->delay_inactive = 1; - ts->signal_sent = 0; - ts->thread_count = 2; - init_completion(&ts->rx_restart_comp); - init_completion(&ts->tx_restart_comp); - sema_init(&ts->ts_activate_sem, 0); - - return ts; -} - -void iscsi_set_thread_clear(struct iscsi_conn *conn, u8 thread_clear) -{ - struct iscsi_thread_set *ts = NULL; - - if (!conn->thread_set) { - pr_err("struct iscsi_conn->thread_set is NULL\n"); - return; - } - ts = conn->thread_set; - - spin_lock_bh(&ts->ts_state_lock); - ts->thread_clear &= ~thread_clear; - - if ((thread_clear & ISCSI_CLEAR_RX_THREAD) && - (ts->blocked_threads & ISCSI_BLOCK_RX_THREAD)) - complete(&ts->rx_restart_comp); - else if ((thread_clear & ISCSI_CLEAR_TX_THREAD) && - (ts->blocked_threads & ISCSI_BLOCK_TX_THREAD)) - complete(&ts->tx_restart_comp); - spin_unlock_bh(&ts->ts_state_lock); -} - -void iscsi_set_thread_set_signal(struct iscsi_conn *conn, u8 signal_sent) -{ - struct iscsi_thread_set *ts = NULL; - - if (!conn->thread_set) { - pr_err("struct iscsi_conn->thread_set is NULL\n"); - return; - } - ts = conn->thread_set; - - spin_lock_bh(&ts->ts_state_lock); - ts->signal_sent |= signal_sent; - spin_unlock_bh(&ts->ts_state_lock); -} - -int iscsi_release_thread_set(struct iscsi_conn *conn) -{ - int thread_called = 0; - struct iscsi_thread_set *ts = NULL; - - if (!conn || !conn->thread_set) { - pr_err("connection or thread set pointer is NULL\n"); - BUG(); - } - ts = conn->thread_set; - - spin_lock_bh(&ts->ts_state_lock); - ts->status = ISCSI_THREAD_SET_RESET; - - if (!strncmp(current->comm, ISCSI_RX_THREAD_NAME, - strlen(ISCSI_RX_THREAD_NAME))) - thread_called = ISCSI_RX_THREAD; - else if (!strncmp(current->comm, ISCSI_TX_THREAD_NAME, - strlen(ISCSI_TX_THREAD_NAME))) - thread_called = ISCSI_TX_THREAD; - - if (ts->rx_thread && (thread_called == ISCSI_TX_THREAD) && - (ts->thread_clear & ISCSI_CLEAR_RX_THREAD)) { - - if (!(ts->signal_sent & ISCSI_SIGNAL_RX_THREAD)) { - send_sig(SIGINT, ts->rx_thread, 1); - ts->signal_sent |= 
ISCSI_SIGNAL_RX_THREAD; - } - ts->blocked_threads |= ISCSI_BLOCK_RX_THREAD; - spin_unlock_bh(&ts->ts_state_lock); - wait_for_completion(&ts->rx_restart_comp); - spin_lock_bh(&ts->ts_state_lock); - ts->blocked_threads &= ~ISCSI_BLOCK_RX_THREAD; - } - if (ts->tx_thread && (thread_called == ISCSI_RX_THREAD) && - (ts->thread_clear & ISCSI_CLEAR_TX_THREAD)) { - - if (!(ts->signal_sent & ISCSI_SIGNAL_TX_THREAD)) { - send_sig(SIGINT, ts->tx_thread, 1); - ts->signal_sent |= ISCSI_SIGNAL_TX_THREAD; - } - ts->blocked_threads |= ISCSI_BLOCK_TX_THREAD; - spin_unlock_bh(&ts->ts_state_lock); - wait_for_completion(&ts->tx_restart_comp); - spin_lock_bh(&ts->ts_state_lock); - ts->blocked_threads &= ~ISCSI_BLOCK_TX_THREAD; - } - - ts->conn = NULL; - ts->status = ISCSI_THREAD_SET_FREE; - spin_unlock_bh(&ts->ts_state_lock); - - return 0; -} - -int iscsi_thread_set_force_reinstatement(struct iscsi_conn *conn) -{ - struct iscsi_thread_set *ts; - - if (!conn->thread_set) - return -1; - ts = conn->thread_set; - - spin_lock_bh(&ts->ts_state_lock); - if (ts->status != ISCSI_THREAD_SET_ACTIVE) { - spin_unlock_bh(&ts->ts_state_lock); - return -1; - } - - if (ts->tx_thread && (!(ts->signal_sent & ISCSI_SIGNAL_TX_THREAD))) { - send_sig(SIGINT, ts->tx_thread, 1); - ts->signal_sent |= ISCSI_SIGNAL_TX_THREAD; - } - if (ts->rx_thread && (!(ts->signal_sent & ISCSI_SIGNAL_RX_THREAD))) { - send_sig(SIGINT, ts->rx_thread, 1); - ts->signal_sent |= ISCSI_SIGNAL_RX_THREAD; - } - spin_unlock_bh(&ts->ts_state_lock); - - return 0; -} - -static void iscsi_check_to_add_additional_sets(void) -{ - int thread_sets_add; - - spin_lock(&inactive_ts_lock); - thread_sets_add = iscsit_global->inactive_ts; - spin_unlock(&inactive_ts_lock); - if (thread_sets_add == 1) - iscsi_allocate_thread_sets(1); -} - -static int iscsi_signal_thread_pre_handler(struct iscsi_thread_set *ts) -{ - spin_lock_bh(&ts->ts_state_lock); - if (ts->status == ISCSI_THREAD_SET_DIE || kthread_should_stop() || - signal_pending(current)) { - spin_unlock_bh(&ts->ts_state_lock); - return -1; - } - spin_unlock_bh(&ts->ts_state_lock); - - return 0; -} - -struct iscsi_conn *iscsi_rx_thread_pre_handler(struct iscsi_thread_set *ts) -{ - int ret; - - spin_lock_bh(&ts->ts_state_lock); - if (ts->create_threads) { - spin_unlock_bh(&ts->ts_state_lock); - goto sleep; - } - - if (ts->status != ISCSI_THREAD_SET_DIE) - flush_signals(current); - - if (ts->delay_inactive && (--ts->thread_count == 0)) { - spin_unlock_bh(&ts->ts_state_lock); - - if (!iscsit_global->in_shutdown) - iscsi_deallocate_extra_thread_sets(); - - iscsi_add_ts_to_inactive_list(ts); - spin_lock_bh(&ts->ts_state_lock); - } - - if ((ts->status == ISCSI_THREAD_SET_RESET) && - (ts->thread_clear & ISCSI_CLEAR_RX_THREAD)) - complete(&ts->rx_restart_comp); - - ts->thread_clear &= ~ISCSI_CLEAR_RX_THREAD; - spin_unlock_bh(&ts->ts_state_lock); -sleep: - ret = wait_for_completion_interruptible(&ts->rx_start_comp); - if (ret != 0) - return NULL; - - if (iscsi_signal_thread_pre_handler(ts) < 0) - return NULL; - - iscsi_check_to_add_additional_sets(); - - spin_lock_bh(&ts->ts_state_lock); - if (!ts->conn) { - pr_err("struct iscsi_thread_set->conn is NULL for" - " RX thread_id: %s/%d\n", current->comm, current->pid); - spin_unlock_bh(&ts->ts_state_lock); - return NULL; - } - ts->thread_clear |= ISCSI_CLEAR_RX_THREAD; - spin_unlock_bh(&ts->ts_state_lock); - - up(&ts->ts_activate_sem); - - return ts->conn; -} - -struct iscsi_conn *iscsi_tx_thread_pre_handler(struct iscsi_thread_set *ts) -{ - int ret; - - spin_lock_bh(&ts->ts_state_lock); 
- if (ts->create_threads) { - spin_unlock_bh(&ts->ts_state_lock); - goto sleep; - } - - if (ts->status != ISCSI_THREAD_SET_DIE) - flush_signals(current); - - if (ts->delay_inactive && (--ts->thread_count == 0)) { - spin_unlock_bh(&ts->ts_state_lock); - - if (!iscsit_global->in_shutdown) - iscsi_deallocate_extra_thread_sets(); - - iscsi_add_ts_to_inactive_list(ts); - spin_lock_bh(&ts->ts_state_lock); - } - if ((ts->status == ISCSI_THREAD_SET_RESET) && - (ts->thread_clear & ISCSI_CLEAR_TX_THREAD)) - complete(&ts->tx_restart_comp); - - ts->thread_clear &= ~ISCSI_CLEAR_TX_THREAD; - spin_unlock_bh(&ts->ts_state_lock); -sleep: - ret = wait_for_completion_interruptible(&ts->tx_start_comp); - if (ret != 0) - return NULL; - - if (iscsi_signal_thread_pre_handler(ts) < 0) - return NULL; - - iscsi_check_to_add_additional_sets(); - - spin_lock_bh(&ts->ts_state_lock); - if (!ts->conn) { - pr_err("struct iscsi_thread_set->conn is NULL for" - " TX thread_id: %s/%d\n", current->comm, current->pid); - spin_unlock_bh(&ts->ts_state_lock); - return NULL; - } - ts->thread_clear |= ISCSI_CLEAR_TX_THREAD; - spin_unlock_bh(&ts->ts_state_lock); - - up(&ts->ts_activate_sem); - - return ts->conn; -} - -int iscsi_thread_set_init(void) -{ - int size; - - iscsit_global->ts_bitmap_count = ISCSI_TS_BITMAP_BITS; - - size = BITS_TO_LONGS(iscsit_global->ts_bitmap_count) * sizeof(long); - iscsit_global->ts_bitmap = kzalloc(size, GFP_KERNEL); - if (!iscsit_global->ts_bitmap) { - pr_err("Unable to allocate iscsit_global->ts_bitmap\n"); - return -ENOMEM; - } - - return 0; -} - -void iscsi_thread_set_free(void) -{ - kfree(iscsit_global->ts_bitmap); -} diff --git a/drivers/target/iscsi/iscsi_target_tq.h b/drivers/target/iscsi/iscsi_target_tq.h deleted file mode 100644 index cc1eede5ab3a..000000000000 --- a/drivers/target/iscsi/iscsi_target_tq.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef ISCSI_THREAD_QUEUE_H -#define ISCSI_THREAD_QUEUE_H - -/* - * Defines for thread sets. 
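iscsi_thread_set_init() above sizes its thread-ID bitmap by rounding ISCSI_TS_BITMAP_BITS up to whole longs. A standalone sketch of that arithmetic; BITS_TO_LONGS is re-derived locally here, whereas the kernel gets it from <linux/bitops.h>:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned int bits = 32768;	/* ISCSI_TS_BITMAP_BITS */
	size_t bytes = BITS_TO_LONGS(bits) * sizeof(long);

	/* 4096 bytes on LP64: one page tracks 32K thread-set IDs */
	printf("%u bits -> %zu bytes\n", bits, bytes);
	return 0;
}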
- */ -extern int iscsi_thread_set_force_reinstatement(struct iscsi_conn *); -extern int iscsi_allocate_thread_sets(u32); -extern void iscsi_deallocate_thread_sets(void); -extern void iscsi_activate_thread_set(struct iscsi_conn *, struct iscsi_thread_set *); -extern struct iscsi_thread_set *iscsi_get_thread_set(void); -extern void iscsi_set_thread_clear(struct iscsi_conn *, u8); -extern void iscsi_set_thread_set_signal(struct iscsi_conn *, u8); -extern int iscsi_release_thread_set(struct iscsi_conn *); -extern struct iscsi_conn *iscsi_rx_thread_pre_handler(struct iscsi_thread_set *); -extern struct iscsi_conn *iscsi_tx_thread_pre_handler(struct iscsi_thread_set *); -extern int iscsi_thread_set_init(void); -extern void iscsi_thread_set_free(void); - -extern int iscsi_target_tx_thread(void *); -extern int iscsi_target_rx_thread(void *); - -#define TARGET_THREAD_SET_COUNT 4 - -#define ISCSI_RX_THREAD 1 -#define ISCSI_TX_THREAD 2 -#define ISCSI_RX_THREAD_NAME "iscsi_trx" -#define ISCSI_TX_THREAD_NAME "iscsi_ttx" -#define ISCSI_BLOCK_RX_THREAD 0x1 -#define ISCSI_BLOCK_TX_THREAD 0x2 -#define ISCSI_CLEAR_RX_THREAD 0x1 -#define ISCSI_CLEAR_TX_THREAD 0x2 -#define ISCSI_SIGNAL_RX_THREAD 0x1 -#define ISCSI_SIGNAL_TX_THREAD 0x2 - -/* struct iscsi_thread_set->status */ -#define ISCSI_THREAD_SET_FREE 1 -#define ISCSI_THREAD_SET_ACTIVE 2 -#define ISCSI_THREAD_SET_DIE 3 -#define ISCSI_THREAD_SET_RESET 4 -#define ISCSI_THREAD_SET_DEALLOCATE_THREADS 5 - -/* By default allow a maximum of 32K iSCSI connections */ -#define ISCSI_TS_BITMAP_BITS 32768 - -struct iscsi_thread_set { - /* flags used for blocking and restarting sets */ - int blocked_threads; - /* flag for creating threads */ - int create_threads; - /* flag for delaying readding to inactive list */ - int delay_inactive; - /* status for thread set */ - int status; - /* which threads have had signals sent */ - int signal_sent; - /* flag for which threads exited first */ - int thread_clear; - /* Active threads in the thread set */ - int thread_count; - /* Unique thread ID */ - u32 thread_id; - /* pointer to connection if set is active */ - struct iscsi_conn *conn; - /* used for controlling ts state accesses */ - spinlock_t ts_state_lock; - /* used for restarting thread queue */ - struct completion rx_restart_comp; - /* used for restarting thread queue */ - struct completion tx_restart_comp; - /* used for normal unused blocking */ - struct completion rx_start_comp; - /* used for normal unused blocking */ - struct completion tx_start_comp; - /* OS descriptor for rx thread */ - struct task_struct *rx_thread; - /* OS descriptor for tx thread */ - struct task_struct *tx_thread; - /* struct iscsi_thread_set in list list head*/ - struct list_head ts_list; - struct semaphore ts_activate_sem; -}; - -#endif /*** ISCSI_THREAD_QUEUE_H ***/ diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index 390df8ed72b2..b18edda3e8af 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -33,7 +33,6 @@ #include "iscsi_target_erl1.h" #include "iscsi_target_erl2.h" #include "iscsi_target_tpg.h" -#include "iscsi_target_tq.h" #include "iscsi_target_util.h" #include "iscsi_target.h" diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index c36bd7c29136..51f0c895c6a5 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -41,8 +41,7 @@ #define to_tcm_loop_hba(hba) container_of(hba, struct tcm_loop_hba, dev) -/* Local 
pointer to allocated TCM configfs fabric module */ -static struct target_fabric_configfs *tcm_loop_fabric_configfs; +static const struct target_core_fabric_ops loop_ops; static struct workqueue_struct *tcm_loop_workqueue; static struct kmem_cache *tcm_loop_cmd_cache; @@ -108,7 +107,7 @@ static struct device_driver tcm_loop_driverfs = { /* * Used with root_device_register() in tcm_loop_alloc_core_bus() below */ -struct device *tcm_loop_primary; +static struct device *tcm_loop_primary; static void tcm_loop_submission_work(struct work_struct *work) { @@ -697,6 +696,13 @@ static int tcm_loop_check_prod_mode_write_protect(struct se_portal_group *se_tpg return 0; } +static int tcm_loop_check_prot_fabric_only(struct se_portal_group *se_tpg) +{ + struct tcm_loop_tpg *tl_tpg = container_of(se_tpg, struct tcm_loop_tpg, + tl_se_tpg); + return tl_tpg->tl_fabric_prot_type; +} + static struct se_node_acl *tcm_loop_tpg_alloc_fabric_acl( struct se_portal_group *se_tpg) { @@ -912,6 +918,46 @@ static void tcm_loop_port_unlink( /* End items for tcm_loop_port_cit */ +static ssize_t tcm_loop_tpg_attrib_show_fabric_prot_type( + struct se_portal_group *se_tpg, + char *page) +{ + struct tcm_loop_tpg *tl_tpg = container_of(se_tpg, struct tcm_loop_tpg, + tl_se_tpg); + + return sprintf(page, "%d\n", tl_tpg->tl_fabric_prot_type); +} + +static ssize_t tcm_loop_tpg_attrib_store_fabric_prot_type( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct tcm_loop_tpg *tl_tpg = container_of(se_tpg, struct tcm_loop_tpg, + tl_se_tpg); + unsigned long val; + int ret = kstrtoul(page, 0, &val); + + if (ret) { + pr_err("kstrtoul() returned %d for fabric_prot_type\n", ret); + return ret; + } + if (val != 0 && val != 1 && val != 3) { + pr_err("Invalid qla2xxx fabric_prot_type: %lu\n", val); + return -EINVAL; + } + tl_tpg->tl_fabric_prot_type = val; + + return count; +} + +TF_TPG_ATTRIB_ATTR(tcm_loop, fabric_prot_type, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *tcm_loop_tpg_attrib_attrs[] = { + &tcm_loop_tpg_attrib_fabric_prot_type.attr, + NULL, +}; + /* Start items for tcm_loop_nexus_cit */ static int tcm_loop_make_nexus( @@ -937,7 +983,8 @@ static int tcm_loop_make_nexus( /* * Initialize the struct se_session pointer */ - tl_nexus->se_sess = transport_init_session(TARGET_PROT_ALL); + tl_nexus->se_sess = transport_init_session( + TARGET_PROT_DIN_PASS | TARGET_PROT_DOUT_PASS); if (IS_ERR(tl_nexus->se_sess)) { ret = PTR_ERR(tl_nexus->se_sess); goto out; @@ -1165,21 +1212,19 @@ static struct se_portal_group *tcm_loop_make_naa_tpg( struct tcm_loop_hba *tl_hba = container_of(wwn, struct tcm_loop_hba, tl_hba_wwn); struct tcm_loop_tpg *tl_tpg; - char *tpgt_str, *end_ptr; int ret; - unsigned short int tpgt; + unsigned long tpgt; - tpgt_str = strstr(name, "tpgt_"); - if (!tpgt_str) { + if (strstr(name, "tpgt_") != name) { pr_err("Unable to locate \"tpgt_#\" directory" " group\n"); return ERR_PTR(-EINVAL); } - tpgt_str += 5; /* Skip ahead of "tpgt_" */ - tpgt = (unsigned short int) simple_strtoul(tpgt_str, &end_ptr, 0); + if (kstrtoul(name+5, 10, &tpgt)) + return ERR_PTR(-EINVAL); if (tpgt >= TL_TPGS_PER_HBA) { - pr_err("Passed tpgt: %hu exceeds TL_TPGS_PER_HBA:" + pr_err("Passed tpgt: %lu exceeds TL_TPGS_PER_HBA:" " %u\n", tpgt, TL_TPGS_PER_HBA); return ERR_PTR(-EINVAL); } @@ -1189,14 +1234,13 @@ static struct se_portal_group *tcm_loop_make_naa_tpg( /* * Register the tl_tpg as a emulated SAS TCM Target Endpoint */ - ret = core_tpg_register(&tcm_loop_fabric_configfs->tf_ops, - wwn, &tl_tpg->tl_se_tpg, 
tl_tpg, + ret = core_tpg_register(&loop_ops, wwn, &tl_tpg->tl_se_tpg, tl_tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) return ERR_PTR(-ENOMEM); pr_debug("TCM_Loop_ConfigFS: Allocated Emulated %s" - " Target Port %s,t,0x%04x\n", tcm_loop_dump_proto_id(tl_hba), + " Target Port %s,t,0x%04lx\n", tcm_loop_dump_proto_id(tl_hba), config_item_name(&wwn->wwn_group.cg_item), tpgt); return &tl_tpg->tl_se_tpg; @@ -1338,127 +1382,51 @@ static struct configfs_attribute *tcm_loop_wwn_attrs[] = { /* End items for tcm_loop_cit */ -static int tcm_loop_register_configfs(void) -{ - struct target_fabric_configfs *fabric; - int ret; - /* - * Set the TCM Loop HBA counter to zero - */ - tcm_loop_hba_no_cnt = 0; - /* - * Register the top level struct config_item_type with TCM core - */ - fabric = target_fabric_configfs_init(THIS_MODULE, "loopback"); - if (IS_ERR(fabric)) { - pr_err("tcm_loop_register_configfs() failed!\n"); - return PTR_ERR(fabric); - } - /* - * Setup the fabric API of function pointers used by target_core_mod - */ - fabric->tf_ops.get_fabric_name = &tcm_loop_get_fabric_name; - fabric->tf_ops.get_fabric_proto_ident = &tcm_loop_get_fabric_proto_ident; - fabric->tf_ops.tpg_get_wwn = &tcm_loop_get_endpoint_wwn; - fabric->tf_ops.tpg_get_tag = &tcm_loop_get_tag; - fabric->tf_ops.tpg_get_default_depth = &tcm_loop_get_default_depth; - fabric->tf_ops.tpg_get_pr_transport_id = &tcm_loop_get_pr_transport_id; - fabric->tf_ops.tpg_get_pr_transport_id_len = - &tcm_loop_get_pr_transport_id_len; - fabric->tf_ops.tpg_parse_pr_out_transport_id = - &tcm_loop_parse_pr_out_transport_id; - fabric->tf_ops.tpg_check_demo_mode = &tcm_loop_check_demo_mode; - fabric->tf_ops.tpg_check_demo_mode_cache = - &tcm_loop_check_demo_mode_cache; - fabric->tf_ops.tpg_check_demo_mode_write_protect = - &tcm_loop_check_demo_mode_write_protect; - fabric->tf_ops.tpg_check_prod_mode_write_protect = - &tcm_loop_check_prod_mode_write_protect; - /* - * The TCM loopback fabric module runs in demo-mode to a local - * virtual SCSI device, so fabric dependent initator ACLs are - * not required. 
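The registration routine removed below patched function pointers into a heap-allocated template one assignment at a time; the static const loop_ops table does the same job declaratively. A self-contained sketch of the designated-initializer style, where fabric_ops and both callbacks are invented for the example rather than the real target-core types:

#include <stdio.h>

struct fabric_ops {
	const char *name;
	const char *(*get_fabric_name)(void);
	int (*check_demo_mode)(void);
};

static const char *loop_get_fabric_name(void) { return "loopback"; }
static int loop_check_demo_mode(void) { return 1; }

static const struct fabric_ops loop_ops = {
	.name			= "loopback",
	.get_fabric_name	= loop_get_fabric_name,
	.check_demo_mode	= loop_check_demo_mode,
	/* callbacks left out of the initializer stay NULL, which is
	 * how optional ops such as fabric_make_np are dropped */
};

int main(void)
{
	printf("registered template: %s\n", loop_ops.get_fabric_name());
	return 0;
}

Because the table is const it can live in rodata, and the core only needs to copy it into the target_fabric_configfs it allocates at registration time.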
- */ - fabric->tf_ops.tpg_alloc_fabric_acl = &tcm_loop_tpg_alloc_fabric_acl; - fabric->tf_ops.tpg_release_fabric_acl = - &tcm_loop_tpg_release_fabric_acl; - fabric->tf_ops.tpg_get_inst_index = &tcm_loop_get_inst_index; - /* - * Used for setting up remaining TCM resources in process context - */ - fabric->tf_ops.check_stop_free = &tcm_loop_check_stop_free; - fabric->tf_ops.release_cmd = &tcm_loop_release_cmd; - fabric->tf_ops.shutdown_session = &tcm_loop_shutdown_session; - fabric->tf_ops.close_session = &tcm_loop_close_session; - fabric->tf_ops.sess_get_index = &tcm_loop_sess_get_index; - fabric->tf_ops.sess_get_initiator_sid = NULL; - fabric->tf_ops.write_pending = &tcm_loop_write_pending; - fabric->tf_ops.write_pending_status = &tcm_loop_write_pending_status; - /* - * Not used for TCM loopback - */ - fabric->tf_ops.set_default_node_attributes = - &tcm_loop_set_default_node_attributes; - fabric->tf_ops.get_task_tag = &tcm_loop_get_task_tag; - fabric->tf_ops.get_cmd_state = &tcm_loop_get_cmd_state; - fabric->tf_ops.queue_data_in = &tcm_loop_queue_data_in; - fabric->tf_ops.queue_status = &tcm_loop_queue_status; - fabric->tf_ops.queue_tm_rsp = &tcm_loop_queue_tm_rsp; - fabric->tf_ops.aborted_task = &tcm_loop_aborted_task; - - /* - * Setup function pointers for generic logic in target_core_fabric_configfs.c - */ - fabric->tf_ops.fabric_make_wwn = &tcm_loop_make_scsi_hba; - fabric->tf_ops.fabric_drop_wwn = &tcm_loop_drop_scsi_hba; - fabric->tf_ops.fabric_make_tpg = &tcm_loop_make_naa_tpg; - fabric->tf_ops.fabric_drop_tpg = &tcm_loop_drop_naa_tpg; - /* - * fabric_post_link() and fabric_pre_unlink() are used for - * registration and release of TCM Loop Virtual SCSI LUNs. - */ - fabric->tf_ops.fabric_post_link = &tcm_loop_port_link; - fabric->tf_ops.fabric_pre_unlink = &tcm_loop_port_unlink; - fabric->tf_ops.fabric_make_np = NULL; - fabric->tf_ops.fabric_drop_np = NULL; - /* - * Setup default attribute lists for various fabric->tf_cit_tmpl - */ - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = tcm_loop_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = tcm_loop_tpg_attrs; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - /* - * Once fabric->tf_ops has been setup, now register the fabric for - * use within TCM - */ - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - pr_err("target_fabric_configfs_register() for" - " TCM_Loop failed!\n"); - target_fabric_configfs_free(fabric); - return -1; - } - /* - * Setup our local pointer to *fabric. 
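The tf_cit_tmpl assignments here hand the core NULL-terminated configfs_attribute arrays such as tcm_loop_wwn_attrs. The sentinel is what lets generic code walk the list without a separate count; a small sketch with the attribute type pared down to a name field:

#include <stdio.h>

struct attribute {
	const char *name;
};

static struct attribute attr_version = { .name = "version" };
static struct attribute attr_nexus = { .name = "nexus" };

static struct attribute *wwn_attrs[] = {
	&attr_version,
	&attr_nexus,
	NULL,			/* sentinel, like the arrays above */
};

int main(void)
{
	for (struct attribute **p = wwn_attrs; *p; p++)
		printf("attribute: %s\n", (*p)->name);
	return 0;
}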
- */ - tcm_loop_fabric_configfs = fabric; - pr_debug("TCM_LOOP[0] - Set fabric ->" - " tcm_loop_fabric_configfs\n"); - return 0; -} - -static void tcm_loop_deregister_configfs(void) -{ - if (!tcm_loop_fabric_configfs) - return; - - target_fabric_configfs_deregister(tcm_loop_fabric_configfs); - tcm_loop_fabric_configfs = NULL; - pr_debug("TCM_LOOP[0] - Cleared" - " tcm_loop_fabric_configfs\n"); -} +static const struct target_core_fabric_ops loop_ops = { + .module = THIS_MODULE, + .name = "loopback", + .get_fabric_name = tcm_loop_get_fabric_name, + .get_fabric_proto_ident = tcm_loop_get_fabric_proto_ident, + .tpg_get_wwn = tcm_loop_get_endpoint_wwn, + .tpg_get_tag = tcm_loop_get_tag, + .tpg_get_default_depth = tcm_loop_get_default_depth, + .tpg_get_pr_transport_id = tcm_loop_get_pr_transport_id, + .tpg_get_pr_transport_id_len = tcm_loop_get_pr_transport_id_len, + .tpg_parse_pr_out_transport_id = tcm_loop_parse_pr_out_transport_id, + .tpg_check_demo_mode = tcm_loop_check_demo_mode, + .tpg_check_demo_mode_cache = tcm_loop_check_demo_mode_cache, + .tpg_check_demo_mode_write_protect = + tcm_loop_check_demo_mode_write_protect, + .tpg_check_prod_mode_write_protect = + tcm_loop_check_prod_mode_write_protect, + .tpg_check_prot_fabric_only = tcm_loop_check_prot_fabric_only, + .tpg_alloc_fabric_acl = tcm_loop_tpg_alloc_fabric_acl, + .tpg_release_fabric_acl = tcm_loop_tpg_release_fabric_acl, + .tpg_get_inst_index = tcm_loop_get_inst_index, + .check_stop_free = tcm_loop_check_stop_free, + .release_cmd = tcm_loop_release_cmd, + .shutdown_session = tcm_loop_shutdown_session, + .close_session = tcm_loop_close_session, + .sess_get_index = tcm_loop_sess_get_index, + .write_pending = tcm_loop_write_pending, + .write_pending_status = tcm_loop_write_pending_status, + .set_default_node_attributes = tcm_loop_set_default_node_attributes, + .get_task_tag = tcm_loop_get_task_tag, + .get_cmd_state = tcm_loop_get_cmd_state, + .queue_data_in = tcm_loop_queue_data_in, + .queue_status = tcm_loop_queue_status, + .queue_tm_rsp = tcm_loop_queue_tm_rsp, + .aborted_task = tcm_loop_aborted_task, + .fabric_make_wwn = tcm_loop_make_scsi_hba, + .fabric_drop_wwn = tcm_loop_drop_scsi_hba, + .fabric_make_tpg = tcm_loop_make_naa_tpg, + .fabric_drop_tpg = tcm_loop_drop_naa_tpg, + .fabric_post_link = tcm_loop_port_link, + .fabric_pre_unlink = tcm_loop_port_unlink, + .tfc_wwn_attrs = tcm_loop_wwn_attrs, + .tfc_tpg_base_attrs = tcm_loop_tpg_attrs, + .tfc_tpg_attrib_attrs = tcm_loop_tpg_attrib_attrs, +}; static int __init tcm_loop_fabric_init(void) { @@ -1482,7 +1450,7 @@ static int __init tcm_loop_fabric_init(void) if (ret) goto out_destroy_cache; - ret = tcm_loop_register_configfs(); + ret = target_register_template(&loop_ops); if (ret) goto out_release_core_bus; @@ -1500,7 +1468,7 @@ out: static void __exit tcm_loop_fabric_exit(void) { - tcm_loop_deregister_configfs(); + target_unregister_template(&loop_ops); tcm_loop_release_core_bus(); kmem_cache_destroy(tcm_loop_cmd_cache); destroy_workqueue(tcm_loop_workqueue); diff --git a/drivers/target/loopback/tcm_loop.h b/drivers/target/loopback/tcm_loop.h index 6ae49f272ba6..1e72ff77cac9 100644 --- a/drivers/target/loopback/tcm_loop.h +++ b/drivers/target/loopback/tcm_loop.h @@ -43,6 +43,7 @@ struct tcm_loop_nacl { struct tcm_loop_tpg { unsigned short tl_tpgt; unsigned short tl_transport_status; + enum target_prot_type tl_fabric_prot_type; atomic_t tl_tpg_port_count; struct se_portal_group tl_se_tpg; struct tcm_loop_hba *tl_hba; diff --git a/drivers/target/sbp/sbp_target.c 
b/drivers/target/sbp/sbp_target.c index 9512af6a8114..18b0f9703ff2 100644 --- a/drivers/target/sbp/sbp_target.c +++ b/drivers/target/sbp/sbp_target.c @@ -42,8 +42,7 @@ #include "sbp_target.h" -/* Local pointer to allocated TCM configfs fabric module */ -static struct target_fabric_configfs *sbp_fabric_configfs; +static const struct target_core_fabric_ops sbp_ops; /* FireWire address region for management and command block address handlers */ static const struct fw_address_region sbp_register_region = { @@ -2215,8 +2214,7 @@ static struct se_portal_group *sbp_make_tpg( goto out_free_tpg; } - ret = core_tpg_register(&sbp_fabric_configfs->tf_ops, wwn, - &tpg->se_tpg, (void *)tpg, + ret = core_tpg_register(&sbp_ops, wwn, &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) goto out_unreg_mgt_agt; @@ -2503,7 +2501,9 @@ static struct configfs_attribute *sbp_tpg_attrib_attrs[] = { NULL, }; -static struct target_core_fabric_ops sbp_ops = { +static const struct target_core_fabric_ops sbp_ops = { + .module = THIS_MODULE, + .name = "sbp", .get_fabric_name = sbp_get_fabric_name, .get_fabric_proto_ident = sbp_get_fabric_proto_ident, .tpg_get_wwn = sbp_get_fabric_wwn, @@ -2544,68 +2544,20 @@ static struct target_core_fabric_ops sbp_ops = { .fabric_drop_np = NULL, .fabric_make_nodeacl = sbp_make_nodeacl, .fabric_drop_nodeacl = sbp_drop_nodeacl, -}; - -static int sbp_register_configfs(void) -{ - struct target_fabric_configfs *fabric; - int ret; - - fabric = target_fabric_configfs_init(THIS_MODULE, "sbp"); - if (IS_ERR(fabric)) { - pr_err("target_fabric_configfs_init() failed\n"); - return PTR_ERR(fabric); - } - - fabric->tf_ops = sbp_ops; - - /* - * Setup default attribute lists for various fabric->tf_cit_tmpl - */ - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = sbp_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = sbp_tpg_base_attrs; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = sbp_tpg_attrib_attrs; - fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - pr_err("target_fabric_configfs_register() failed for SBP\n"); - return ret; - } - sbp_fabric_configfs = fabric; - - return 0; -}; - -static void sbp_deregister_configfs(void) -{ - if (!sbp_fabric_configfs) - return; - - target_fabric_configfs_deregister(sbp_fabric_configfs); - sbp_fabric_configfs = NULL; + .tfc_wwn_attrs = sbp_wwn_attrs, + .tfc_tpg_base_attrs = sbp_tpg_base_attrs, + .tfc_tpg_attrib_attrs = sbp_tpg_attrib_attrs, }; static int __init sbp_init(void) { - int ret; - - ret = sbp_register_configfs(); - if (ret < 0) - return ret; - - return 0; + return target_register_template(&sbp_ops); }; static void __exit sbp_exit(void) { - sbp_deregister_configfs(); + target_unregister_template(&sbp_ops); }; MODULE_DESCRIPTION("FireWire SBP fabric driver"); diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 75d89adfccc0..ddaf76a4ac2a 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -142,8 +142,8 @@ static struct config_group *target_core_register_fabric( tf = target_core_get_fabric(name); if (!tf) { - pr_err("target_core_register_fabric() trying autoload for %s\n", - 
name); + pr_debug("target_core_register_fabric() trying autoload for %s\n", + name); /* * Below are some hardcoded request_module() calls to automatically @@ -165,8 +165,8 @@ static struct config_group *target_core_register_fabric( */ ret = request_module("iscsi_target_mod"); if (ret < 0) { - pr_err("request_module() failed for" - " iscsi_target_mod.ko: %d\n", ret); + pr_debug("request_module() failed for" + " iscsi_target_mod.ko: %d\n", ret); return ERR_PTR(-EINVAL); } } else if (!strncmp(name, "loopback", 8)) { @@ -178,8 +178,8 @@ static struct config_group *target_core_register_fabric( */ ret = request_module("tcm_loop"); if (ret < 0) { - pr_err("request_module() failed for" - " tcm_loop.ko: %d\n", ret); + pr_debug("request_module() failed for" + " tcm_loop.ko: %d\n", ret); return ERR_PTR(-EINVAL); } } @@ -188,8 +188,8 @@ static struct config_group *target_core_register_fabric( } if (!tf) { - pr_err("target_core_get_fabric() failed for %s\n", - name); + pr_debug("target_core_get_fabric() failed for %s\n", + name); return ERR_PTR(-EINVAL); } pr_debug("Target_Core_ConfigFS: REGISTER -> Located fabric:" @@ -300,81 +300,17 @@ struct configfs_subsystem *target_core_subsystem[] = { // Start functions called by external Target Fabrics Modules //############################################################################*/ -/* - * First function called by fabric modules to: - * - * 1) Allocate a struct target_fabric_configfs and save the *fabric_cit pointer. - * 2) Add struct target_fabric_configfs to g_tf_list - * 3) Return struct target_fabric_configfs to fabric module to be passed - * into target_fabric_configfs_register(). - */ -struct target_fabric_configfs *target_fabric_configfs_init( - struct module *fabric_mod, - const char *name) +static int target_fabric_tf_ops_check(const struct target_core_fabric_ops *tfo) { - struct target_fabric_configfs *tf; - - if (!(name)) { - pr_err("Unable to locate passed fabric name\n"); - return ERR_PTR(-EINVAL); + if (!tfo->name) { + pr_err("Missing tfo->name\n"); + return -EINVAL; } - if (strlen(name) >= TARGET_FABRIC_NAME_SIZE) { + if (strlen(tfo->name) >= TARGET_FABRIC_NAME_SIZE) { pr_err("Passed name: %s exceeds TARGET_FABRIC" - "_NAME_SIZE\n", name); - return ERR_PTR(-EINVAL); + "_NAME_SIZE\n", tfo->name); + return -EINVAL; } - - tf = kzalloc(sizeof(struct target_fabric_configfs), GFP_KERNEL); - if (!tf) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&tf->tf_list); - atomic_set(&tf->tf_access_cnt, 0); - /* - * Setup the default generic struct config_item_type's (cits) in - * struct target_fabric_configfs->tf_cit_tmpl - */ - tf->tf_module = fabric_mod; - target_fabric_setup_cits(tf); - - tf->tf_subsys = target_core_subsystem[0]; - snprintf(tf->tf_name, TARGET_FABRIC_NAME_SIZE, "%s", name); - - mutex_lock(&g_tf_lock); - list_add_tail(&tf->tf_list, &g_tf_list); - mutex_unlock(&g_tf_lock); - - pr_debug("<<<<<<<<<<<<<<<<<<<<<< BEGIN FABRIC API >>>>>>>>" - ">>>>>>>>>>>>>>\n"); - pr_debug("Initialized struct target_fabric_configfs: %p for" - " %s\n", tf, tf->tf_name); - return tf; -} -EXPORT_SYMBOL(target_fabric_configfs_init); - -/* - * Called by fabric plugins after FAILED target_fabric_configfs_register() call. - */ -void target_fabric_configfs_free( - struct target_fabric_configfs *tf) -{ - mutex_lock(&g_tf_lock); - list_del(&tf->tf_list); - mutex_unlock(&g_tf_lock); - - kfree(tf); -} -EXPORT_SYMBOL(target_fabric_configfs_free); - -/* - * Perform a sanity check of the passed tf->tf_ops before completing - * TCM fabric module registration. 
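target_fabric_tf_ops_check() above starts by rejecting names that the later snprintf() into the fixed-size tf_name buffer would silently truncate. A standalone sketch of that validate-then-copy step, with TARGET_FABRIC_NAME_SIZE assumed to be 32 for the example:

#include <stdio.h>
#include <string.h>

#define TARGET_FABRIC_NAME_SIZE 32

static int register_name(char *dst, const char *name)
{
	if (!name || strlen(name) >= TARGET_FABRIC_NAME_SIZE)
		return -1;	/* -EINVAL in the kernel */
	snprintf(dst, TARGET_FABRIC_NAME_SIZE, "%s", name);
	return 0;
}

int main(void)
{
	char tf_name[TARGET_FABRIC_NAME_SIZE];

	if (register_name(tf_name, "loopback") == 0)
		printf("registered %s\n", tf_name);
	return 0;
}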
- */ -static int target_fabric_tf_ops_check( - struct target_fabric_configfs *tf) -{ - struct target_core_fabric_ops *tfo = &tf->tf_ops; - if (!tfo->get_fabric_name) { pr_err("Missing tfo->get_fabric_name()\n"); return -EINVAL; @@ -508,77 +444,59 @@ static int target_fabric_tf_ops_check( return 0; } -/* - * Called 2nd from fabric module with returned parameter of - * struct target_fabric_configfs * from target_fabric_configfs_init(). - * - * Upon a successful registration, the new fabric's struct config_item is - * return. Also, a pointer to this struct is set in the passed - * struct target_fabric_configfs. - */ -int target_fabric_configfs_register( - struct target_fabric_configfs *tf) +int target_register_template(const struct target_core_fabric_ops *fo) { + struct target_fabric_configfs *tf; int ret; + ret = target_fabric_tf_ops_check(fo); + if (ret) + return ret; + + tf = kzalloc(sizeof(struct target_fabric_configfs), GFP_KERNEL); if (!tf) { - pr_err("Unable to locate target_fabric_configfs" - " pointer\n"); - return -EINVAL; - } - if (!tf->tf_subsys) { - pr_err("Unable to target struct config_subsystem" - " pointer\n"); - return -EINVAL; + pr_err("%s: could not allocate memory!\n", __func__); + return -ENOMEM; } - ret = target_fabric_tf_ops_check(tf); - if (ret < 0) - return ret; - pr_debug("<<<<<<<<<<<<<<<<<<<<<< END FABRIC API >>>>>>>>>>>>" - ">>>>>>>>>>\n"); + INIT_LIST_HEAD(&tf->tf_list); + atomic_set(&tf->tf_access_cnt, 0); + + /* + * Setup the default generic struct config_item_type's (cits) in + * struct target_fabric_configfs->tf_cit_tmpl + */ + tf->tf_module = fo->module; + tf->tf_subsys = target_core_subsystem[0]; + snprintf(tf->tf_name, TARGET_FABRIC_NAME_SIZE, "%s", fo->name); + + tf->tf_ops = *fo; + target_fabric_setup_cits(tf); + + mutex_lock(&g_tf_lock); + list_add_tail(&tf->tf_list, &g_tf_list); + mutex_unlock(&g_tf_lock); + return 0; } -EXPORT_SYMBOL(target_fabric_configfs_register); +EXPORT_SYMBOL(target_register_template); -void target_fabric_configfs_deregister( - struct target_fabric_configfs *tf) +void target_unregister_template(const struct target_core_fabric_ops *fo) { - struct configfs_subsystem *su; + struct target_fabric_configfs *t; - if (!tf) { - pr_err("Unable to locate passed target_fabric_" - "configfs\n"); - return; - } - su = tf->tf_subsys; - if (!su) { - pr_err("Unable to locate passed tf->tf_subsys" - " pointer\n"); - return; - } - pr_debug("<<<<<<<<<<<<<<<<<<<<<< BEGIN FABRIC API >>>>>>>>>>" - ">>>>>>>>>>>>\n"); mutex_lock(&g_tf_lock); - if (atomic_read(&tf->tf_access_cnt)) { - mutex_unlock(&g_tf_lock); - pr_err("Non zero tf->tf_access_cnt for fabric %s\n", - tf->tf_name); - BUG(); + list_for_each_entry(t, &g_tf_list, tf_list) { + if (!strcmp(t->tf_name, fo->name)) { + BUG_ON(atomic_read(&t->tf_access_cnt)); + list_del(&t->tf_list); + kfree(t); + break; + } } - list_del(&tf->tf_list); mutex_unlock(&g_tf_lock); - - pr_debug("Target_Core_ConfigFS: DEREGISTER -> Releasing tf:" - " %s\n", tf->tf_name); - tf->tf_module = NULL; - tf->tf_subsys = NULL; - kfree(tf); - - pr_debug("<<<<<<<<<<<<<<<<<<<<<< END FABRIC API >>>>>>>>>>>>>>>>>" - ">>>>>\n"); } -EXPORT_SYMBOL(target_fabric_configfs_deregister); +EXPORT_SYMBOL(target_unregister_template); /*############################################################################## // Stop functions called by external Target Fabrics Modules @@ -945,7 +863,7 @@ static ssize_t target_core_dev_pr_show_attr_res_pr_holder_tg_port( struct se_lun *lun; struct se_portal_group *se_tpg; struct t10_pr_registration *pr_reg; - 
struct target_core_fabric_ops *tfo; + const struct target_core_fabric_ops *tfo; ssize_t len = 0; spin_lock(&dev->dev_reservation_lock); @@ -979,7 +897,7 @@ SE_DEV_PR_ATTR_RO(res_pr_holder_tg_port); static ssize_t target_core_dev_pr_show_attr_res_pr_registered_i_pts( struct se_device *dev, char *page) { - struct target_core_fabric_ops *tfo; + const struct target_core_fabric_ops *tfo; struct t10_pr_registration *pr_reg; unsigned char buf[384]; char i_buf[PR_REG_ISID_ID_LEN]; diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 0c3f90130b7d..1f7886bb16bf 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -56,6 +56,20 @@ static void target_fabric_setup_##_name##_cit(struct target_fabric_configfs *tf) pr_debug("Setup generic %s\n", __stringify(_name)); \ } +#define TF_CIT_SETUP_DRV(_name, _item_ops, _group_ops) \ +static void target_fabric_setup_##_name##_cit(struct target_fabric_configfs *tf) \ +{ \ + struct target_fabric_configfs_template *tfc = &tf->tf_cit_tmpl; \ + struct config_item_type *cit = &tfc->tfc_##_name##_cit; \ + struct configfs_attribute **attrs = tf->tf_ops.tfc_##_name##_attrs; \ + \ + cit->ct_item_ops = _item_ops; \ + cit->ct_group_ops = _group_ops; \ + cit->ct_attrs = attrs; \ + cit->ct_owner = tf->tf_module; \ + pr_debug("Setup generic %s\n", __stringify(_name)); \ +} + /* Start of tfc_tpg_mappedlun_cit */ static int target_fabric_mappedlun_link( @@ -278,7 +292,7 @@ static struct configfs_item_operations target_fabric_nacl_attrib_item_ops = { .store_attribute = target_fabric_nacl_attrib_attr_store, }; -TF_CIT_SETUP(tpg_nacl_attrib, &target_fabric_nacl_attrib_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_nacl_attrib, &target_fabric_nacl_attrib_item_ops, NULL); /* End of tfc_tpg_nacl_attrib_cit */ @@ -291,7 +305,7 @@ static struct configfs_item_operations target_fabric_nacl_auth_item_ops = { .store_attribute = target_fabric_nacl_auth_attr_store, }; -TF_CIT_SETUP(tpg_nacl_auth, &target_fabric_nacl_auth_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_nacl_auth, &target_fabric_nacl_auth_item_ops, NULL); /* End of tfc_tpg_nacl_auth_cit */ @@ -304,7 +318,7 @@ static struct configfs_item_operations target_fabric_nacl_param_item_ops = { .store_attribute = target_fabric_nacl_param_attr_store, }; -TF_CIT_SETUP(tpg_nacl_param, &target_fabric_nacl_param_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_nacl_param, &target_fabric_nacl_param_item_ops, NULL); /* End of tfc_tpg_nacl_param_cit */ @@ -461,8 +475,8 @@ static struct configfs_group_operations target_fabric_nacl_base_group_ops = { .drop_item = target_fabric_drop_mappedlun, }; -TF_CIT_SETUP(tpg_nacl_base, &target_fabric_nacl_base_item_ops, - &target_fabric_nacl_base_group_ops, NULL); +TF_CIT_SETUP_DRV(tpg_nacl_base, &target_fabric_nacl_base_item_ops, + &target_fabric_nacl_base_group_ops); /* End of tfc_tpg_nacl_base_cit */ @@ -570,7 +584,7 @@ static struct configfs_item_operations target_fabric_np_base_item_ops = { .store_attribute = target_fabric_np_base_attr_store, }; -TF_CIT_SETUP(tpg_np_base, &target_fabric_np_base_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_np_base, &target_fabric_np_base_item_ops, NULL); /* End of tfc_tpg_np_base_cit */ @@ -966,7 +980,7 @@ static struct configfs_item_operations target_fabric_tpg_attrib_item_ops = { .store_attribute = target_fabric_tpg_attrib_attr_store, }; -TF_CIT_SETUP(tpg_attrib, &target_fabric_tpg_attrib_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_attrib, 
&target_fabric_tpg_attrib_item_ops, NULL); /* End of tfc_tpg_attrib_cit */ @@ -979,7 +993,7 @@ static struct configfs_item_operations target_fabric_tpg_auth_item_ops = { .store_attribute = target_fabric_tpg_auth_attr_store, }; -TF_CIT_SETUP(tpg_auth, &target_fabric_tpg_auth_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_auth, &target_fabric_tpg_auth_item_ops, NULL); /* End of tfc_tpg_attrib_cit */ @@ -992,7 +1006,7 @@ static struct configfs_item_operations target_fabric_tpg_param_item_ops = { .store_attribute = target_fabric_tpg_param_attr_store, }; -TF_CIT_SETUP(tpg_param, &target_fabric_tpg_param_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_param, &target_fabric_tpg_param_item_ops, NULL); /* End of tfc_tpg_param_cit */ @@ -1018,7 +1032,7 @@ static struct configfs_item_operations target_fabric_tpg_base_item_ops = { .store_attribute = target_fabric_tpg_attr_store, }; -TF_CIT_SETUP(tpg_base, &target_fabric_tpg_base_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(tpg_base, &target_fabric_tpg_base_item_ops, NULL); /* End of tfc_tpg_base_cit */ @@ -1192,7 +1206,7 @@ static struct configfs_item_operations target_fabric_wwn_item_ops = { .store_attribute = target_fabric_wwn_attr_store, }; -TF_CIT_SETUP(wwn, &target_fabric_wwn_item_ops, &target_fabric_wwn_group_ops, NULL); +TF_CIT_SETUP_DRV(wwn, &target_fabric_wwn_item_ops, &target_fabric_wwn_group_ops); /* End of tfc_wwn_cit */ @@ -1206,7 +1220,7 @@ static struct configfs_item_operations target_fabric_discovery_item_ops = { .store_attribute = target_fabric_discovery_attr_store, }; -TF_CIT_SETUP(discovery, &target_fabric_discovery_item_ops, NULL, NULL); +TF_CIT_SETUP_DRV(discovery, &target_fabric_discovery_item_ops, NULL); /* End of tfc_discovery_cit */ diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 44620fb6bd45..f7e6e51aed36 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -264,40 +264,32 @@ static int fd_do_prot_rw(struct se_cmd *cmd, struct fd_prot *fd_prot, struct se_device *se_dev = cmd->se_dev; struct fd_dev *dev = FD_DEV(se_dev); struct file *prot_fd = dev->fd_prot_file; - struct scatterlist *sg; loff_t pos = (cmd->t_task_lba * se_dev->prot_length); unsigned char *buf; - u32 prot_size, len, size; - int rc, ret = 1, i; + u32 prot_size; + int rc, ret = 1; prot_size = (cmd->data_length / se_dev->dev_attrib.block_size) * se_dev->prot_length; if (!is_write) { - fd_prot->prot_buf = vzalloc(prot_size); + fd_prot->prot_buf = kzalloc(prot_size, GFP_KERNEL); if (!fd_prot->prot_buf) { pr_err("Unable to allocate fd_prot->prot_buf\n"); return -ENOMEM; } buf = fd_prot->prot_buf; - fd_prot->prot_sg_nents = cmd->t_prot_nents; - fd_prot->prot_sg = kzalloc(sizeof(struct scatterlist) * - fd_prot->prot_sg_nents, GFP_KERNEL); + fd_prot->prot_sg_nents = 1; + fd_prot->prot_sg = kzalloc(sizeof(struct scatterlist), + GFP_KERNEL); if (!fd_prot->prot_sg) { pr_err("Unable to allocate fd_prot->prot_sg\n"); - vfree(fd_prot->prot_buf); + kfree(fd_prot->prot_buf); return -ENOMEM; } - size = prot_size; - - for_each_sg(fd_prot->prot_sg, sg, fd_prot->prot_sg_nents, i) { - - len = min_t(u32, PAGE_SIZE, size); - sg_set_buf(sg, buf, len); - size -= len; - buf += len; - } + sg_init_table(fd_prot->prot_sg, fd_prot->prot_sg_nents); + sg_set_buf(fd_prot->prot_sg, buf, prot_size); } if (is_write) { @@ -318,7 +310,7 @@ static int fd_do_prot_rw(struct se_cmd *cmd, struct fd_prot *fd_prot, if (is_write || ret < 0) { kfree(fd_prot->prot_sg); - vfree(fd_prot->prot_buf); + kfree(fd_prot->prot_buf); } return ret; @@ 
-331,36 +323,33 @@ static int fd_do_rw(struct se_cmd *cmd, struct scatterlist *sgl, struct fd_dev *dev = FD_DEV(se_dev); struct file *fd = dev->fd_file; struct scatterlist *sg; - struct iovec *iov; - mm_segment_t old_fs; + struct iov_iter iter; + struct bio_vec *bvec; + ssize_t len = 0; loff_t pos = (cmd->t_task_lba * se_dev->dev_attrib.block_size); int ret = 0, i; - iov = kzalloc(sizeof(struct iovec) * sgl_nents, GFP_KERNEL); - if (!iov) { + bvec = kcalloc(sgl_nents, sizeof(struct bio_vec), GFP_KERNEL); + if (!bvec) { pr_err("Unable to allocate fd_do_readv iov[]\n"); return -ENOMEM; } for_each_sg(sgl, sg, sgl_nents, i) { - iov[i].iov_len = sg->length; - iov[i].iov_base = kmap(sg_page(sg)) + sg->offset; - } + bvec[i].bv_page = sg_page(sg); + bvec[i].bv_len = sg->length; + bvec[i].bv_offset = sg->offset; - old_fs = get_fs(); - set_fs(get_ds()); + len += sg->length; + } + iov_iter_bvec(&iter, ITER_BVEC, bvec, sgl_nents, len); if (is_write) - ret = vfs_writev(fd, &iov[0], sgl_nents, &pos); + ret = vfs_iter_write(fd, &iter, &pos); else - ret = vfs_readv(fd, &iov[0], sgl_nents, &pos); - - set_fs(old_fs); - - for_each_sg(sgl, sg, sgl_nents, i) - kunmap(sg_page(sg)); + ret = vfs_iter_read(fd, &iter, &pos); - kfree(iov); + kfree(bvec); if (is_write) { if (ret < 0 || ret != cmd->data_length) { @@ -436,59 +425,17 @@ fd_execute_sync_cache(struct se_cmd *cmd) return 0; } -static unsigned char * -fd_setup_write_same_buf(struct se_cmd *cmd, struct scatterlist *sg, - unsigned int len) -{ - struct se_device *se_dev = cmd->se_dev; - unsigned int block_size = se_dev->dev_attrib.block_size; - unsigned int i = 0, end; - unsigned char *buf, *p, *kmap_buf; - - buf = kzalloc(min_t(unsigned int, len, PAGE_SIZE), GFP_KERNEL); - if (!buf) { - pr_err("Unable to allocate fd_execute_write_same buf\n"); - return NULL; - } - - kmap_buf = kmap(sg_page(sg)) + sg->offset; - if (!kmap_buf) { - pr_err("kmap() failed in fd_setup_write_same\n"); - kfree(buf); - return NULL; - } - /* - * Fill local *buf to contain multiple WRITE_SAME blocks up to - * min(len, PAGE_SIZE) - */ - p = buf; - end = min_t(unsigned int, len, PAGE_SIZE); - - while (i < end) { - memcpy(p, kmap_buf, block_size); - - i += block_size; - p += block_size; - } - kunmap(sg_page(sg)); - - return buf; -} - static sense_reason_t fd_execute_write_same(struct se_cmd *cmd) { struct se_device *se_dev = cmd->se_dev; struct fd_dev *fd_dev = FD_DEV(se_dev); - struct file *f = fd_dev->fd_file; - struct scatterlist *sg; - struct iovec *iov; - mm_segment_t old_fs; - sector_t nolb = sbc_get_write_same_sectors(cmd); loff_t pos = cmd->t_task_lba * se_dev->dev_attrib.block_size; - unsigned int len, len_tmp, iov_num; - int i, rc; - unsigned char *buf; + sector_t nolb = sbc_get_write_same_sectors(cmd); + struct iov_iter iter; + struct bio_vec *bvec; + unsigned int len = 0, i; + ssize_t ret; if (!nolb) { target_complete_cmd(cmd, SAM_STAT_GOOD); @@ -499,56 +446,92 @@ fd_execute_write_same(struct se_cmd *cmd) " backends not supported\n"); return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } - sg = &cmd->t_data_sg[0]; if (cmd->t_data_nents > 1 || - sg->length != cmd->se_dev->dev_attrib.block_size) { + cmd->t_data_sg[0].length != cmd->se_dev->dev_attrib.block_size) { pr_err("WRITE_SAME: Illegal SGL t_data_nents: %u length: %u" - " block_size: %u\n", cmd->t_data_nents, sg->length, + " block_size: %u\n", + cmd->t_data_nents, + cmd->t_data_sg[0].length, cmd->se_dev->dev_attrib.block_size); return TCM_INVALID_CDB_FIELD; } - len = len_tmp = nolb * se_dev->dev_attrib.block_size; - iov_num = 
DIV_ROUND_UP(len, PAGE_SIZE); - - buf = fd_setup_write_same_buf(cmd, sg, len); - if (!buf) + bvec = kcalloc(nolb, sizeof(struct bio_vec), GFP_KERNEL); + if (!bvec) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - iov = vzalloc(sizeof(struct iovec) * iov_num); - if (!iov) { - pr_err("Unable to allocate fd_execute_write_same iovecs\n"); - kfree(buf); + for (i = 0; i < nolb; i++) { + bvec[i].bv_page = sg_page(&cmd->t_data_sg[0]); + bvec[i].bv_len = cmd->t_data_sg[0].length; + bvec[i].bv_offset = cmd->t_data_sg[0].offset; + + len += se_dev->dev_attrib.block_size; + } + + iov_iter_bvec(&iter, ITER_BVEC, bvec, nolb, len); + ret = vfs_iter_write(fd_dev->fd_file, &iter, &pos); + + kfree(bvec); + if (ret < 0 || ret != len) { + pr_err("vfs_iter_write() returned %zd for write same\n", ret); return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } - /* - * Map the single fabric received scatterlist block now populated - * in *buf into each iovec for I/O submission. - */ - for (i = 0; i < iov_num; i++) { - iov[i].iov_base = buf; - iov[i].iov_len = min_t(unsigned int, len_tmp, PAGE_SIZE); - len_tmp -= iov[i].iov_len; + + target_complete_cmd(cmd, SAM_STAT_GOOD); + return 0; +} + +static int +fd_do_prot_fill(struct se_device *se_dev, sector_t lba, sector_t nolb, + void *buf, size_t bufsize) +{ + struct fd_dev *fd_dev = FD_DEV(se_dev); + struct file *prot_fd = fd_dev->fd_prot_file; + sector_t prot_length, prot; + loff_t pos = lba * se_dev->prot_length; + + if (!prot_fd) { + pr_err("Unable to locate fd_dev->fd_prot_file\n"); + return -ENODEV; } - old_fs = get_fs(); - set_fs(get_ds()); - rc = vfs_writev(f, &iov[0], iov_num, &pos); - set_fs(old_fs); + prot_length = nolb * se_dev->prot_length; - vfree(iov); - kfree(buf); + for (prot = 0; prot < prot_length;) { + sector_t len = min_t(sector_t, bufsize, prot_length - prot); + ssize_t ret = kernel_write(prot_fd, buf, len, pos + prot); - if (rc < 0 || rc != len) { - pr_err("vfs_writev() returned %d for write same\n", rc); - return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + if (ret != len) { + pr_err("vfs_write to prot file failed: %zd\n", ret); + return ret < 0 ? 
ret : -ENODEV; + } + prot += ret; } - target_complete_cmd(cmd, SAM_STAT_GOOD); return 0; } +static int +fd_do_prot_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb) +{ + void *buf; + int rc; + + buf = (void *)__get_free_page(GFP_KERNEL); + if (!buf) { + pr_err("Unable to allocate FILEIO prot buf\n"); + return -ENOMEM; + } + memset(buf, 0xff, PAGE_SIZE); + + rc = fd_do_prot_fill(cmd->se_dev, lba, nolb, buf, PAGE_SIZE); + + free_page((unsigned long)buf); + + return rc; +} + static sense_reason_t fd_do_unmap(struct se_cmd *cmd, void *priv, sector_t lba, sector_t nolb) { @@ -556,6 +539,12 @@ fd_do_unmap(struct se_cmd *cmd, void *priv, sector_t lba, sector_t nolb) struct inode *inode = file->f_mapping->host; int ret; + if (cmd->se_dev->dev_attrib.pi_prot_type) { + ret = fd_do_prot_unmap(cmd, lba, nolb); + if (ret) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + } + if (S_ISBLK(inode->i_mode)) { /* The backend is block device, use discard */ struct block_device *bdev = inode->i_bdev; @@ -595,7 +584,7 @@ fd_execute_write_same_unmap(struct se_cmd *cmd) struct file *file = fd_dev->fd_file; sector_t lba = cmd->t_task_lba; sector_t nolb = sbc_get_write_same_sectors(cmd); - int ret; + sense_reason_t ret; if (!nolb) { target_complete_cmd(cmd, SAM_STAT_GOOD); @@ -643,7 +632,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, if (data_direction == DMA_FROM_DEVICE) { memset(&fd_prot, 0, sizeof(struct fd_prot)); - if (cmd->prot_type) { + if (cmd->prot_type && dev->dev_attrib.pi_prot_type) { ret = fd_do_prot_rw(cmd, &fd_prot, false); if (ret < 0) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -651,23 +640,23 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, ret = fd_do_rw(cmd, sgl, sgl_nents, 0); - if (ret > 0 && cmd->prot_type) { + if (ret > 0 && cmd->prot_type && dev->dev_attrib.pi_prot_type) { u32 sectors = cmd->data_length / dev->dev_attrib.block_size; rc = sbc_dif_verify_read(cmd, cmd->t_task_lba, sectors, 0, fd_prot.prot_sg, 0); if (rc) { kfree(fd_prot.prot_sg); - vfree(fd_prot.prot_buf); + kfree(fd_prot.prot_buf); return rc; } kfree(fd_prot.prot_sg); - vfree(fd_prot.prot_buf); + kfree(fd_prot.prot_buf); } } else { memset(&fd_prot, 0, sizeof(struct fd_prot)); - if (cmd->prot_type) { + if (cmd->prot_type && dev->dev_attrib.pi_prot_type) { u32 sectors = cmd->data_length / dev->dev_attrib.block_size; ret = fd_do_prot_rw(cmd, &fd_prot, false); @@ -678,7 +667,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, 0, fd_prot.prot_sg, 0); if (rc) { kfree(fd_prot.prot_sg); - vfree(fd_prot.prot_buf); + kfree(fd_prot.prot_buf); return rc; } } @@ -705,7 +694,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, vfs_fsync_range(fd_dev->fd_file, start, end, 1); } - if (ret > 0 && cmd->prot_type) { + if (ret > 0 && cmd->prot_type && dev->dev_attrib.pi_prot_type) { ret = fd_do_prot_rw(cmd, &fd_prot, true); if (ret < 0) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -714,7 +703,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, if (ret < 0) { kfree(fd_prot.prot_sg); - vfree(fd_prot.prot_buf); + kfree(fd_prot.prot_buf); return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } @@ -878,48 +867,28 @@ static int fd_init_prot(struct se_device *dev) static int fd_format_prot(struct se_device *dev) { - struct fd_dev *fd_dev = FD_DEV(dev); - struct file *prot_fd = fd_dev->fd_prot_file; - sector_t prot_length, prot; unsigned char *buf; - loff_t pos = 0; int unit_size = FDBD_FORMAT_UNIT_SIZE * 
dev->dev_attrib.block_size; - int rc, ret = 0, size, len; + int ret; if (!dev->dev_attrib.pi_prot_type) { pr_err("Unable to format_prot while pi_prot_type == 0\n"); return -ENODEV; } - if (!prot_fd) { - pr_err("Unable to locate fd_dev->fd_prot_file\n"); - return -ENODEV; - } buf = vzalloc(unit_size); if (!buf) { pr_err("Unable to allocate FILEIO prot buf\n"); return -ENOMEM; } - prot_length = (dev->transport->get_blocks(dev) + 1) * dev->prot_length; - size = prot_length; pr_debug("Using FILEIO prot_length: %llu\n", - (unsigned long long)prot_length); + (unsigned long long)(dev->transport->get_blocks(dev) + 1) * + dev->prot_length); memset(buf, 0xff, unit_size); - for (prot = 0; prot < prot_length; prot += unit_size) { - len = min(unit_size, size); - rc = kernel_write(prot_fd, buf, len, pos); - if (rc != len) { - pr_err("vfs_write to prot file failed: %d\n", rc); - ret = -ENODEV; - goto out; - } - pos += len; - size -= len; - } - -out: + ret = fd_do_prot_fill(dev, 0, dev->transport->get_blocks(dev) + 1, + buf, unit_size); vfree(buf); return ret; } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index d4a4b0fb444a..1b7947c2510f 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -444,7 +444,7 @@ iblock_execute_write_same_unmap(struct se_cmd *cmd) struct block_device *bdev = IBLOCK_DEV(cmd->se_dev)->ibd_bd; sector_t lba = cmd->t_task_lba; sector_t nolb = sbc_get_write_same_sectors(cmd); - int ret; + sense_reason_t ret; ret = iblock_do_unmap(cmd, bdev, lba, nolb); if (ret) @@ -774,7 +774,7 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, sg_num--; } - if (cmd->prot_type) { + if (cmd->prot_type && dev->dev_attrib.pi_prot_type) { int rc = iblock_alloc_bip(cmd, bio_start); if (rc) goto fail_put_bios; diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index 60381db90026..874a9bc988d8 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -4,7 +4,13 @@ /* target_core_alua.c */ extern struct t10_alua_lu_gp *default_lu_gp; +/* target_core_configfs.c */ +extern struct configfs_subsystem *target_core_subsystem[]; + /* target_core_device.c */ +extern struct mutex g_device_mutex; +extern struct list_head g_device_list; + struct se_dev_entry *core_get_se_deve_from_rtpi(struct se_node_acl *, u16); int core_free_device_list_for_node(struct se_node_acl *, struct se_portal_group *); diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 2de6fb8cee8d..c1aa9655e96e 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -78,6 +78,22 @@ enum preempt_type { static void __core_scsi3_complete_pro_release(struct se_device *, struct se_node_acl *, struct t10_pr_registration *, int, int); +static int is_reservation_holder( + struct t10_pr_registration *pr_res_holder, + struct t10_pr_registration *pr_reg) +{ + int pr_res_type; + + if (pr_res_holder) { + pr_res_type = pr_res_holder->pr_res_type; + + return pr_res_holder == pr_reg || + pr_res_type == PR_TYPE_WRITE_EXCLUSIVE_ALLREG || + pr_res_type == PR_TYPE_EXCLUSIVE_ACCESS_ALLREG; + } + return 0; +} + static sense_reason_t target_scsi2_reservation_check(struct se_cmd *cmd) { @@ -664,7 +680,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( struct se_dev_entry *deve_tmp; struct se_node_acl *nacl_tmp; struct se_port *port, *port_tmp; - struct target_core_fabric_ops *tfo = nacl->se_tpg->se_tpg_tfo; 
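fd_do_prot_fill(), shared above by fd_format_prot() and the unmap path, writes the protection region in buffer-sized chunks, advancing the file offset by each completed write and treating a short write as an error. A userspace analogue using pwrite(), with the file name and sizes invented for the example:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>

static int fill_region(int fd, off_t base, size_t total,
		       const void *buf, size_t bufsize)
{
	size_t done = 0;

	while (done < total) {
		size_t len = total - done < bufsize ? total - done : bufsize;
		ssize_t ret = pwrite(fd, buf, len, base + done);

		if (ret != (ssize_t)len) {	/* short write == failure */
			perror("pwrite");
			return -1;
		}
		done += ret;
	}
	return 0;
}

int main(void)
{
	unsigned char buf[4096];
	int fd = open("prot.img", O_CREAT | O_WRONLY, 0600);

	if (fd < 0)
		return 1;
	memset(buf, 0xff, sizeof(buf));	/* 0xff == "unwritten" PI, as above */
	fill_region(fd, 0, 3 * sizeof(buf) + 100, buf, sizeof(buf));
	close(fd);
	return 0;
}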
+ const struct target_core_fabric_ops *tfo = nacl->se_tpg->se_tpg_tfo; struct t10_pr_registration *pr_reg, *pr_reg_atp, *pr_reg_tmp, *pr_reg_tmp_safe; int ret; /* @@ -963,7 +979,7 @@ int core_scsi3_check_aptpl_registration( } static void __core_scsi3_dump_registration( - struct target_core_fabric_ops *tfo, + const struct target_core_fabric_ops *tfo, struct se_device *dev, struct se_node_acl *nacl, struct t10_pr_registration *pr_reg, @@ -1004,7 +1020,7 @@ static void __core_scsi3_add_registration( enum register_type register_type, int register_move) { - struct target_core_fabric_ops *tfo = nacl->se_tpg->se_tpg_tfo; + const struct target_core_fabric_ops *tfo = nacl->se_tpg->se_tpg_tfo; struct t10_pr_registration *pr_reg_tmp, *pr_reg_tmp_safe; struct t10_reservation *pr_tmpl = &dev->t10_pr; @@ -1220,8 +1236,10 @@ static void __core_scsi3_free_registration( struct t10_pr_registration *pr_reg, struct list_head *preempt_and_abort_list, int dec_holders) + __releases(&pr_tmpl->registration_lock) + __acquires(&pr_tmpl->registration_lock) { - struct target_core_fabric_ops *tfo = + const struct target_core_fabric_ops *tfo = pr_reg->pr_reg_nacl->se_tpg->se_tpg_tfo; struct t10_reservation *pr_tmpl = &dev->t10_pr; char i_buf[PR_REG_ISID_ID_LEN]; @@ -1445,7 +1463,7 @@ core_scsi3_decode_spec_i_port( struct t10_pr_registration *pr_reg_tmp, *pr_reg_tmp_safe; LIST_HEAD(tid_dest_list); struct pr_transport_id_holder *tidh_new, *tidh, *tidh_tmp; - struct target_core_fabric_ops *tmp_tf_ops; + const struct target_core_fabric_ops *tmp_tf_ops; unsigned char *buf; unsigned char *ptr, *i_str = NULL, proto_ident, tmp_proto_ident; char *iport_ptr = NULL, i_buf[PR_REG_ISID_ID_LEN]; @@ -2287,7 +2305,6 @@ core_scsi3_pro_reserve(struct se_cmd *cmd, int type, int scope, u64 res_key) spin_lock(&dev->dev_reservation_lock); pr_res_holder = dev->dev_pr_res_holder; if (pr_res_holder) { - int pr_res_type = pr_res_holder->pr_res_type; /* * From spc4r17 Section 5.7.9: Reserving: * @@ -2298,9 +2315,7 @@ core_scsi3_pro_reserve(struct se_cmd *cmd, int type, int scope, u64 res_key) * the logical unit, then the command shall be completed with * RESERVATION CONFLICT status. 
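is_reservation_holder(), introduced above, consolidates a check that was previously open-coded at several call sites: the I_T nexus holds the reservation if its registration is the holder outright, or if the reservation is of an all-registrants type. A self-contained rendering with the types reduced to the fields the test needs:

#include <stdio.h>

enum pr_type {
	PR_TYPE_WRITE_EXCLUSIVE_ALLREG = 1,
	PR_TYPE_EXCLUSIVE_ACCESS_ALLREG,
	PR_TYPE_WRITE_EXCLUSIVE_REGONLY,
};

struct pr_registration {
	enum pr_type pr_res_type;
};

static int is_reservation_holder(struct pr_registration *holder,
				 struct pr_registration *reg)
{
	if (holder) {
		/* all-registrants reservations are held by every registrant */
		return holder == reg ||
		       holder->pr_res_type == PR_TYPE_WRITE_EXCLUSIVE_ALLREG ||
		       holder->pr_res_type == PR_TYPE_EXCLUSIVE_ACCESS_ALLREG;
	}
	return 0;
}

int main(void)
{
	struct pr_registration a = { PR_TYPE_WRITE_EXCLUSIVE_ALLREG };
	struct pr_registration b = { PR_TYPE_WRITE_EXCLUSIVE_REGONLY };

	printf("%d %d %d\n",
	       is_reservation_holder(&a, &b),	/* 1: all-registrants */
	       is_reservation_holder(&b, &b),	/* 1: same registration */
	       is_reservation_holder(&b, &a));	/* 0: different holder */
	return 0;
}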
*/ - if ((pr_res_holder != pr_reg) && - (pr_res_type != PR_TYPE_WRITE_EXCLUSIVE_ALLREG) && - (pr_res_type != PR_TYPE_EXCLUSIVE_ACCESS_ALLREG)) { + if (!is_reservation_holder(pr_res_holder, pr_reg)) { struct se_node_acl *pr_res_nacl = pr_res_holder->pr_reg_nacl; pr_err("SPC-3 PR: Attempted RESERVE from" " [%s]: %s while reservation already held by" @@ -2409,7 +2424,7 @@ static void __core_scsi3_complete_pro_release( int explicit, int unreg) { - struct target_core_fabric_ops *tfo = se_nacl->se_tpg->se_tpg_tfo; + const struct target_core_fabric_ops *tfo = se_nacl->se_tpg->se_tpg_tfo; char i_buf[PR_REG_ISID_ID_LEN]; int pr_res_type = 0, pr_res_scope = 0; @@ -2477,7 +2492,6 @@ core_scsi3_emulate_pro_release(struct se_cmd *cmd, int type, int scope, struct se_lun *se_lun = cmd->se_lun; struct t10_pr_registration *pr_reg, *pr_reg_p, *pr_res_holder; struct t10_reservation *pr_tmpl = &dev->t10_pr; - int all_reg = 0; sense_reason_t ret = 0; if (!se_sess || !se_lun) { @@ -2514,13 +2528,9 @@ core_scsi3_emulate_pro_release(struct se_cmd *cmd, int type, int scope, spin_unlock(&dev->dev_reservation_lock); goto out_put_pr_reg; } - if ((pr_res_holder->pr_res_type == PR_TYPE_WRITE_EXCLUSIVE_ALLREG) || - (pr_res_holder->pr_res_type == PR_TYPE_EXCLUSIVE_ACCESS_ALLREG)) - all_reg = 1; - if ((all_reg == 0) && (pr_res_holder != pr_reg)) { + if (!is_reservation_holder(pr_res_holder, pr_reg)) { /* - * Non 'All Registrants' PR Type cases.. * Release request from a registered I_T nexus that is not a * persistent reservation holder. return GOOD status. */ @@ -2726,7 +2736,7 @@ static void __core_scsi3_complete_pro_preempt( enum preempt_type preempt_type) { struct se_node_acl *nacl = pr_reg->pr_reg_nacl; - struct target_core_fabric_ops *tfo = nacl->se_tpg->se_tpg_tfo; + const struct target_core_fabric_ops *tfo = nacl->se_tpg->se_tpg_tfo; char i_buf[PR_REG_ISID_ID_LEN]; memset(i_buf, 0, PR_REG_ISID_ID_LEN); @@ -3111,7 +3121,7 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key, struct se_node_acl *pr_res_nacl, *pr_reg_nacl, *dest_node_acl = NULL; struct se_port *se_port; struct se_portal_group *se_tpg, *dest_se_tpg = NULL; - struct target_core_fabric_ops *dest_tf_ops = NULL, *tf_ops; + const struct target_core_fabric_ops *dest_tf_ops = NULL, *tf_ops; struct t10_pr_registration *pr_reg, *pr_res_holder, *dest_pr_reg; struct t10_reservation *pr_tmpl = &dev->t10_pr; unsigned char *buf; @@ -3375,7 +3385,7 @@ after_iport_check: * From spc4r17 section 5.7.8 Table 50 -- * Register behaviors for a REGISTER AND MOVE service action */ - if (pr_res_holder != pr_reg) { + if (!is_reservation_holder(pr_res_holder, pr_reg)) { pr_warn("SPC-3 PR REGISTER_AND_MOVE: Calling I_T" " Nexus is not reservation holder\n"); spin_unlock(&dev->dev_reservation_lock); diff --git a/drivers/target/target_core_rd.c b/drivers/target/target_core_rd.c index 98e83ac5661b..a263bf5fab8d 100644 --- a/drivers/target/target_core_rd.c +++ b/drivers/target/target_core_rd.c @@ -139,10 +139,22 @@ static int rd_allocate_sgl_table(struct rd_dev *rd_dev, struct rd_dev_sg_table * unsigned char *p; while (total_sg_needed) { + unsigned int chain_entry = 0; + sg_per_table = (total_sg_needed > max_sg_per_table) ? 
max_sg_per_table : total_sg_needed; - sg = kzalloc(sg_per_table * sizeof(struct scatterlist), +#ifdef CONFIG_ARCH_HAS_SG_CHAIN + + /* + * Reserve extra element for chain entry + */ + if (sg_per_table < total_sg_needed) + chain_entry = 1; + +#endif /* CONFIG_ARCH_HAS_SG_CHAIN */ + + sg = kcalloc(sg_per_table + chain_entry, sizeof(*sg), GFP_KERNEL); if (!sg) { pr_err("Unable to allocate scatterlist array" @@ -150,7 +162,16 @@ static int rd_allocate_sgl_table(struct rd_dev *rd_dev, struct rd_dev_sg_table * return -ENOMEM; } - sg_init_table(sg, sg_per_table); + sg_init_table(sg, sg_per_table + chain_entry); + +#ifdef CONFIG_ARCH_HAS_SG_CHAIN + + if (i > 0) { + sg_chain(sg_table[i - 1].sg_table, + max_sg_per_table + 1, sg); + } + +#endif /* CONFIG_ARCH_HAS_SG_CHAIN */ sg_table[i].sg_table = sg; sg_table[i].rd_sg_count = sg_per_table; @@ -382,6 +403,76 @@ static struct rd_dev_sg_table *rd_get_prot_table(struct rd_dev *rd_dev, u32 page return NULL; } +typedef sense_reason_t (*dif_verify)(struct se_cmd *, sector_t, unsigned int, + unsigned int, struct scatterlist *, int); + +static sense_reason_t rd_do_prot_rw(struct se_cmd *cmd, dif_verify dif_verify) +{ + struct se_device *se_dev = cmd->se_dev; + struct rd_dev *dev = RD_DEV(se_dev); + struct rd_dev_sg_table *prot_table; + bool need_to_release = false; + struct scatterlist *prot_sg; + u32 sectors = cmd->data_length / se_dev->dev_attrib.block_size; + u32 prot_offset, prot_page; + u32 prot_npages __maybe_unused; + u64 tmp; + sense_reason_t rc = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + + tmp = cmd->t_task_lba * se_dev->prot_length; + prot_offset = do_div(tmp, PAGE_SIZE); + prot_page = tmp; + + prot_table = rd_get_prot_table(dev, prot_page); + if (!prot_table) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + + prot_sg = &prot_table->sg_table[prot_page - + prot_table->page_start_offset]; + +#ifndef CONFIG_ARCH_HAS_SG_CHAIN + + prot_npages = DIV_ROUND_UP(prot_offset + sectors * se_dev->prot_length, + PAGE_SIZE); + + /* + * Allocate temporarily contiguous scatterlist entries if prot pages + * straddle multiple scatterlist tables. + */ + if (prot_table->page_end_offset < prot_page + prot_npages - 1) { + int i; + + prot_sg = kcalloc(prot_npages, sizeof(*prot_sg), GFP_KERNEL); + if (!prot_sg) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + + need_to_release = true; + sg_init_table(prot_sg, prot_npages); + + for (i = 0; i < prot_npages; i++) { + if (prot_page + i > prot_table->page_end_offset) { + prot_table = rd_get_prot_table(dev, + prot_page + i); + if (!prot_table) { + kfree(prot_sg); + return rc; + } + sg_unmark_end(&prot_sg[i - 1]); + } + prot_sg[i] = prot_table->sg_table[prot_page + i - + prot_table->page_start_offset]; + } + } + +#endif /* !CONFIG_ARCH_HAS_SG_CHAIN */ + + rc = dif_verify(cmd, cmd->t_task_lba, sectors, 0, prot_sg, prot_offset); + if (need_to_release) + kfree(prot_sg); + + return rc; +} + static sense_reason_t rd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, enum dma_data_direction data_direction) @@ -419,24 +510,9 @@ rd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, data_direction == DMA_FROM_DEVICE ?
"Read" : "Write", cmd->t_task_lba, rd_size, rd_page, rd_offset); - if (cmd->prot_type && data_direction == DMA_TO_DEVICE) { - struct rd_dev_sg_table *prot_table; - struct scatterlist *prot_sg; - u32 sectors = cmd->data_length / se_dev->dev_attrib.block_size; - u32 prot_offset, prot_page; - - tmp = cmd->t_task_lba * se_dev->prot_length; - prot_offset = do_div(tmp, PAGE_SIZE); - prot_page = tmp; - - prot_table = rd_get_prot_table(dev, prot_page); - if (!prot_table) - return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - - prot_sg = &prot_table->sg_table[prot_page - prot_table->page_start_offset]; - - rc = sbc_dif_verify_write(cmd, cmd->t_task_lba, sectors, 0, - prot_sg, prot_offset); + if (cmd->prot_type && se_dev->dev_attrib.pi_prot_type && + data_direction == DMA_TO_DEVICE) { + rc = rd_do_prot_rw(cmd, sbc_dif_verify_write); if (rc) return rc; } @@ -502,24 +578,9 @@ rd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, } sg_miter_stop(&m); - if (cmd->prot_type && data_direction == DMA_FROM_DEVICE) { - struct rd_dev_sg_table *prot_table; - struct scatterlist *prot_sg; - u32 sectors = cmd->data_length / se_dev->dev_attrib.block_size; - u32 prot_offset, prot_page; - - tmp = cmd->t_task_lba * se_dev->prot_length; - prot_offset = do_div(tmp, PAGE_SIZE); - prot_page = tmp; - - prot_table = rd_get_prot_table(dev, prot_page); - if (!prot_table) - return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - - prot_sg = &prot_table->sg_table[prot_page - prot_table->page_start_offset]; - - rc = sbc_dif_verify_read(cmd, cmd->t_task_lba, sectors, 0, - prot_sg, prot_offset); + if (cmd->prot_type && se_dev->dev_attrib.pi_prot_type && + data_direction == DMA_FROM_DEVICE) { + rc = rd_do_prot_rw(cmd, sbc_dif_verify_read); if (rc) return rc; } diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index 3e7297411110..8855781ac653 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -93,6 +93,8 @@ sbc_emulate_readcapacity_16(struct se_cmd *cmd) { struct se_device *dev = cmd->se_dev; struct se_session *sess = cmd->se_sess; + int pi_prot_type = dev->dev_attrib.pi_prot_type; + unsigned char *rbuf; unsigned char buf[32]; unsigned long long blocks = dev->transport->get_blocks(dev); @@ -114,8 +116,15 @@ sbc_emulate_readcapacity_16(struct se_cmd *cmd) * Set P_TYPE and PROT_EN bits for DIF support */ if (sess->sup_prot_ops & (TARGET_PROT_DIN_PASS | TARGET_PROT_DOUT_PASS)) { - if (dev->dev_attrib.pi_prot_type) - buf[12] = (dev->dev_attrib.pi_prot_type - 1) << 1 | 0x1; + /* + * Only override a device's pi_prot_type if no T10-PI is + * available, and sess_prot_type has been explicitly enabled. 
+ */ + if (!pi_prot_type) + pi_prot_type = sess->sess_prot_type; + + if (pi_prot_type) + buf[12] = (pi_prot_type - 1) << 1 | 0x1; } if (dev->transport->get_lbppbe) @@ -312,7 +321,7 @@ sbc_setup_write_same(struct se_cmd *cmd, unsigned char *flags, struct sbc_ops *o return 0; } -static sense_reason_t xdreadwrite_callback(struct se_cmd *cmd) +static sense_reason_t xdreadwrite_callback(struct se_cmd *cmd, bool success) { unsigned char *buf, *addr; struct scatterlist *sg; @@ -376,7 +385,7 @@ sbc_execute_rw(struct se_cmd *cmd) cmd->data_direction); } -static sense_reason_t compare_and_write_post(struct se_cmd *cmd) +static sense_reason_t compare_and_write_post(struct se_cmd *cmd, bool success) { struct se_device *dev = cmd->se_dev; @@ -399,7 +408,7 @@ static sense_reason_t compare_and_write_post(struct se_cmd *cmd) return TCM_NO_SENSE; } -static sense_reason_t compare_and_write_callback(struct se_cmd *cmd) +static sense_reason_t compare_and_write_callback(struct se_cmd *cmd, bool success) { struct se_device *dev = cmd->se_dev; struct scatterlist *write_sg = NULL, *sg; @@ -414,11 +423,16 @@ static sense_reason_t compare_and_write_callback(struct se_cmd *cmd) /* * Handle early failure in transport_generic_request_failure(), - * which will not have taken ->caw_mutex yet.. + * which will not have taken ->caw_sem yet.. */ - if (!cmd->t_data_sg || !cmd->t_bidi_data_sg) + if (!success && (!cmd->t_data_sg || !cmd->t_bidi_data_sg)) return TCM_NO_SENSE; /* + * Handle special case for zero-length COMPARE_AND_WRITE + */ + if (!cmd->data_length) + goto out; + /* * Immediately exit + release dev->caw_sem if command has already * been failed with a non-zero SCSI status. */ @@ -581,12 +595,13 @@ sbc_compare_and_write(struct se_cmd *cmd) } static int -sbc_set_prot_op_checks(u8 protect, enum target_prot_type prot_type, +sbc_set_prot_op_checks(u8 protect, bool fabric_prot, enum target_prot_type prot_type, bool is_write, struct se_cmd *cmd) { if (is_write) { - cmd->prot_op = protect ? TARGET_PROT_DOUT_PASS : - TARGET_PROT_DOUT_INSERT; + cmd->prot_op = fabric_prot ? TARGET_PROT_DOUT_STRIP : + protect ? TARGET_PROT_DOUT_PASS : + TARGET_PROT_DOUT_INSERT; switch (protect) { case 0x0: case 0x3: @@ -610,8 +625,9 @@ sbc_set_prot_op_checks(u8 protect, enum target_prot_type prot_type, return -EINVAL; } } else { - cmd->prot_op = protect ? TARGET_PROT_DIN_PASS : - TARGET_PROT_DIN_STRIP; + cmd->prot_op = fabric_prot ? TARGET_PROT_DIN_INSERT : + protect ? 
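These nested ternaries are dense; for the READ direction the decision they implement is: fabric-only protection means the fabric strips PI on writes and must have it inserted on reads, otherwise PASS when the initiator supplied PI (protect != 0), else STRIP it before returning data. Restated as a hypothetical helper:

static enum target_prot_op pick_din_op(bool fabric_prot, u8 protect)
{
	if (fabric_prot)
		return TARGET_PROT_DIN_INSERT;
	return protect ? TARGET_PROT_DIN_PASS : TARGET_PROT_DIN_STRIP;
}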
TARGET_PROT_DIN_PASS : + TARGET_PROT_DIN_STRIP; switch (protect) { case 0x0: case 0x1: @@ -644,11 +660,15 @@ sbc_check_prot(struct se_device *dev, struct se_cmd *cmd, unsigned char *cdb, u32 sectors, bool is_write) { u8 protect = cdb[1] >> 5; + int sp_ops = cmd->se_sess->sup_prot_ops; + int pi_prot_type = dev->dev_attrib.pi_prot_type; + bool fabric_prot = false; if (!cmd->t_prot_sg || !cmd->t_prot_nents) { - if (protect && !dev->dev_attrib.pi_prot_type) { - pr_err("CDB contains protect bit, but device does not" - " advertise PROTECT=1 feature bit\n"); + if (unlikely(protect && + !dev->dev_attrib.pi_prot_type && !cmd->se_sess->sess_prot_type)) { + pr_err("CDB contains protect bit, but device + fabric does" + " not advertise PROTECT=1 feature bit\n"); return TCM_INVALID_CDB_FIELD; } if (cmd->prot_pto) @@ -669,15 +689,32 @@ sbc_check_prot(struct se_device *dev, struct se_cmd *cmd, unsigned char *cdb, cmd->reftag_seed = cmd->t_task_lba; break; case TARGET_DIF_TYPE0_PROT: + /* + * See if the fabric supports T10-PI, and the session has been + * configured to allow export PROTECT=1 feature bit with backend + * devices that don't support T10-PI. + */ + fabric_prot = is_write ? + !!(sp_ops & (TARGET_PROT_DOUT_PASS | TARGET_PROT_DOUT_STRIP)) : + !!(sp_ops & (TARGET_PROT_DIN_PASS | TARGET_PROT_DIN_INSERT)); + + if (fabric_prot && cmd->se_sess->sess_prot_type) { + pi_prot_type = cmd->se_sess->sess_prot_type; + break; + } + if (!protect) + return TCM_NO_SENSE; + /* Fallthrough */ default: - return TCM_NO_SENSE; + pr_err("Unable to determine pi_prot_type for CDB: 0x%02x " + "PROTECT: 0x%02x\n", cdb[0], protect); + return TCM_INVALID_CDB_FIELD; } - if (sbc_set_prot_op_checks(protect, dev->dev_attrib.pi_prot_type, - is_write, cmd)) + if (sbc_set_prot_op_checks(protect, fabric_prot, pi_prot_type, is_write, cmd)) return TCM_INVALID_CDB_FIELD; - cmd->prot_type = dev->dev_attrib.pi_prot_type; + cmd->prot_type = pi_prot_type; cmd->prot_length = dev->prot_length * sectors; /** @@ -1166,14 +1203,16 @@ sbc_dif_generate(struct se_cmd *cmd) sdt = paddr + offset; sdt->guard_tag = cpu_to_be16(crc_t10dif(daddr + j, dev->dev_attrib.block_size)); - if (dev->dev_attrib.pi_prot_type == TARGET_DIF_TYPE1_PROT) + if (cmd->prot_type == TARGET_DIF_TYPE1_PROT) sdt->ref_tag = cpu_to_be32(sector & 0xffffffff); sdt->app_tag = 0; - pr_debug("DIF WRITE INSERT sector: %llu guard_tag: 0x%04x" + pr_debug("DIF %s INSERT sector: %llu guard_tag: 0x%04x" " app_tag: 0x%04x ref_tag: %u\n", - (unsigned long long)sector, sdt->guard_tag, - sdt->app_tag, be32_to_cpu(sdt->ref_tag)); + (cmd->data_direction == DMA_TO_DEVICE) ? 
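sbc_dif_generate() above fills one DIF v1 tuple per logical block: the guard tag is a T10 CRC over the block data, the application tag stays zero, and TYPE1 additionally seeds the reference tag with the low 32 bits of the LBA. A condensed per-block sketch; fill_dif_tuple() is illustrative only:

#include <linux/crc-t10dif.h>

static void fill_dif_tuple(struct se_dif_v1_tuple *sdt, const void *data,
			   unsigned int block_size, sector_t lba, bool type1)
{
	sdt->guard_tag = cpu_to_be16(crc_t10dif(data, block_size));
	sdt->app_tag = 0;
	/* TYPE1 seeds the reference tag from the LBA; others leave it 0 */
	sdt->ref_tag = type1 ? cpu_to_be32(lba & 0xffffffff) : 0;
}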
+ "WRITE" : "READ", (unsigned long long)sector, + sdt->guard_tag, sdt->app_tag, + be32_to_cpu(sdt->ref_tag)); sector++; offset += sizeof(struct se_dif_v1_tuple); @@ -1185,12 +1224,16 @@ sbc_dif_generate(struct se_cmd *cmd) } static sense_reason_t -sbc_dif_v1_verify(struct se_device *dev, struct se_dif_v1_tuple *sdt, +sbc_dif_v1_verify(struct se_cmd *cmd, struct se_dif_v1_tuple *sdt, const void *p, sector_t sector, unsigned int ei_lba) { + struct se_device *dev = cmd->se_dev; int block_size = dev->dev_attrib.block_size; __be16 csum; + if (!(cmd->prot_checks & TARGET_DIF_CHECK_GUARD)) + goto check_ref; + csum = cpu_to_be16(crc_t10dif(p, block_size)); if (sdt->guard_tag != csum) { @@ -1200,7 +1243,11 @@ sbc_dif_v1_verify(struct se_device *dev, struct se_dif_v1_tuple *sdt, return TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED; } - if (dev->dev_attrib.pi_prot_type == TARGET_DIF_TYPE1_PROT && +check_ref: + if (!(cmd->prot_checks & TARGET_DIF_CHECK_REFTAG)) + return 0; + + if (cmd->prot_type == TARGET_DIF_TYPE1_PROT && be32_to_cpu(sdt->ref_tag) != (sector & 0xffffffff)) { pr_err("DIFv1 Type 1 reference failed on sector: %llu tag: 0x%08x" " sector MSB: 0x%08x\n", (unsigned long long)sector, @@ -1208,7 +1255,7 @@ sbc_dif_v1_verify(struct se_device *dev, struct se_dif_v1_tuple *sdt, return TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; } - if (dev->dev_attrib.pi_prot_type == TARGET_DIF_TYPE2_PROT && + if (cmd->prot_type == TARGET_DIF_TYPE2_PROT && be32_to_cpu(sdt->ref_tag) != ei_lba) { pr_err("DIFv1 Type 2 reference failed on sector: %llu tag: 0x%08x" " ei_lba: 0x%08x\n", (unsigned long long)sector, @@ -1229,6 +1276,9 @@ sbc_dif_copy_prot(struct se_cmd *cmd, unsigned int sectors, bool read, unsigned int i, len, left; unsigned int offset = sg_off; + if (!sg) + return; + left = sectors * dev->prot_length; for_each_sg(cmd->t_prot_sg, psg, cmd->t_prot_nents, i) { @@ -1292,7 +1342,7 @@ sbc_dif_verify_write(struct se_cmd *cmd, sector_t start, unsigned int sectors, (unsigned long long)sector, sdt->guard_tag, sdt->app_tag, be32_to_cpu(sdt->ref_tag)); - rc = sbc_dif_v1_verify(dev, sdt, daddr + j, sector, + rc = sbc_dif_v1_verify(cmd, sdt, daddr + j, sector, ei_lba); if (rc) { kunmap_atomic(paddr); @@ -1309,6 +1359,9 @@ sbc_dif_verify_write(struct se_cmd *cmd, sector_t start, unsigned int sectors, kunmap_atomic(paddr); kunmap_atomic(daddr); } + if (!sg) + return 0; + sbc_dif_copy_prot(cmd, sectors, false, sg, sg_off); return 0; @@ -1353,7 +1406,7 @@ __sbc_dif_verify_read(struct se_cmd *cmd, sector_t start, unsigned int sectors, continue; } - rc = sbc_dif_v1_verify(dev, sdt, daddr + j, sector, + rc = sbc_dif_v1_verify(cmd, sdt, daddr + j, sector, ei_lba); if (rc) { kunmap_atomic(paddr); diff --git a/drivers/target/target_core_spc.c b/drivers/target/target_core_spc.c index 6c8bd6bc175c..7912aa124385 100644 --- a/drivers/target/target_core_spc.c +++ b/drivers/target/target_core_spc.c @@ -103,10 +103,12 @@ spc_emulate_inquiry_std(struct se_cmd *cmd, unsigned char *buf) buf[5] |= 0x8; /* * Set Protection (PROTECT) bit when DIF has been enabled on the - * device, and the transport supports VERIFY + PASS. + * device, and the fabric supports VERIFY + PASS. Also report + * PROTECT=1 if sess_prot_type has been configured to allow T10-PI + * to unprotected devices. 
*/ if (sess->sup_prot_ops & (TARGET_PROT_DIN_PASS | TARGET_PROT_DOUT_PASS)) { - if (dev->dev_attrib.pi_prot_type) + if (dev->dev_attrib.pi_prot_type || cmd->se_sess->sess_prot_type) buf[5] |= 0x1; } @@ -467,9 +469,11 @@ spc_emulate_evpd_86(struct se_cmd *cmd, unsigned char *buf) * only for TYPE3 protection. */ if (sess->sup_prot_ops & (TARGET_PROT_DIN_PASS | TARGET_PROT_DOUT_PASS)) { - if (dev->dev_attrib.pi_prot_type == TARGET_DIF_TYPE1_PROT) + if (dev->dev_attrib.pi_prot_type == TARGET_DIF_TYPE1_PROT || + cmd->se_sess->sess_prot_type == TARGET_DIF_TYPE1_PROT) buf[4] = 0x5; - else if (dev->dev_attrib.pi_prot_type == TARGET_DIF_TYPE3_PROT) + else if (dev->dev_attrib.pi_prot_type == TARGET_DIF_TYPE3_PROT || + cmd->se_sess->sess_prot_type == TARGET_DIF_TYPE3_PROT) buf[4] = 0x4; } @@ -861,7 +865,7 @@ static int spc_modesense_control(struct se_cmd *cmd, u8 pc, u8 *p) * TAG field. */ if (sess->sup_prot_ops & (TARGET_PROT_DIN_PASS | TARGET_PROT_DOUT_PASS)) { - if (dev->dev_attrib.pi_prot_type) + if (dev->dev_attrib.pi_prot_type || sess->sess_prot_type) p[5] |= 0x80; } @@ -1099,7 +1103,7 @@ static sense_reason_t spc_emulate_modeselect(struct se_cmd *cmd) unsigned char *buf; unsigned char tbuf[SE_MODE_PAGE_BUF]; int length; - int ret = 0; + sense_reason_t ret = 0; int i; if (!cmd->data_length) { diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index fa5e157db47b..315ec3458eeb 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -125,8 +125,8 @@ void core_tmr_abort_task( if (dev != se_cmd->se_dev) continue; - /* skip se_cmd associated with tmr */ - if (tmr->task_cmd == se_cmd) + /* skip task management functions, including tmr->task_cmd */ + if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) continue; ref_tag = se_cmd->se_tfo->get_task_tag(se_cmd); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 0696de9553d3..47f064415bf6 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -672,7 +672,7 @@ static int core_tpg_setup_virtual_lun0(struct se_portal_group *se_tpg) } int core_tpg_register( - struct target_core_fabric_ops *tfo, + const struct target_core_fabric_ops *tfo, struct se_wwn *se_wwn, struct se_portal_group *se_tpg, void *tpg_fabric_ptr, diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ac3cbabdbdf0..3fe5cb240b6f 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -322,6 +322,7 @@ void __transport_register_session( struct se_session *se_sess, void *fabric_sess_ptr) { + const struct target_core_fabric_ops *tfo = se_tpg->se_tpg_tfo; unsigned char buf[PR_REG_ISID_LEN]; se_sess->se_tpg = se_tpg; @@ -334,6 +335,21 @@ void __transport_register_session( */ if (se_nacl) { /* + * + * Determine if fabric allows for T10-PI feature bits exposed to + * initiators for device backends with !dev->dev_attrib.pi_prot_type. + * + * If so, then always save prot_type on a per se_node_acl node + * basis and re-instate the previous sess_prot_type to avoid + * disabling PI from below any previously initiator side + * registered LUNs. + */ + if (se_nacl->saved_prot_type) + se_sess->sess_prot_type = se_nacl->saved_prot_type; + else if (tfo->tpg_check_prot_fabric_only) + se_sess->sess_prot_type = se_nacl->saved_prot_type = + tfo->tpg_check_prot_fabric_only(se_tpg); + /* * If the fabric module supports an ISID based TransportID, * save this value in binary from the fabric I_T Nexus now. 
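The save/restore above gives per-ACL state precedence: a prot_type remembered from an earlier session wins, and only a first-time login asks the fabric whether fabric-only PI is enabled for the portal group (writing the answer back into saved_prot_type). The precedence, restated with generic parameters for illustration:

static int pick_sess_prot(int saved, int (*fabric_only)(void *), void *tpg)
{
	if (saved)
		return saved;		/* earlier per-ACL value wins */
	return fabric_only ? fabric_only(tpg) : 0;
}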
*/ @@ -404,6 +420,30 @@ void target_put_session(struct se_session *se_sess) } EXPORT_SYMBOL(target_put_session); +ssize_t target_show_dynamic_sessions(struct se_portal_group *se_tpg, char *page) +{ + struct se_session *se_sess; + ssize_t len = 0; + + spin_lock_bh(&se_tpg->session_lock); + list_for_each_entry(se_sess, &se_tpg->tpg_sess_list, sess_list) { + if (!se_sess->se_node_acl) + continue; + if (!se_sess->se_node_acl->dynamic_node_acl) + continue; + if (strlen(se_sess->se_node_acl->initiatorname) + 1 + len > PAGE_SIZE) + break; + + len += snprintf(page + len, PAGE_SIZE - len, "%s\n", + se_sess->se_node_acl->initiatorname); + len += 1; /* Include NULL terminator */ + } + spin_unlock_bh(&se_tpg->session_lock); + + return len; +} +EXPORT_SYMBOL(target_show_dynamic_sessions); + static void target_complete_nacl(struct kref *kref) { struct se_node_acl *nacl = container_of(kref, @@ -462,7 +502,7 @@ EXPORT_SYMBOL(transport_free_session); void transport_deregister_session(struct se_session *se_sess) { struct se_portal_group *se_tpg = se_sess->se_tpg; - struct target_core_fabric_ops *se_tfo; + const struct target_core_fabric_ops *se_tfo; struct se_node_acl *se_nacl; unsigned long flags; bool comp_nacl = true; @@ -1118,7 +1158,7 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size) */ void transport_init_se_cmd( struct se_cmd *cmd, - struct target_core_fabric_ops *tfo, + const struct target_core_fabric_ops *tfo, struct se_session *se_sess, u32 data_length, int data_direction, @@ -1570,6 +1610,8 @@ EXPORT_SYMBOL(target_submit_tmr); * has completed. */ bool target_stop_cmd(struct se_cmd *cmd, unsigned long *flags) + __releases(&cmd->t_state_lock) + __acquires(&cmd->t_state_lock) { bool was_active = false; @@ -1615,11 +1657,11 @@ void transport_generic_request_failure(struct se_cmd *cmd, transport_complete_task_attr(cmd); /* * Handle special case for COMPARE_AND_WRITE failure, where the - * callback is expected to drop the per device ->caw_mutex. + * callback is expected to drop the per device ->caw_sem. */ if ((cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) && cmd->transport_complete_callback) - cmd->transport_complete_callback(cmd); + cmd->transport_complete_callback(cmd, false); switch (sense_reason) { case TCM_NON_EXISTENT_LUN: @@ -1706,6 +1748,41 @@ void __target_execute_cmd(struct se_cmd *cmd) } } +static int target_write_prot_action(struct se_cmd *cmd) +{ + u32 sectors; + /* + * Perform WRITE_INSERT of PI using software emulation when backend + * device has PI enabled, if the transport has not already generated + * PI using hardware WRITE_INSERT offload. 
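target_show_dynamic_sessions() above follows the standard bounded-emit pattern for single-page configfs buffers: stop before the entry whose name (plus newline) would overflow PAGE_SIZE. A simplified sketch that omits the session-list locking and the extra NUL accounting done above; names[]/count are illustrative stand-ins:

static ssize_t emit_names(char *page, const char *names[], int count)
{
	ssize_t len = 0;
	int i;

	for (i = 0; i < count; i++) {
		/* refuse any entry that would overflow the page */
		if (strlen(names[i]) + 1 + len > PAGE_SIZE)
			break;
		len += snprintf(page + len, PAGE_SIZE - len, "%s\n",
				names[i]);
	}
	return len;
}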
+ */ + switch (cmd->prot_op) { + case TARGET_PROT_DOUT_INSERT: + if (!(cmd->se_sess->sup_prot_ops & TARGET_PROT_DOUT_INSERT)) + sbc_dif_generate(cmd); + break; + case TARGET_PROT_DOUT_STRIP: + if (cmd->se_sess->sup_prot_ops & TARGET_PROT_DOUT_STRIP) + break; + + sectors = cmd->data_length >> ilog2(cmd->se_dev->dev_attrib.block_size); + cmd->pi_err = sbc_dif_verify_write(cmd, cmd->t_task_lba, + sectors, 0, NULL, 0); + if (unlikely(cmd->pi_err)) { + spin_lock_irq(&cmd->t_state_lock); + cmd->transport_state &= ~CMD_T_BUSY|CMD_T_SENT; + spin_unlock_irq(&cmd->t_state_lock); + transport_generic_request_failure(cmd, cmd->pi_err); + return -1; + } + break; + default: + break; + } + + return 0; +} + static bool target_handle_task_attr(struct se_cmd *cmd) { struct se_device *dev = cmd->se_dev; @@ -1785,15 +1862,9 @@ void target_execute_cmd(struct se_cmd *cmd) cmd->t_state = TRANSPORT_PROCESSING; cmd->transport_state |= CMD_T_ACTIVE|CMD_T_BUSY|CMD_T_SENT; spin_unlock_irq(&cmd->t_state_lock); - /* - * Perform WRITE_INSERT of PI using software emulation when backend - * device has PI enabled, if the transport has not already generated - * PI using hardware WRITE_INSERT offload. - */ - if (cmd->prot_op == TARGET_PROT_DOUT_INSERT) { - if (!(cmd->se_sess->sup_prot_ops & TARGET_PROT_DOUT_INSERT)) - sbc_dif_generate(cmd); - } + + if (target_write_prot_action(cmd)) + return; if (target_handle_task_attr(cmd)) { spin_lock_irq(&cmd->t_state_lock); @@ -1919,16 +1990,28 @@ static void transport_handle_queue_full( schedule_work(&cmd->se_dev->qf_work_queue); } -static bool target_check_read_strip(struct se_cmd *cmd) +static bool target_read_prot_action(struct se_cmd *cmd) { sense_reason_t rc; - if (!(cmd->se_sess->sup_prot_ops & TARGET_PROT_DIN_STRIP)) { - rc = sbc_dif_read_strip(cmd); - if (rc) { - cmd->pi_err = rc; - return true; + switch (cmd->prot_op) { + case TARGET_PROT_DIN_STRIP: + if (!(cmd->se_sess->sup_prot_ops & TARGET_PROT_DIN_STRIP)) { + rc = sbc_dif_read_strip(cmd); + if (rc) { + cmd->pi_err = rc; + return true; + } } + break; + case TARGET_PROT_DIN_INSERT: + if (cmd->se_sess->sup_prot_ops & TARGET_PROT_DIN_INSERT) + break; + + sbc_dif_generate(cmd); + break; + default: + break; } return false; @@ -1975,8 +2058,12 @@ static void target_complete_ok_work(struct work_struct *work) if (cmd->transport_complete_callback) { sense_reason_t rc; - rc = cmd->transport_complete_callback(cmd); + rc = cmd->transport_complete_callback(cmd, true); if (!rc && !(cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE_POST)) { + if ((cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) && + !cmd->data_length) + goto queue_rsp; + return; } else if (rc) { ret = transport_send_check_condition_and_sense(cmd, @@ -1990,6 +2077,7 @@ static void target_complete_ok_work(struct work_struct *work) } } +queue_rsp: switch (cmd->data_direction) { case DMA_FROM_DEVICE: spin_lock(&cmd->se_lun->lun_sep_lock); @@ -2003,8 +2091,7 @@ static void target_complete_ok_work(struct work_struct *work) * backend had PI enabled, if the transport will not be * performing hardware READ_STRIP offload. 
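Both prot-action paths use the same offload test: software PI emulation runs only when the fabric did not advertise the matching hardware operation in sup_prot_ops (the prot_op values are bit flags, so a plain mask works). As a one-line predicate, illustrative only:

static bool needs_sw_pi(struct se_cmd *cmd, enum target_prot_op op)
{
	/* emulate only if the fabric lacks the matching hardware op */
	return cmd->prot_op == op && !(cmd->se_sess->sup_prot_ops & op);
}

e.g. needs_sw_pi(cmd, TARGET_PROT_DOUT_INSERT) gates sbc_dif_generate() on the write path.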
*/ - if (cmd->prot_op == TARGET_PROT_DIN_STRIP && - target_check_read_strip(cmd)) { + if (target_read_prot_action(cmd)) { ret = transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0); if (ret == -EAGAIN || ret == -ENOMEM) @@ -2094,6 +2181,16 @@ static inline void transport_reset_sgl_orig(struct se_cmd *cmd) static inline void transport_free_pages(struct se_cmd *cmd) { if (cmd->se_cmd_flags & SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC) { + /* + * Release special case READ buffer payload required for + * SG_TO_MEM_NOALLOC to function with COMPARE_AND_WRITE + */ + if (cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) { + transport_free_sgl(cmd->t_bidi_data_sg, + cmd->t_bidi_data_nents); + cmd->t_bidi_data_sg = NULL; + cmd->t_bidi_data_nents = 0; + } transport_reset_sgl_orig(cmd); return; } @@ -2246,6 +2343,7 @@ sense_reason_t transport_generic_new_cmd(struct se_cmd *cmd) { int ret = 0; + bool zero_flag = !(cmd->se_cmd_flags & SCF_SCSI_DATA_CDB); /* * Determine is the TCM fabric module has already allocated physical @@ -2254,7 +2352,6 @@ transport_generic_new_cmd(struct se_cmd *cmd) */ if (!(cmd->se_cmd_flags & SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC) && cmd->data_length) { - bool zero_flag = !(cmd->se_cmd_flags & SCF_SCSI_DATA_CDB); if ((cmd->se_cmd_flags & SCF_BIDI) || (cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE)) { @@ -2285,6 +2382,20 @@ transport_generic_new_cmd(struct se_cmd *cmd) cmd->data_length, zero_flag); if (ret < 0) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + } else if ((cmd->se_cmd_flags & SCF_COMPARE_AND_WRITE) && + cmd->data_length) { + /* + * Special case for COMPARE_AND_WRITE with fabrics + * using SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC. + */ + u32 caw_length = cmd->t_task_nolb * + cmd->se_dev->dev_attrib.block_size; + + ret = target_alloc_sgl(&cmd->t_bidi_data_sg, + &cmd->t_bidi_data_nents, + caw_length, zero_flag); + if (ret < 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } /* * If this command is not a write we can execute it right here, @@ -2376,10 +2487,8 @@ int target_get_sess_cmd(struct se_session *se_sess, struct se_cmd *se_cmd, * fabric acknowledgement that requires two target_put_sess_cmd() * invocations before se_cmd descriptor release. 
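The COMPARE_AND_WRITE sizing above is worth spelling out: data_length covers both the compare and the write halves sent by the initiator, while the bidi READ buffer only needs one verify payload of NoLB blocks. As an illustrative helper:

static u32 caw_read_length(const struct se_cmd *cmd)
{
	return cmd->t_task_nolb * cmd->se_dev->dev_attrib.block_size;
}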
*/ - if (ack_kref) { + if (ack_kref) kref_get(&se_cmd->cmd_kref); - se_cmd->se_cmd_flags |= SCF_ACK_KREF; - } spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); if (se_sess->sess_tearing_down) { @@ -2398,6 +2507,7 @@ out: EXPORT_SYMBOL(target_get_sess_cmd); static void target_release_cmd_kref(struct kref *kref) + __releases(&se_cmd->se_sess->sess_cmd_lock) { struct se_cmd *se_cmd = container_of(kref, struct se_cmd, cmd_kref); struct se_session *se_sess = se_cmd->se_sess; diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 1a1bcf71ec9d..dbc872a6c981 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -344,8 +344,11 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) entry = (void *) mb + CMDR_OFF + cmd_head; tcmu_flush_dcache_range(entry, sizeof(*entry)); - tcmu_hdr_set_op(&entry->hdr, TCMU_OP_PAD); - tcmu_hdr_set_len(&entry->hdr, pad_size); + tcmu_hdr_set_op(&entry->hdr.len_op, TCMU_OP_PAD); + tcmu_hdr_set_len(&entry->hdr.len_op, pad_size); + entry->hdr.cmd_id = 0; /* not used for PAD */ + entry->hdr.kflags = 0; + entry->hdr.uflags = 0; UPDATE_HEAD(mb->cmd_head, pad_size, udev->cmdr_size); @@ -355,9 +358,11 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) entry = (void *) mb + CMDR_OFF + cmd_head; tcmu_flush_dcache_range(entry, sizeof(*entry)); - tcmu_hdr_set_op(&entry->hdr, TCMU_OP_CMD); - tcmu_hdr_set_len(&entry->hdr, command_size); - entry->cmd_id = tcmu_cmd->cmd_id; + tcmu_hdr_set_op(&entry->hdr.len_op, TCMU_OP_CMD); + tcmu_hdr_set_len(&entry->hdr.len_op, command_size); + entry->hdr.cmd_id = tcmu_cmd->cmd_id; + entry->hdr.kflags = 0; + entry->hdr.uflags = 0; /* * Fix up iovecs, and handle if allocation in data ring wrapped. @@ -376,7 +381,8 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) /* Even iov_base is relative to mb_addr */ iov->iov_len = copy_bytes; - iov->iov_base = (void *) udev->data_off + udev->data_head; + iov->iov_base = (void __user *) udev->data_off + + udev->data_head; iov_cnt++; iov++; @@ -388,7 +394,8 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) copy_bytes = sg->length - copy_bytes; iov->iov_len = copy_bytes; - iov->iov_base = (void *) udev->data_off + udev->data_head; + iov->iov_base = (void __user *) udev->data_off + + udev->data_head; if (se_cmd->data_direction == DMA_TO_DEVICE) { to = (void *) mb + udev->data_off + udev->data_head; @@ -405,6 +412,8 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) kunmap_atomic(from); } entry->req.iov_cnt = iov_cnt; + entry->req.iov_bidi_cnt = 0; + entry->req.iov_dif_cnt = 0; /* All offsets relative to mb_addr, not start of entry! 
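The hdr.len_op accessors used above pack the ring-entry opcode into the low bits of a single 32-bit field and the entry length into the rest, so a consumer can skip unknown entries by length alone. A sketch of that packing, assuming the uapi's TCMU_OP_MASK of 0x7 (entry lengths are multiples of mask + 1):

#define TCMU_OP_MASK	0x7

static u32 hdr_get_op(u32 len_op)  { return len_op & TCMU_OP_MASK; }
static u32 hdr_get_len(u32 len_op) { return len_op & ~TCMU_OP_MASK; }

static void hdr_set(u32 *len_op, u32 op, u32 len)
{
	/* length in the high bits, opcode in the low bits */
	*len_op = (len & ~TCMU_OP_MASK) | (op & TCMU_OP_MASK);
}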
*/ cdb_off = CMDR_OFF + cmd_head + base_command_size; @@ -462,6 +471,17 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * return; } + if (entry->hdr.uflags & TCMU_UFLAG_UNKNOWN_OP) { + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + pr_warn("TCMU: Userspace set UNKNOWN_OP flag on se_cmd %p\n", + cmd->se_cmd); + transport_generic_request_failure(cmd->se_cmd, + TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE); + cmd->se_cmd = NULL; + kmem_cache_free(tcmu_cmd_cache, cmd); + return; + } + if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer, se_cmd->scsi_sense_length); @@ -540,14 +560,16 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) tcmu_flush_dcache_range(entry, sizeof(*entry)); - if (tcmu_hdr_get_op(&entry->hdr) == TCMU_OP_PAD) { - UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + if (tcmu_hdr_get_op(entry->hdr.len_op) == TCMU_OP_PAD) { + UPDATE_HEAD(udev->cmdr_last_cleaned, + tcmu_hdr_get_len(entry->hdr.len_op), + udev->cmdr_size); continue; } - WARN_ON(tcmu_hdr_get_op(&entry->hdr) != TCMU_OP_CMD); + WARN_ON(tcmu_hdr_get_op(entry->hdr.len_op) != TCMU_OP_CMD); spin_lock(&udev->commands_lock); - cmd = idr_find(&udev->commands, entry->cmd_id); + cmd = idr_find(&udev->commands, entry->hdr.cmd_id); if (cmd) idr_remove(&udev->commands, cmd->cmd_id); spin_unlock(&udev->commands_lock); @@ -560,7 +582,9 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) tcmu_handle_completion(cmd, entry); - UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + UPDATE_HEAD(udev->cmdr_last_cleaned, + tcmu_hdr_get_len(entry->hdr.len_op), + udev->cmdr_size); handled++; } @@ -838,14 +862,14 @@ static int tcmu_configure_device(struct se_device *dev) udev->data_size = TCMU_RING_SIZE - CMDR_SIZE; mb = udev->mb_addr; - mb->version = 1; + mb->version = TCMU_MAILBOX_VERSION; mb->cmdr_off = CMDR_OFF; mb->cmdr_size = udev->cmdr_size; WARN_ON(!PAGE_ALIGNED(udev->data_off)); WARN_ON(udev->data_size % PAGE_SIZE); - info->version = "1"; + info->version = xstr(TCMU_MAILBOX_VERSION); info->mem[0].name = "tcm-user command & data buffer"; info->mem[0].addr = (phys_addr_t) udev->mb_addr; diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 33ac39bf75e5..a600ff15dcfd 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -34,20 +34,12 @@ #include <target/target_core_fabric.h> #include <target/target_core_configfs.h> +#include "target_core_internal.h" #include "target_core_pr.h" #include "target_core_ua.h" #include "target_core_xcopy.h" static struct workqueue_struct *xcopy_wq = NULL; -/* - * From target_core_device.c - */ -extern struct mutex g_device_mutex; -extern struct list_head g_device_list; -/* - * From target_core_configfs.c - */ -extern struct configfs_subsystem *target_core_subsystem[]; static int target_xcopy_gen_naa_ieee(struct se_device *dev, unsigned char *buf) { @@ -433,7 +425,7 @@ static int xcopy_pt_queue_status(struct se_cmd *se_cmd) return 0; } -static struct target_core_fabric_ops xcopy_pt_tfo = { +static const struct target_core_fabric_ops xcopy_pt_tfo = { .get_fabric_name = xcopy_pt_get_fabric_name, .get_task_tag = xcopy_pt_get_tag, .get_cmd_state = xcopy_pt_get_cmd_state, @@ -548,33 +540,22 @@ static void target_xcopy_setup_pt_port( } } -static int target_xcopy_init_pt_lun( - struct xcopy_pt_cmd *xpt_cmd, - struct xcopy_op *xop, 
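The completion loop above keeps consuming entries until cmdr_last_cleaned catches up with the producer, advancing the head by each entry's length. The wrap arithmetic behind UPDATE_HEAD amounts to modular addition over the ring size, roughly:

static inline u32 ring_advance(u32 head, u32 used, u32 size)
{
	return (head + used) % size;	/* head indices wrap at ring end */
}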
- struct se_device *se_dev, - struct se_cmd *pt_cmd, - bool remote_port) +static void target_xcopy_init_pt_lun(struct se_device *se_dev, + struct se_cmd *pt_cmd, bool remote_port) { /* * Don't allocate + init an pt_cmd->se_lun if honoring local port for * reservations. The pt_cmd->se_lun pointer will be setup from within * target_xcopy_setup_pt_port() */ - if (!remote_port) { - pt_cmd->se_cmd_flags |= SCF_SE_LUN_CMD | SCF_CMD_XCOPY_PASSTHROUGH; - return 0; + if (remote_port) { + pr_debug("Setup emulated se_dev: %p from se_dev\n", + pt_cmd->se_dev); + pt_cmd->se_lun = &se_dev->xcopy_lun; + pt_cmd->se_dev = se_dev; } - pt_cmd->se_lun = &se_dev->xcopy_lun; - pt_cmd->se_dev = se_dev; - - pr_debug("Setup emulated se_dev: %p from se_dev\n", pt_cmd->se_dev); - pt_cmd->se_cmd_flags |= SCF_SE_LUN_CMD | SCF_CMD_XCOPY_PASSTHROUGH; - - pr_debug("Setup emulated se_dev: %p to pt_cmd->se_lun->lun_se_dev\n", - pt_cmd->se_lun->lun_se_dev); - - return 0; + pt_cmd->se_cmd_flags |= SCF_SE_LUN_CMD; } static int target_xcopy_setup_pt_cmd( @@ -592,11 +573,8 @@ static int target_xcopy_setup_pt_cmd( * Setup LUN+port to honor reservations based upon xop->op_origin for * X-COPY PUSH or X-COPY PULL based upon where the CDB was received. */ - rc = target_xcopy_init_pt_lun(xpt_cmd, xop, se_dev, cmd, remote_port); - if (rc < 0) { - ret = rc; - goto out; - } + target_xcopy_init_pt_lun(se_dev, cmd, remote_port); + xpt_cmd->xcopy_op = xop; target_xcopy_setup_pt_port(xpt_cmd, xop, remote_port); diff --git a/drivers/target/tcm_fc/tcm_fc.h b/drivers/target/tcm_fc/tcm_fc.h index a0bcfd3e7e7d..881deb3d499a 100644 --- a/drivers/target/tcm_fc/tcm_fc.h +++ b/drivers/target/tcm_fc/tcm_fc.h @@ -129,7 +129,6 @@ struct ft_cmd { extern struct mutex ft_lport_lock; extern struct fc4_prov ft_prov; -extern struct target_fabric_configfs *ft_configfs; extern unsigned int ft_debug_logging; /* diff --git a/drivers/target/tcm_fc/tfc_conf.c b/drivers/target/tcm_fc/tfc_conf.c index efdcb9663a1a..65dce1345966 100644 --- a/drivers/target/tcm_fc/tfc_conf.c +++ b/drivers/target/tcm_fc/tfc_conf.c @@ -48,7 +48,7 @@ #include "tcm_fc.h" -struct target_fabric_configfs *ft_configfs; +static const struct target_core_fabric_ops ft_fabric_ops; static LIST_HEAD(ft_wwn_list); DEFINE_MUTEX(ft_lport_lock); @@ -337,7 +337,7 @@ static struct se_portal_group *ft_add_tpg( return NULL; } - ret = core_tpg_register(&ft_configfs->tf_ops, wwn, &tpg->se_tpg, + ret = core_tpg_register(&ft_fabric_ops, wwn, &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) { destroy_workqueue(wq); @@ -507,7 +507,9 @@ static u32 ft_tpg_get_inst_index(struct se_portal_group *se_tpg) return tpg->index; } -static struct target_core_fabric_ops ft_fabric_ops = { +static const struct target_core_fabric_ops ft_fabric_ops = { + .module = THIS_MODULE, + .name = "fc", .get_fabric_name = ft_get_fabric_name, .get_fabric_proto_ident = fc_get_fabric_proto_ident, .tpg_get_wwn = ft_get_fabric_wwn, @@ -552,62 +554,10 @@ static struct target_core_fabric_ops ft_fabric_ops = { .fabric_drop_np = NULL, .fabric_make_nodeacl = &ft_add_acl, .fabric_drop_nodeacl = &ft_del_acl, -}; - -static int ft_register_configfs(void) -{ - struct target_fabric_configfs *fabric; - int ret; - - /* - * Register the top level struct config_item_type with TCM core - */ - fabric = target_fabric_configfs_init(THIS_MODULE, "fc"); - if (IS_ERR(fabric)) { - pr_err("%s: target_fabric_configfs_init() failed!\n", - __func__); - return PTR_ERR(fabric); - } - fabric->tf_ops = ft_fabric_ops; - - /* - * Setup default attribute lists for 
various fabric->tf_cit_tmpl - */ - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = ft_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = - ft_nacl_base_attrs; - fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - /* - * register the fabric for use within TCM - */ - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - pr_debug("target_fabric_configfs_register() for" - " FC Target failed!\n"); - target_fabric_configfs_free(fabric); - return -1; - } - - /* - * Setup our local pointer to *fabric. - */ - ft_configfs = fabric; - return 0; -} -static void ft_deregister_configfs(void) -{ - if (!ft_configfs) - return; - target_fabric_configfs_deregister(ft_configfs); - ft_configfs = NULL; -} + .tfc_wwn_attrs = ft_wwn_attrs, + .tfc_tpg_nacl_base_attrs = ft_nacl_base_attrs, +}; static struct notifier_block ft_notifier = { .notifier_call = ft_lport_notify @@ -615,15 +565,24 @@ static struct notifier_block ft_notifier = { static int __init ft_init(void) { - if (ft_register_configfs()) - return -1; - if (fc_fc4_register_provider(FC_TYPE_FCP, &ft_prov)) { - ft_deregister_configfs(); - return -1; - } + int ret; + + ret = target_register_template(&ft_fabric_ops); + if (ret) + goto out; + + ret = fc_fc4_register_provider(FC_TYPE_FCP, &ft_prov); + if (ret) + goto out_unregister_template; + blocking_notifier_chain_register(&fc_lport_notifier_head, &ft_notifier); fc_lport_iterate(ft_lport_add, NULL); return 0; + +out_unregister_template: + target_unregister_template(&ft_fabric_ops); +out: + return ret; } static void __exit ft_exit(void) @@ -632,7 +591,7 @@ static void __exit ft_exit(void) &ft_notifier); fc_fc4_deregister_provider(FC_TYPE_FCP, &ft_prov); fc_lport_iterate(ft_lport_del, NULL); - ft_deregister_configfs(); + target_unregister_template(&ft_fabric_ops); synchronize_rcu(); } diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 422ebea96a64..4506e405c8f3 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -450,6 +450,18 @@ static unsigned int mem32_serial_in(struct uart_port *p, int offset) return readl(p->membase + offset); } +static void mem32be_serial_out(struct uart_port *p, int offset, int value) +{ + offset = offset << p->regshift; + iowrite32be(value, p->membase + offset); +} + +static unsigned int mem32be_serial_in(struct uart_port *p, int offset) +{ + offset = offset << p->regshift; + return ioread32be(p->membase + offset); +} + static unsigned int io_serial_in(struct uart_port *p, int offset) { offset = offset << p->regshift; @@ -488,6 +500,11 @@ static void set_io_from_upio(struct uart_port *p) p->serial_out = mem32_serial_out; break; + case UPIO_MEM32BE: + p->serial_in = mem32be_serial_in; + p->serial_out = mem32be_serial_out; + break; + #if defined(CONFIG_MIPS_ALCHEMY) || defined(CONFIG_SERIAL_8250_RT288X) case UPIO_AU: p->serial_in = au_serial_in; @@ -513,6 +530,7 @@ serial_port_out_sync(struct uart_port *p, int offset, int value) switch (p->iotype) { case UPIO_MEM: case UPIO_MEM32: + case UPIO_MEM32BE: case UPIO_AU: p->serial_out(p, offset, value); p->serial_in(p, UART_LCR); /* safe, no side-effects */ @@ -2748,6 +2766,7 @@ 
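ft_init() above also switches to the conventional reverse-order unwind: register steps in order and, on failure, undo only what already succeeded. The shape of the idiom, with placeholder step functions (not TCM API):

static int register_two(int (*a)(void), void (*undo_a)(void),
			int (*b)(void))
{
	int ret = a();

	if (ret)
		return ret;
	ret = b();
	if (ret)
		undo_a();	/* unwind in reverse order */
	return ret;
}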
static int serial8250_request_std_resource(struct uart_8250_port *up) case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: + case UPIO_MEM32BE: case UPIO_MEM: if (!port->mapbase) break; @@ -2784,6 +2803,7 @@ static void serial8250_release_std_resource(struct uart_8250_port *up) case UPIO_AU: case UPIO_TSI: case UPIO_MEM32: + case UPIO_MEM32BE: case UPIO_MEM: if (!port->mapbase) break; diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index 8e119682266a..6c0fd8b9d1c3 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -42,6 +42,8 @@ unsigned int __weak __init serial8250_early_in(struct uart_port *port, int offse return readb(port->membase + offset); case UPIO_MEM32: return readl(port->membase + (offset << 2)); + case UPIO_MEM32BE: + return ioread32be(port->membase + (offset << 2)); case UPIO_PORT: return inb(port->iobase + offset); default: @@ -58,6 +60,9 @@ void __weak __init serial8250_early_out(struct uart_port *port, int offset, int case UPIO_MEM32: writel(value, port->membase + (offset << 2)); break; + case UPIO_MEM32BE: + iowrite32be(value, port->membase + (offset << 2)); + break; case UPIO_PORT: outb(value, port->iobase + offset); break; diff --git a/drivers/tty/serial/of_serial.c b/drivers/tty/serial/of_serial.c index aa00154c4a6d..5b73afb9f9f3 100644 --- a/drivers/tty/serial/of_serial.c +++ b/drivers/tty/serial/of_serial.c @@ -116,7 +116,8 @@ static int of_platform_serial_setup(struct platform_device *ofdev, port->iotype = UPIO_MEM; break; case 4: - port->iotype = UPIO_MEM32; + port->iotype = of_device_is_big_endian(np) ? + UPIO_MEM32BE : UPIO_MEM32; break; default: dev_warn(&ofdev->dev, "unsupported reg-io-width (%d)\n", diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 6af58c6dba5e..2030565c6789 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1505,7 +1505,7 @@ static void destroy_ep_files (struct dev_data *dev) list_del_init (&ep->epfiles); dentry = ep->dentry; ep->dentry = NULL; - parent = dentry->d_parent->d_inode; + parent = d_inode(dentry->d_parent); /* break link to controller */ if (ep->state == STATE_EP_ENABLED) diff --git a/drivers/usb/gadget/legacy/tcm_usb_gadget.c b/drivers/usb/gadget/legacy/tcm_usb_gadget.c index 6e0a019aad54..8b80addc4ce6 100644 --- a/drivers/usb/gadget/legacy/tcm_usb_gadget.c +++ b/drivers/usb/gadget/legacy/tcm_usb_gadget.c @@ -29,7 +29,7 @@ USB_GADGET_COMPOSITE_OPTIONS(); -static struct target_fabric_configfs *usbg_fabric_configfs; +static const struct target_core_fabric_ops usbg_ops; static inline struct f_uas *to_f_uas(struct usb_function *f) { @@ -1572,8 +1572,7 @@ static struct se_portal_group *usbg_make_tpg( tpg->tport = tport; tpg->tport_tpgt = tpgt; - ret = core_tpg_register(&usbg_fabric_configfs->tf_ops, wwn, - &tpg->se_tpg, tpg, + ret = core_tpg_register(&usbg_ops, wwn, &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) { destroy_workqueue(tpg->workqueue); @@ -1864,7 +1863,9 @@ static int usbg_check_stop_free(struct se_cmd *se_cmd) return 1; } -static struct target_core_fabric_ops usbg_ops = { +static const struct target_core_fabric_ops usbg_ops = { + .module = THIS_MODULE, + .name = "usb_gadget", .get_fabric_name = usbg_get_fabric_name, .get_fabric_proto_ident = usbg_get_fabric_proto_ident, .tpg_get_wwn = usbg_get_fabric_wwn, @@ -1906,46 +1907,9 @@ static struct target_core_fabric_ops usbg_ops = { .fabric_drop_np = NULL, .fabric_make_nodeacl = usbg_make_nodeacl, 
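The of_serial change above ties the two pieces together: reg-io-width selects the access size, and the device tree's endianness flips the 32-bit case to the ioread32be/iowrite32be accessors added to 8250_core. A sketch of the mapping (other widths and warnings elided):

static int pick_iotype(struct device_node *np, u32 reg_io_width)
{
	switch (reg_io_width) {
	case 1:
		return UPIO_MEM;
	case 4:
		return of_device_is_big_endian(np) ? UPIO_MEM32BE
						   : UPIO_MEM32;
	default:
		return -EINVAL;
	}
}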
.fabric_drop_nodeacl = usbg_drop_nodeacl, -}; - -static int usbg_register_configfs(void) -{ - struct target_fabric_configfs *fabric; - int ret; - - fabric = target_fabric_configfs_init(THIS_MODULE, "usb_gadget"); - if (IS_ERR(fabric)) { - printk(KERN_ERR "target_fabric_configfs_init() failed\n"); - return PTR_ERR(fabric); - } - - fabric->tf_ops = usbg_ops; - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = usbg_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = usbg_base_attrs; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - printk(KERN_ERR "target_fabric_configfs_register() failed" - " for usb-gadget\n"); - return ret; - } - usbg_fabric_configfs = fabric; - return 0; -}; -static void usbg_deregister_configfs(void) -{ - if (!(usbg_fabric_configfs)) - return; - - target_fabric_configfs_deregister(usbg_fabric_configfs); - usbg_fabric_configfs = NULL; + .tfc_wwn_attrs = usbg_wwn_attrs, + .tfc_tpg_base_attrs = usbg_base_attrs, }; /* Start gadget.c code */ @@ -2454,16 +2418,13 @@ static void usbg_detach(struct usbg_tpg *tpg) static int __init usb_target_gadget_init(void) { - int ret; - - ret = usbg_register_configfs(); - return ret; + return target_register_template(&usbg_ops); } module_init(usb_target_gadget_init); static void __exit usb_target_gadget_exit(void) { - usbg_deregister_configfs(); + target_unregister_template(&usbg_ops); } module_exit(usb_target_gadget_exit); diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 71df240a467a..5e19bb53b3a9 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -131,6 +131,8 @@ struct vhost_scsi_tpg { int tv_tpg_port_count; /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ int tv_tpg_vhost_count; + /* Used for enabling T10-PI with legacy devices */ + int tv_fabric_prot_type; /* list for vhost_scsi_list */ struct list_head tv_tpg_list; /* Used to protect access for tpg_nexus */ @@ -214,9 +216,7 @@ struct vhost_scsi { int vs_events_nr; /* num of pending events, protected by vq->mutex */ }; -/* Local pointer to allocated TCM configfs fabric module */ -static struct target_fabric_configfs *vhost_scsi_fabric_configfs; - +static struct target_core_fabric_ops vhost_scsi_ops; static struct workqueue_struct *vhost_scsi_workqueue; /* Global spinlock to protect vhost_scsi TPG list for vhost IOCTL access */ @@ -431,6 +431,14 @@ vhost_scsi_parse_pr_out_transport_id(struct se_portal_group *se_tpg, port_nexus_ptr); } +static int vhost_scsi_check_prot_fabric_only(struct se_portal_group *se_tpg) +{ + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + + return tpg->tv_fabric_prot_type; +} + static struct se_node_acl * vhost_scsi_alloc_fabric_acl(struct se_portal_group *se_tpg) { @@ -1878,6 +1886,45 @@ static void vhost_scsi_free_cmd_map_res(struct vhost_scsi_nexus *nexus, } } +static ssize_t vhost_scsi_tpg_attrib_store_fabric_prot_type( + struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + unsigned long val; + int ret = kstrtoul(page, 0, 
&val); + + if (ret) { + pr_err("kstrtoul() returned %d for fabric_prot_type\n", ret); + return ret; + } + if (val != 0 && val != 1 && val != 3) { + pr_err("Invalid vhost_scsi fabric_prot_type: %lu\n", val); + return -EINVAL; + } + tpg->tv_fabric_prot_type = val; + + return count; +} + +static ssize_t vhost_scsi_tpg_attrib_show_fabric_prot_type( + struct se_portal_group *se_tpg, + char *page) +{ + struct vhost_scsi_tpg *tpg = container_of(se_tpg, + struct vhost_scsi_tpg, se_tpg); + + return sprintf(page, "%d\n", tpg->tv_fabric_prot_type); +} +TF_TPG_ATTRIB_ATTR(vhost_scsi, fabric_prot_type, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = { + &vhost_scsi_tpg_attrib_fabric_prot_type.attr, + NULL, +}; + static int vhost_scsi_make_nexus(struct vhost_scsi_tpg *tpg, const char *name) { @@ -2155,7 +2202,7 @@ vhost_scsi_make_tpg(struct se_wwn *wwn, tpg->tport = tport; tpg->tport_tpgt = tpgt; - ret = core_tpg_register(&vhost_scsi_fabric_configfs->tf_ops, wwn, + ret = core_tpg_register(&vhost_scsi_ops, wwn, &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) { kfree(tpg); @@ -2277,6 +2324,8 @@ static struct configfs_attribute *vhost_scsi_wwn_attrs[] = { }; static struct target_core_fabric_ops vhost_scsi_ops = { + .module = THIS_MODULE, + .name = "vhost", .get_fabric_name = vhost_scsi_get_fabric_name, .get_fabric_proto_ident = vhost_scsi_get_fabric_proto_ident, .tpg_get_wwn = vhost_scsi_get_fabric_wwn, @@ -2289,6 +2338,7 @@ static struct target_core_fabric_ops vhost_scsi_ops = { .tpg_check_demo_mode_cache = vhost_scsi_check_true, .tpg_check_demo_mode_write_protect = vhost_scsi_check_false, .tpg_check_prod_mode_write_protect = vhost_scsi_check_false, + .tpg_check_prot_fabric_only = vhost_scsi_check_prot_fabric_only, .tpg_alloc_fabric_acl = vhost_scsi_alloc_fabric_acl, .tpg_release_fabric_acl = vhost_scsi_release_fabric_acl, .tpg_get_inst_index = vhost_scsi_tpg_get_inst_index, @@ -2320,70 +2370,20 @@ static struct target_core_fabric_ops vhost_scsi_ops = { .fabric_drop_np = NULL, .fabric_make_nodeacl = vhost_scsi_make_nodeacl, .fabric_drop_nodeacl = vhost_scsi_drop_nodeacl, + + .tfc_wwn_attrs = vhost_scsi_wwn_attrs, + .tfc_tpg_base_attrs = vhost_scsi_tpg_attrs, + .tfc_tpg_attrib_attrs = vhost_scsi_tpg_attrib_attrs, }; -static int vhost_scsi_register_configfs(void) +static int __init vhost_scsi_init(void) { - struct target_fabric_configfs *fabric; - int ret; + int ret = -ENOMEM; - pr_debug("vhost-scsi fabric module %s on %s/%s" + pr_debug("TCM_VHOST fabric module %s on %s/%s" " on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname, utsname()->machine); - /* - * Register the top level struct config_item_type with TCM core - */ - fabric = target_fabric_configfs_init(THIS_MODULE, "vhost"); - if (IS_ERR(fabric)) { - pr_err("target_fabric_configfs_init() failed\n"); - return PTR_ERR(fabric); - } - /* - * Setup fabric->tf_ops from our local vhost_scsi_ops - */ - fabric->tf_ops = vhost_scsi_ops; - /* - * Setup default attribute lists for various fabric->tf_cit_tmpl - */ - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = vhost_scsi_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = vhost_scsi_tpg_attrs; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; 
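The store handler above follows the usual configfs validation pattern; note that DIF TYPE2 (value 2) is deliberately rejected, so only 0 (off), 1 (TYPE1) and 3 (TYPE3) parse. Condensed into a hypothetical helper:

static int parse_fabric_prot_type(const char *page, int *out)
{
	unsigned long val;
	int ret = kstrtoul(page, 0, &val);

	if (ret)
		return ret;
	if (val != 0 && val != 1 && val != 3)
		return -EINVAL;	/* TYPE2 is not exposed here */
	*out = val;
	return 0;
}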
- fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - /* - * Register the fabric for use within TCM - */ - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - pr_err("target_fabric_configfs_register() failed" - " for TCM_VHOST\n"); - return ret; - } - /* - * Setup our local pointer to *fabric - */ - vhost_scsi_fabric_configfs = fabric; - pr_debug("TCM_VHOST[0] - Set fabric -> vhost_scsi_fabric_configfs\n"); - return 0; -}; - -static void vhost_scsi_deregister_configfs(void) -{ - if (!vhost_scsi_fabric_configfs) - return; - - target_fabric_configfs_deregister(vhost_scsi_fabric_configfs); - vhost_scsi_fabric_configfs = NULL; - pr_debug("TCM_VHOST[0] - Cleared vhost_scsi_fabric_configfs\n"); -}; -static int __init vhost_scsi_init(void) -{ - int ret = -ENOMEM; /* * Use our own dedicated workqueue for submitting I/O into * target core to avoid contention within system_wq. @@ -2396,7 +2396,7 @@ static int __init vhost_scsi_init(void) if (ret < 0) goto out_destroy_workqueue; - ret = vhost_scsi_register_configfs(); + ret = target_register_template(&vhost_scsi_ops); if (ret < 0) goto out_vhost_scsi_deregister; @@ -2412,7 +2412,7 @@ out: static void vhost_scsi_exit(void) { - vhost_scsi_deregister_configfs(); + target_unregister_template(&vhost_scsi_ops); vhost_scsi_deregister(); destroy_workqueue(vhost_scsi_workqueue); }; diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index b546da5d8ea3..cab9f3f63a38 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -48,6 +48,16 @@ config VIRTIO_BALLOON If unsure, say M. +config VIRTIO_INPUT + tristate "Virtio input driver" + depends on VIRTIO + depends on INPUT + ---help--- + This driver supports virtio input devices such as + keyboards, mice and tablets. + + If unsure, say M. + config VIRTIO_MMIO tristate "Platform bus driver for memory mapped virtio devices" depends on HAS_IOMEM diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index d85565b8ea46..41e30e3dc842 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o +obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 5ce2aa48fc6e..b1877d73fa56 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -278,12 +278,6 @@ static struct bus_type virtio_bus = { .remove = virtio_dev_remove, }; -bool virtio_device_is_legacy_only(struct virtio_device_id id) -{ - return id.device == VIRTIO_ID_BALLOON; -} -EXPORT_SYMBOL_GPL(virtio_device_is_legacy_only); - int register_virtio_driver(struct virtio_driver *driver) { /* Catch this early. 
*/ diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 6a356e344f82..82e80e034f25 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -214,8 +214,8 @@ static inline void update_stat(struct virtio_balloon *vb, int idx, u16 tag, u64 val) { BUG_ON(idx >= VIRTIO_BALLOON_S_NR); - vb->stats[idx].tag = tag; - vb->stats[idx].val = val; + vb->stats[idx].tag = cpu_to_virtio16(vb->vdev, tag); + vb->stats[idx].val = cpu_to_virtio64(vb->vdev, val); } #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) @@ -283,18 +283,27 @@ static void virtballoon_changed(struct virtio_device *vdev) static inline s64 towards_target(struct virtio_balloon *vb) { - __le32 v; s64 target; + u32 num_pages; - virtio_cread(vb->vdev, struct virtio_balloon_config, num_pages, &v); + virtio_cread(vb->vdev, struct virtio_balloon_config, num_pages, + &num_pages); - target = le32_to_cpu(v); + /* Legacy balloon config space is LE, unlike all other devices. */ + if (!virtio_has_feature(vb->vdev, VIRTIO_F_VERSION_1)) + num_pages = le32_to_cpu((__force __le32)num_pages); + + target = num_pages; return target - vb->num_pages; } static void update_balloon_size(struct virtio_balloon *vb) { - __le32 actual = cpu_to_le32(vb->num_pages); + u32 actual = vb->num_pages; + + /* Legacy balloon config space is LE, unlike all other devices. */ + if (!virtio_has_feature(vb->vdev, VIRTIO_F_VERSION_1)) + actual = (__force u32)cpu_to_le32(actual); virtio_cwrite(vb->vdev, struct virtio_balloon_config, actual, &actual); diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c new file mode 100644 index 000000000000..60e2a1677563 --- /dev/null +++ b/drivers/virtio/virtio_input.c @@ -0,0 +1,384 @@ +#include <linux/module.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/input.h> + +#include <uapi/linux/virtio_ids.h> +#include <uapi/linux/virtio_input.h> + +struct virtio_input { + struct virtio_device *vdev; + struct input_dev *idev; + char name[64]; + char serial[64]; + char phys[64]; + struct virtqueue *evt, *sts; + struct virtio_input_event evts[64]; + spinlock_t lock; + bool ready; +}; + +static void virtinput_queue_evtbuf(struct virtio_input *vi, + struct virtio_input_event *evtbuf) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, evtbuf, sizeof(*evtbuf)); + virtqueue_add_inbuf(vi->evt, sg, 1, evtbuf, GFP_ATOMIC); +} + +static void virtinput_recv_events(struct virtqueue *vq) +{ + struct virtio_input *vi = vq->vdev->priv; + struct virtio_input_event *event; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vi->lock, flags); + if (vi->ready) { + while ((event = virtqueue_get_buf(vi->evt, &len)) != NULL) { + spin_unlock_irqrestore(&vi->lock, flags); + input_event(vi->idev, + le16_to_cpu(event->type), + le16_to_cpu(event->code), + le32_to_cpu(event->value)); + spin_lock_irqsave(&vi->lock, flags); + virtinput_queue_evtbuf(vi, event); + } + virtqueue_kick(vq); + } + spin_unlock_irqrestore(&vi->lock, flags); +} + +/* + * On error we are losing the status update, which isn't critical as + * this is typically used for stuff like keyboard leds. 
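The balloon fixes above all stem from one rule: legacy virtio config space is little-endian, while VIRTIO_F_VERSION_1 devices use guest-native byte order (and the stats fields convert per-feature via cpu_to_virtio16/64). So only the legacy case needs an explicit swap, e.g.:

static u32 balloon_cfg_to_cpu(struct virtio_device *vdev, u32 raw)
{
	/* legacy balloon config space is LE, unlike v1 devices */
	if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		return le32_to_cpu((__force __le32)raw);
	return raw;	/* v1: already guest-native */
}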
+ */ +static int virtinput_send_status(struct virtio_input *vi, + u16 type, u16 code, s32 value) +{ + struct virtio_input_event *stsbuf; + struct scatterlist sg[1]; + unsigned long flags; + int rc; + + stsbuf = kzalloc(sizeof(*stsbuf), GFP_ATOMIC); + if (!stsbuf) + return -ENOMEM; + + stsbuf->type = cpu_to_le16(type); + stsbuf->code = cpu_to_le16(code); + stsbuf->value = cpu_to_le32(value); + sg_init_one(sg, stsbuf, sizeof(*stsbuf)); + + spin_lock_irqsave(&vi->lock, flags); + if (vi->ready) { + rc = virtqueue_add_outbuf(vi->sts, sg, 1, stsbuf, GFP_ATOMIC); + virtqueue_kick(vi->sts); + } else { + rc = -ENODEV; + } + spin_unlock_irqrestore(&vi->lock, flags); + + if (rc != 0) + kfree(stsbuf); + return rc; +} + +static void virtinput_recv_status(struct virtqueue *vq) +{ + struct virtio_input *vi = vq->vdev->priv; + struct virtio_input_event *stsbuf; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vi->lock, flags); + while ((stsbuf = virtqueue_get_buf(vi->sts, &len)) != NULL) + kfree(stsbuf); + spin_unlock_irqrestore(&vi->lock, flags); +} + +static int virtinput_status(struct input_dev *idev, unsigned int type, + unsigned int code, int value) +{ + struct virtio_input *vi = input_get_drvdata(idev); + + return virtinput_send_status(vi, type, code, value); +} + +static u8 virtinput_cfg_select(struct virtio_input *vi, + u8 select, u8 subsel) +{ + u8 size; + + virtio_cwrite(vi->vdev, struct virtio_input_config, select, &select); + virtio_cwrite(vi->vdev, struct virtio_input_config, subsel, &subsel); + virtio_cread(vi->vdev, struct virtio_input_config, size, &size); + return size; +} + +static void virtinput_cfg_bits(struct virtio_input *vi, int select, int subsel, + unsigned long *bits, unsigned int bitcount) +{ + unsigned int bit; + u8 *virtio_bits; + u8 bytes; + + bytes = virtinput_cfg_select(vi, select, subsel); + if (!bytes) + return; + if (bitcount > bytes * 8) + bitcount = bytes * 8; + + /* + * Bitmap in virtio config space is a simple stream of bytes, + * with the first byte carrying bits 0-7, second bits 8-15 and + * so on. 
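The status path above treats each LED/repeat update as a one-shot, device-bound buffer: allocate, queue, and let the completion handler free it, so a failed send only loses indicator state. Condensed below; this mirrors virtinput_send_status() minus the ready check and locking:

static int send_one_shot(struct virtqueue *sts, u16 type, u16 code, s32 value)
{
	struct scatterlist sg;
	struct virtio_input_event *buf = kzalloc(sizeof(*buf), GFP_ATOMIC);
	int rc;

	if (!buf)
		return -ENOMEM;
	buf->type = cpu_to_le16(type);
	buf->code = cpu_to_le16(code);
	buf->value = cpu_to_le32(value);
	sg_init_one(&sg, buf, sizeof(*buf));
	rc = virtqueue_add_outbuf(sts, &sg, 1, buf, GFP_ATOMIC);
	if (rc) {
		kfree(buf);	/* otherwise freed on completion */
		return rc;
	}
	virtqueue_kick(sts);
	return 0;
}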
+ */ + virtio_bits = kzalloc(bytes, GFP_KERNEL); + if (!virtio_bits) + return; + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.bitmap), + virtio_bits, bytes); + for (bit = 0; bit < bitcount; bit++) { + if (virtio_bits[bit / 8] & (1 << (bit % 8))) + __set_bit(bit, bits); + } + kfree(virtio_bits); + + if (select == VIRTIO_INPUT_CFG_EV_BITS) + __set_bit(subsel, vi->idev->evbit); +} + +static void virtinput_cfg_abs(struct virtio_input *vi, int abs) +{ + u32 mi, ma, re, fu, fl; + + virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ABS_INFO, abs); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.min, &mi); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.max, &ma); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.res, &re); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.fuzz, &fu); + virtio_cread(vi->vdev, struct virtio_input_config, u.abs.flat, &fl); + input_set_abs_params(vi->idev, abs, mi, ma, fu, fl); + input_abs_set_res(vi->idev, abs, re); +} + +static int virtinput_init_vqs(struct virtio_input *vi) +{ + struct virtqueue *vqs[2]; + vq_callback_t *cbs[] = { virtinput_recv_events, + virtinput_recv_status }; + static const char *names[] = { "events", "status" }; + int err; + + err = vi->vdev->config->find_vqs(vi->vdev, 2, vqs, cbs, names); + if (err) + return err; + vi->evt = vqs[0]; + vi->sts = vqs[1]; + + return 0; +} + +static void virtinput_fill_evt(struct virtio_input *vi) +{ + unsigned long flags; + int i, size; + + spin_lock_irqsave(&vi->lock, flags); + size = virtqueue_get_vring_size(vi->evt); + if (size > ARRAY_SIZE(vi->evts)) + size = ARRAY_SIZE(vi->evts); + for (i = 0; i < size; i++) + virtinput_queue_evtbuf(vi, &vi->evts[i]); + virtqueue_kick(vi->evt); + spin_unlock_irqrestore(&vi->lock, flags); +} + +static int virtinput_probe(struct virtio_device *vdev) +{ + struct virtio_input *vi; + unsigned long flags; + size_t size; + int abs, err; + + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) + return -ENODEV; + + vi = kzalloc(sizeof(*vi), GFP_KERNEL); + if (!vi) + return -ENOMEM; + + vdev->priv = vi; + vi->vdev = vdev; + spin_lock_init(&vi->lock); + + err = virtinput_init_vqs(vi); + if (err) + goto err_init_vq; + + vi->idev = input_allocate_device(); + if (!vi->idev) { + err = -ENOMEM; + goto err_input_alloc; + } + input_set_drvdata(vi->idev, vi); + + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_NAME, 0); + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.string), + vi->name, min(size, sizeof(vi->name))); + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_SERIAL, 0); + virtio_cread_bytes(vi->vdev, offsetof(struct virtio_input_config, + u.string), + vi->serial, min(size, sizeof(vi->serial))); + snprintf(vi->phys, sizeof(vi->phys), + "virtio%d/input0", vdev->index); + vi->idev->name = vi->name; + vi->idev->phys = vi->phys; + vi->idev->uniq = vi->serial; + + size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ID_DEVIDS, 0); + if (size >= sizeof(struct virtio_input_devids)) { + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.bustype, &vi->idev->id.bustype); + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.vendor, &vi->idev->id.vendor); + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.product, &vi->idev->id.product); + virtio_cread(vi->vdev, struct virtio_input_config, + u.ids.version, &vi->idev->id.version); + } else { + vi->idev->id.bustype = BUS_VIRTUAL; + } + + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_PROP_BITS, 0, + vi->idev->propbit, INPUT_PROP_CNT); + size = 
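Given that byte layout, testing bit n of the config bitmap means byte n/8, mask 1 << (n % 8) — which is what the loop above does before mirroring each set bit into the input core's bitmaps:

static bool cfg_bit_set(const u8 *bytes, unsigned int bit)
{
	return bytes[bit / 8] & (1 << (bit % 8));
}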
virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REP); + if (size) + __set_bit(EV_REP, vi->idev->evbit); + + vi->idev->dev.parent = &vdev->dev; + vi->idev->event = virtinput_status; + + /* device -> kernel */ + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_KEY, + vi->idev->keybit, KEY_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_REL, + vi->idev->relbit, REL_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_ABS, + vi->idev->absbit, ABS_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_MSC, + vi->idev->mscbit, MSC_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SW, + vi->idev->swbit, SW_CNT); + + /* kernel -> device */ + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_LED, + vi->idev->ledbit, LED_CNT); + virtinput_cfg_bits(vi, VIRTIO_INPUT_CFG_EV_BITS, EV_SND, + vi->idev->sndbit, SND_CNT); + + if (test_bit(EV_ABS, vi->idev->evbit)) { + for (abs = 0; abs < ABS_CNT; abs++) { + if (!test_bit(abs, vi->idev->absbit)) + continue; + virtinput_cfg_abs(vi, abs); + } + } + + virtio_device_ready(vdev); + vi->ready = true; + err = input_register_device(vi->idev); + if (err) + goto err_input_register; + + virtinput_fill_evt(vi); + return 0; + +err_input_register: + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + input_free_device(vi->idev); +err_input_alloc: + vdev->config->del_vqs(vdev); +err_init_vq: + kfree(vi); + return err; +} + +static void virtinput_remove(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + unsigned long flags; + + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + + input_unregister_device(vi->idev); + vdev->config->del_vqs(vdev); + kfree(vi); +} + +#ifdef CONFIG_PM_SLEEP +static int virtinput_freeze(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + unsigned long flags; + + spin_lock_irqsave(&vi->lock, flags); + vi->ready = false; + spin_unlock_irqrestore(&vi->lock, flags); + + vdev->config->del_vqs(vdev); + return 0; +} + +static int virtinput_restore(struct virtio_device *vdev) +{ + struct virtio_input *vi = vdev->priv; + int err; + + err = virtinput_init_vqs(vi); + if (err) + return err; + + virtio_device_ready(vdev); + vi->ready = true; + virtinput_fill_evt(vi); + return 0; +} +#endif + +static unsigned int features[] = { + /* none */ +}; +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_INPUT, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_input_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .id_table = id_table, + .probe = virtinput_probe, + .remove = virtinput_remove, +#ifdef CONFIG_PM_SLEEP + .freeze = virtinput_freeze, + .restore = virtinput_restore, +#endif +}; + +module_virtio_driver(virtio_input_driver); +MODULE_DEVICE_TABLE(virtio, id_table); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Virtio input device driver"); +MODULE_AUTHOR("Gerd Hoffmann <kraxel@redhat.com>"); diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 6010d7ec0a0f..7a5e60dea6c5 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -581,14 +581,6 @@ static int virtio_mmio_probe(struct platform_device *pdev) } vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); - /* Reject legacy-only IDs for version 2 devices */ - if (vm_dev->version == 2 && - 
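Every teardown path above (probe failure, remove, freeze) first clears the ready flag under the lock; because the virtqueue callbacks test ready under the same lock, this fences them before the queues go away. The pattern in isolation:

static void virtinput_quiesce(struct virtio_input *vi)
{
	unsigned long flags;

	spin_lock_irqsave(&vi->lock, flags);
	vi->ready = false;	/* callbacks now refuse further work */
	spin_unlock_irqrestore(&vi->lock, flags);
}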
virtio_device_is_legacy_only(vm_dev->vdev.id)) { - dev_err(&pdev->dev, "Version 2 not supported for devices %u!\n", - vm_dev->vdev.id.device); - return -ENODEV; - } - if (vm_dev->version == 1) writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 2aa38e59db2e..e88e0997a889 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -20,6 +20,50 @@ #define VIRTIO_PCI_NO_LEGACY #include "virtio_pci_common.h" +/* + * Type-safe wrappers for io accesses. + * Use these to enforce at compile time the following spec requirement: + * + * The driver MUST access each field using the “natural” access + * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses + * for 16-bit fields and 8-bit accesses for 8-bit fields. + */ +static inline u8 vp_ioread8(u8 __iomem *addr) +{ + return ioread8(addr); +} +static inline u16 vp_ioread16 (u16 __iomem *addr) +{ + return ioread16(addr); +} + +static inline u32 vp_ioread32(u32 __iomem *addr) +{ + return ioread32(addr); +} + +static inline void vp_iowrite8(u8 value, u8 __iomem *addr) +{ + iowrite8(value, addr); +} + +static inline void vp_iowrite16(u16 value, u16 __iomem *addr) +{ + iowrite16(value, addr); +} + +static inline void vp_iowrite32(u32 value, u32 __iomem *addr) +{ + iowrite32(value, addr); +} + +static void vp_iowrite64_twopart(u64 val, + __le32 __iomem *lo, __le32 __iomem *hi) +{ + vp_iowrite32((u32)val, lo); + vp_iowrite32(val >> 32, hi); +} + static void __iomem *map_capability(struct pci_dev *dev, int off, size_t minlen, u32 align, @@ -94,22 +138,16 @@ static void __iomem *map_capability(struct pci_dev *dev, int off, return p; } -static void iowrite64_twopart(u64 val, __le32 __iomem *lo, __le32 __iomem *hi) -{ - iowrite32((u32)val, lo); - iowrite32(val >> 32, hi); -} - /* virtio config->get_features() implementation */ static u64 vp_get_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); u64 features; - iowrite32(0, &vp_dev->common->device_feature_select); - features = ioread32(&vp_dev->common->device_feature); - iowrite32(1, &vp_dev->common->device_feature_select); - features |= ((u64)ioread32(&vp_dev->common->device_feature) << 32); + vp_iowrite32(0, &vp_dev->common->device_feature_select); + features = vp_ioread32(&vp_dev->common->device_feature); + vp_iowrite32(1, &vp_dev->common->device_feature_select); + features |= ((u64)vp_ioread32(&vp_dev->common->device_feature) << 32); return features; } @@ -128,10 +166,10 @@ static int vp_finalize_features(struct virtio_device *vdev) return -EINVAL; } - iowrite32(0, &vp_dev->common->guest_feature_select); - iowrite32((u32)vdev->features, &vp_dev->common->guest_feature); - iowrite32(1, &vp_dev->common->guest_feature_select); - iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature); + vp_iowrite32(0, &vp_dev->common->guest_feature_select); + vp_iowrite32((u32)vdev->features, &vp_dev->common->guest_feature); + vp_iowrite32(1, &vp_dev->common->guest_feature_select); + vp_iowrite32(vdev->features >> 32, &vp_dev->common->guest_feature); return 0; } @@ -210,14 +248,14 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, static u32 vp_generation(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - return ioread8(&vp_dev->common->config_generation); + return vp_ioread8(&vp_dev->common->config_generation); } /* config->{get,set}_status() implementations */ static u8 
vp_get_status(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - return ioread8(&vp_dev->common->device_status); + return vp_ioread8(&vp_dev->common->device_status); } static void vp_set_status(struct virtio_device *vdev, u8 status) @@ -225,17 +263,17 @@ static void vp_set_status(struct virtio_device *vdev, u8 status) struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* We should never be setting status to 0. */ BUG_ON(status == 0); - iowrite8(status, &vp_dev->common->device_status); + vp_iowrite8(status, &vp_dev->common->device_status); } static void vp_reset(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* 0 status means a reset. */ - iowrite8(0, &vp_dev->common->device_status); + vp_iowrite8(0, &vp_dev->common->device_status); /* Flush out the status write, and flush in device writes, * including MSI-X interrupts, if any. */ - ioread8(&vp_dev->common->device_status); + vp_ioread8(&vp_dev->common->device_status); /* Flush pending VQ/configuration callbacks. */ vp_synchronize_vectors(vdev); } @@ -243,10 +281,10 @@ static void vp_reset(struct virtio_device *vdev) static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) { /* Setup the vector used for configuration events */ - iowrite16(vector, &vp_dev->common->msix_config); + vp_iowrite16(vector, &vp_dev->common->msix_config); /* Verify we had enough resources to assign the vector */ /* Will also flush the write out to device */ - return ioread16(&vp_dev->common->msix_config); + return vp_ioread16(&vp_dev->common->msix_config); } static size_t vring_pci_size(u16 num) @@ -286,15 +324,15 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, u16 num, off; int err; - if (index >= ioread16(&cfg->num_queues)) + if (index >= vp_ioread16(&cfg->num_queues)) return ERR_PTR(-ENOENT); /* Select the queue we're interested in */ - iowrite16(index, &cfg->queue_select); + vp_iowrite16(index, &cfg->queue_select); /* Check if queue is either not available or already active. 
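The vp_ioread*/vp_iowrite* wrappers used throughout these accessors exist purely to move the spec's width rule to compile time: because each wrapper takes a pointer of the matching width, handing a 16-bit field to a 32-bit accessor becomes an incompatible-pointer-type diagnostic instead of a silent wrong-sized access. A userspace analogue of that idea (not the kernel helpers themselves):

    #include <stdint.h>
    #include <stdio.h>

    /* Width-typed accessors: the pointer type encodes the access size. */
    static inline uint16_t rd16(const volatile uint16_t *addr) { return *addr; }
    static inline uint32_t rd32(const volatile uint32_t *addr) { return *addr; }

    struct regs {
        uint16_t queue_size;    /* 16-bit field: only rd16() accepts it */
        uint16_t queue_select;
        uint32_t features;      /* 32-bit field: only rd32() accepts it */
    };

    int main(void)
    {
        struct regs r = { .queue_size = 256, .queue_select = 0, .features = 0x11 };

        printf("queue_size=%u features=0x%x\n",
               (unsigned)rd16(&r.queue_size), (unsigned)rd32(&r.features));
        /* rd32(&r.queue_size); -- rejected: incompatible pointer type */
        return 0;
    }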
*/ - num = ioread16(&cfg->queue_size); - if (!num || ioread16(&cfg->queue_enable)) + num = vp_ioread16(&cfg->queue_size); + if (!num || vp_ioread16(&cfg->queue_enable)) return ERR_PTR(-ENOENT); if (num & (num - 1)) { @@ -303,7 +341,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } /* get offset of notification word for this vq */ - off = ioread16(&cfg->queue_notify_off); + off = vp_ioread16(&cfg->queue_notify_off); info->num = num; info->msix_vector = msix_vec; @@ -322,13 +360,13 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } /* activate the queue */ - iowrite16(num, &cfg->queue_size); - iowrite64_twopart(virt_to_phys(info->queue), - &cfg->queue_desc_lo, &cfg->queue_desc_hi); - iowrite64_twopart(virt_to_phys(virtqueue_get_avail(vq)), - &cfg->queue_avail_lo, &cfg->queue_avail_hi); - iowrite64_twopart(virt_to_phys(virtqueue_get_used(vq)), - &cfg->queue_used_lo, &cfg->queue_used_hi); + vp_iowrite16(num, &cfg->queue_size); + vp_iowrite64_twopart(virt_to_phys(info->queue), + &cfg->queue_desc_lo, &cfg->queue_desc_hi); + vp_iowrite64_twopart(virt_to_phys(virtqueue_get_avail(vq)), + &cfg->queue_avail_lo, &cfg->queue_avail_hi); + vp_iowrite64_twopart(virt_to_phys(virtqueue_get_used(vq)), + &cfg->queue_used_lo, &cfg->queue_used_hi); if (vp_dev->notify_base) { /* offset should not wrap */ @@ -357,8 +395,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } if (msix_vec != VIRTIO_MSI_NO_VECTOR) { - iowrite16(msix_vec, &cfg->queue_msix_vector); - msix_vec = ioread16(&cfg->queue_msix_vector); + vp_iowrite16(msix_vec, &cfg->queue_msix_vector); + msix_vec = vp_ioread16(&cfg->queue_msix_vector); if (msix_vec == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto err_assign_vector; @@ -393,8 +431,8 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, * this, there's no way to go back except reset. */ list_for_each_entry(vq, &vdev->vqs, list) { - iowrite16(vq->index, &vp_dev->common->queue_select); - iowrite16(1, &vp_dev->common->queue_enable); + vp_iowrite16(vq->index, &vp_dev->common->queue_select); + vp_iowrite16(1, &vp_dev->common->queue_enable); } return 0; @@ -405,13 +443,13 @@ static void del_vq(struct virtio_pci_vq_info *info) struct virtqueue *vq = info->vq; struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); - iowrite16(vq->index, &vp_dev->common->queue_select); + vp_iowrite16(vq->index, &vp_dev->common->queue_select); if (vp_dev->msix_enabled) { - iowrite16(VIRTIO_MSI_NO_VECTOR, - &vp_dev->common->queue_msix_vector); + vp_iowrite16(VIRTIO_MSI_NO_VECTOR, + &vp_dev->common->queue_msix_vector); /* Flush the write out to device */ - ioread16(&vp_dev->common->queue_msix_vector); + vp_ioread16(&vp_dev->common->queue_msix_vector); } if (!vp_dev->notify_base) @@ -577,9 +615,6 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) } vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor; - if (virtio_device_is_legacy_only(vp_dev->vdev.id)) - return -ENODEV; - /* check for a common config: if not, use legacy mode (bar 0). 
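setup_vq() programs each 64-bit ring address through a pair of 32-bit registers, low half first, via vp_iowrite64_twopart(). The arithmetic, reduced to standalone C:

    #include <stdint.h>
    #include <stdio.h>

    /* Split a 64-bit value across two 32-bit registers, low word first. */
    static void write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
    {
        *lo = (uint32_t)val;
        *hi = (uint32_t)(val >> 32);
    }

    int main(void)
    {
        uint32_t lo = 0, hi = 0;

        write64_twopart(0x123456789abcdef0ULL, &lo, &hi);
        printf("lo=0x%08x hi=0x%08x\n", lo, hi);  /* lo=0x9abcdef0 hi=0x12345678 */
        return 0;
    }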
*/ common = virtio_pci_find_capability(pci_dev, VIRTIO_PCI_CAP_COMMON_CFG, IORESOURCE_IO | IORESOURCE_MEM); diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index ce4f3a7f95fd..e5e7c5505de7 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -169,7 +169,6 @@ config AT91SAM9X_WATCHDOG config CADENCE_WATCHDOG tristate "Cadence Watchdog Timer" - depends on ARM select WATCHDOG_CORE help Say Y here if you want to include support for the watchdog @@ -1190,6 +1189,7 @@ config OCTEON_WDT tristate "Cavium OCTEON SOC family Watchdog Timer" depends on CAVIUM_OCTEON_SOC default y + select WATCHDOG_CORE select EXPORT_UASM if OCTEON_WDT = m help Hardware driver for OCTEON's on chip watchdog timer. diff --git a/drivers/watchdog/bcm_kona_wdt.c b/drivers/watchdog/bcm_kona_wdt.c index 4e37db3539a4..22d8ae65772a 100644 --- a/drivers/watchdog/bcm_kona_wdt.c +++ b/drivers/watchdog/bcm_kona_wdt.c @@ -99,12 +99,14 @@ static int secure_register_read(struct bcm_kona_wdt *wdt, uint32_t offset) static int bcm_kona_wdt_dbg_show(struct seq_file *s, void *data) { - int ctl_val, cur_val, ret; + int ctl_val, cur_val; unsigned long flags; struct bcm_kona_wdt *wdt = s->private; - if (!wdt) - return seq_puts(s, "No device pointer\n"); + if (!wdt) { + seq_puts(s, "No device pointer\n"); + return 0; + } spin_lock_irqsave(&wdt->lock, flags); ctl_val = secure_register_read(wdt, SECWDOG_CTRL_REG); @@ -112,7 +114,7 @@ static int bcm_kona_wdt_dbg_show(struct seq_file *s, void *data) spin_unlock_irqrestore(&wdt->lock, flags); if (ctl_val < 0 || cur_val < 0) { - ret = seq_puts(s, "Error accessing hardware\n"); + seq_puts(s, "Error accessing hardware\n"); } else { int ctl, cur, ctl_sec, cur_sec, res; @@ -121,15 +123,18 @@ static int bcm_kona_wdt_dbg_show(struct seq_file *s, void *data) cur = cur_val & SECWDOG_COUNT_MASK; ctl_sec = TICKS_TO_SECS(ctl, wdt); cur_sec = TICKS_TO_SECS(cur, wdt); - ret = seq_printf(s, "Resolution: %d / %d\n" - "Control: %d s / %d (%#x) ticks\n" - "Current: %d s / %d (%#x) ticks\n" - "Busy count: %lu\n", res, - wdt->resolution, ctl_sec, ctl, ctl, cur_sec, - cur, cur, wdt->busy_count); + seq_printf(s, + "Resolution: %d / %d\n" + "Control: %d s / %d (%#x) ticks\n" + "Current: %d s / %d (%#x) ticks\n" + "Busy count: %lu\n", + res, wdt->resolution, + ctl_sec, ctl, ctl, + cur_sec, cur, cur, + wdt->busy_count); } - return ret; + return 0; } static int bcm_kona_dbg_open(struct inode *inode, struct file *file) diff --git a/drivers/watchdog/octeon-wdt-main.c b/drivers/watchdog/octeon-wdt-main.c index 8453531545df..14521c8b3d5a 100644 --- a/drivers/watchdog/octeon-wdt-main.c +++ b/drivers/watchdog/octeon-wdt-main.c @@ -3,6 +3,8 @@ * * Copyright (C) 2007, 2008, 2009, 2010 Cavium Networks * + * Converted to use WATCHDOG_CORE by Aaro Koskinen <aaro.koskinen@iki.fi>. + * * Some parts derived from wdt.c * * (c) Copyright 1996-1997 Alan Cox <alan@lxorguk.ukuu.org.uk>, @@ -103,13 +105,10 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); -static unsigned long octeon_wdt_is_open; -static char expect_close; - -static u32 __initdata nmi_stage1_insns[64]; +static u32 nmi_stage1_insns[64] __initdata; /* We need one branch and therefore one relocation per target label. 
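The bcm_kona_wdt hunk above adapts to seq_puts()/seq_printf() no longer returning a byte count: a debugfs show() callback now signals success by returning 0 and reports hardware trouble in the output text itself. A toy stdio version of that contract, with FILE * standing in for the seq_file:

    #include <stdio.h>

    /* A show()-style callback: success is 0 even when the *content* is an
     * error message; the printing functions' return values are unused. */
    static int wdt_show(FILE *s, const int *hw_value)
    {
        if (!hw_value) {
            fputs("Error accessing hardware\n", s);
            return 0;           /* the show itself still succeeded */
        }
        fprintf(s, "Current: %d ticks\n", *hw_value);
        return 0;
    }

    int main(void)
    {
        int v = 1234;

        wdt_show(stdout, &v);
        wdt_show(stdout, NULL);
        return 0;
    }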
*/ -static struct uasm_label __initdata labels[5]; -static struct uasm_reloc __initdata relocs[5]; +static struct uasm_label labels[5] __initdata; +static struct uasm_reloc relocs[5] __initdata; enum lable_id { label_enter_bootloader = 1 @@ -218,7 +217,8 @@ static void __init octeon_wdt_build_stage1(void) pr_debug("\t.set pop\n"); if (len > 32) - panic("NMI stage 1 handler exceeds 32 instructions, was %d\n", len); + panic("NMI stage 1 handler exceeds 32 instructions, was %d\n", + len); } static int cpu2core(int cpu) @@ -294,6 +294,7 @@ static void octeon_wdt_write_hex(u64 value, int digits) { int d; int v; + for (d = 0; d < digits; d++) { v = (value >> ((digits - d - 1) * 4)) & 0xf; if (v >= 10) @@ -303,7 +304,7 @@ static void octeon_wdt_write_hex(u64 value, int digits) } } -const char *reg_name[] = { +static const char reg_name[][3] = { "$0", "at", "v0", "v1", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "t0", "t1", "t2", "t3", "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", @@ -444,7 +445,7 @@ static int octeon_wdt_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static void octeon_wdt_ping(void) +static int octeon_wdt_ping(struct watchdog_device __always_unused *wdog) { int cpu; int coreid; @@ -457,10 +458,12 @@ static void octeon_wdt_ping(void) !cpumask_test_cpu(cpu, &irq_enabled_cpus)) { /* We have to enable the irq */ int irq = OCTEON_IRQ_WDOG0 + coreid; + enable_irq(irq); cpumask_set_cpu(cpu, &irq_enabled_cpus); } } + return 0; } static void octeon_wdt_calc_parameters(int t) @@ -489,7 +492,8 @@ static void octeon_wdt_calc_parameters(int t) timeout_cnt = ((octeon_get_io_clock_rate() >> 8) * timeout_sec) >> 8; } -static int octeon_wdt_set_heartbeat(int t) +static int octeon_wdt_set_timeout(struct watchdog_device *wdog, + unsigned int t) { int cpu; int coreid; @@ -509,158 +513,45 @@ static int octeon_wdt_set_heartbeat(int t) cvmx_write_csr(CVMX_CIU_WDOGX(coreid), ciu_wdog.u64); cvmx_write_csr(CVMX_CIU_PP_POKEX(coreid), 1); } - octeon_wdt_ping(); /* Get the irqs back on. */ + octeon_wdt_ping(wdog); /* Get the irqs back on. */ return 0; } -/** - * octeon_wdt_write: - * @file: file handle to the watchdog - * @buf: buffer to write (unused as data does not matter here - * @count: count of bytes - * @ppos: pointer to the position to write. No seeks allowed - * - * A write to a watchdog device is defined as a keepalive signal. Any - * write of data will do, as we we don't define content meaning. - */ - -static ssize_t octeon_wdt_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (count) { - if (!nowayout) { - size_t i; - - /* In case it was set long ago */ - expect_close = 0; - - for (i = 0; i != count; i++) { - char c; - if (get_user(c, buf + i)) - return -EFAULT; - if (c == 'V') - expect_close = 1; - } - } - octeon_wdt_ping(); - } - return count; -} - -/** - * octeon_wdt_ioctl: - * @file: file handle to the device - * @cmd: watchdog command - * @arg: argument pointer - * - * The watchdog API defines a common set of functions for all - * watchdogs according to their available features. We only - * actually usefully support querying capabilities and setting - * the timeout. 
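Turning reg_name from an array of string pointers into a two-dimensional char array drops one pointer per entry (every MIPS mnemonic here is at most two characters plus NUL) and keeps the whole table in read-only data. The size difference is measurable in plain C:

    #include <stdio.h>

    /* Two ways to store short register mnemonics: a pointer table costs a
     * pointer per entry on top of the string bytes; a 2-D char array
     * stores exactly 3 bytes per entry. */
    static const char *as_pointers[] = { "$0", "at", "v0", "v1" };
    static const char as_array[][3]  = { "$0", "at", "v0", "v1" };

    int main(void)
    {
        printf("pointer table: %zu bytes (+ string storage)\n", sizeof(as_pointers));
        printf("char table:    %zu bytes total\n", sizeof(as_array));
        return 0;
    }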
- */ - -static long octeon_wdt_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int __user *p = argp; - int new_heartbeat; - - static struct watchdog_info ident = { - .options = WDIOF_SETTIMEOUT| - WDIOF_MAGICCLOSE| - WDIOF_KEEPALIVEPING, - .firmware_version = 1, - .identity = "OCTEON", - }; - - switch (cmd) { - case WDIOC_GETSUPPORT: - return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0; - case WDIOC_GETSTATUS: - case WDIOC_GETBOOTSTATUS: - return put_user(0, p); - case WDIOC_KEEPALIVE: - octeon_wdt_ping(); - return 0; - case WDIOC_SETTIMEOUT: - if (get_user(new_heartbeat, p)) - return -EFAULT; - if (octeon_wdt_set_heartbeat(new_heartbeat)) - return -EINVAL; - /* Fall through. */ - case WDIOC_GETTIMEOUT: - return put_user(heartbeat, p); - default: - return -ENOTTY; - } -} - -/** - * octeon_wdt_open: - * @inode: inode of device - * @file: file handle to device - * - * The watchdog device has been opened. The watchdog device is single - * open and on opening we do a ping to reset the counters. - */ - -static int octeon_wdt_open(struct inode *inode, struct file *file) +static int octeon_wdt_start(struct watchdog_device *wdog) { - if (test_and_set_bit(0, &octeon_wdt_is_open)) - return -EBUSY; - /* - * Activate - */ - octeon_wdt_ping(); + octeon_wdt_ping(wdog); do_coundown = 1; - return nonseekable_open(inode, file); + return 0; } -/** - * octeon_wdt_release: - * @inode: inode to board - * @file: file handle to board - * - * The watchdog has a configurable API. There is a religious dispute - * between people who want their watchdog to be able to shut down and - * those who want to be sure if the watchdog manager dies the machine - * reboots. In the former case we disable the counters, in the latter - * case you have to open it again very soon. - */ - -static int octeon_wdt_release(struct inode *inode, struct file *file) +static int octeon_wdt_stop(struct watchdog_device *wdog) { - if (expect_close) { - do_coundown = 0; - octeon_wdt_ping(); - } else { - pr_crit("WDT device closed unexpectedly. WDT will not stop!\n"); - } - clear_bit(0, &octeon_wdt_is_open); - expect_close = 0; + do_coundown = 0; + octeon_wdt_ping(wdog); return 0; } -static const struct file_operations octeon_wdt_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .write = octeon_wdt_write, - .unlocked_ioctl = octeon_wdt_ioctl, - .open = octeon_wdt_open, - .release = octeon_wdt_release, +static struct notifier_block octeon_wdt_cpu_notifier = { + .notifier_call = octeon_wdt_cpu_callback, }; -static struct miscdevice octeon_wdt_miscdev = { - .minor = WATCHDOG_MINOR, - .name = "watchdog", - .fops = &octeon_wdt_fops, +static const struct watchdog_info octeon_wdt_info = { + .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING, + .identity = "OCTEON", }; -static struct notifier_block octeon_wdt_cpu_notifier = { - .notifier_call = octeon_wdt_cpu_callback, +static const struct watchdog_ops octeon_wdt_ops = { + .owner = THIS_MODULE, + .start = octeon_wdt_start, + .stop = octeon_wdt_stop, + .ping = octeon_wdt_ping, + .set_timeout = octeon_wdt_set_timeout, }; +static struct watchdog_device octeon_wdt = { + .info = &octeon_wdt_info, + .ops = &octeon_wdt_ops, +}; /** * Module/ driver initialization. 
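Everything the removed file_operations plumbing did by hand, including magic-close bookkeeping, the WDIOC_* ioctls, and the misc device itself, is now supplied by the watchdog core, which dispatches through the const octeon_wdt_ops table above. A generic sketch of that ops-table shape (types and names invented for illustration, not the kernel's):

    #include <stdio.h>

    /* The core holds a const table of callbacks; the driver only fills
     * in behavior, never the dispatch machinery. */
    struct wd_ops {
        int (*start)(void);
        int (*stop)(void);
        int (*ping)(void);
    };

    static int my_start(void) { puts("start: countdown armed");    return 0; }
    static int my_stop(void)  { puts("stop: countdown disarmed");  return 0; }
    static int my_ping(void)  { puts("ping: pokes every core");    return 0; }

    static const struct wd_ops ops = {
        .start = my_start,
        .stop  = my_stop,
        .ping  = my_ping,
    };

    int main(void)
    {
        ops.start();
        ops.ping();
        ops.stop();
        return 0;
    }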
@@ -685,7 +576,8 @@ static int __init octeon_wdt_init(void) max_timeout_sec = 6; do { max_timeout_sec--; - timeout_cnt = ((octeon_get_io_clock_rate() >> 8) * max_timeout_sec) >> 8; + timeout_cnt = ((octeon_get_io_clock_rate() >> 8) * + max_timeout_sec) >> 8; } while (timeout_cnt > 65535); BUG_ON(timeout_cnt == 0); @@ -694,11 +586,15 @@ static int __init octeon_wdt_init(void) pr_info("Initial granularity %d Sec\n", timeout_sec); - ret = misc_register(&octeon_wdt_miscdev); + octeon_wdt.timeout = timeout_sec; + octeon_wdt.max_timeout = UINT_MAX; + + watchdog_set_nowayout(&octeon_wdt, nowayout); + + ret = watchdog_register_device(&octeon_wdt); if (ret) { - pr_err("cannot register miscdev on minor=%d (err=%d)\n", - WATCHDOG_MINOR, ret); - goto out; + pr_err("watchdog_register_device() failed: %d\n", ret); + return ret; } /* Build the NMI handler ... */ @@ -721,8 +617,7 @@ static int __init octeon_wdt_init(void) __register_hotcpu_notifier(&octeon_wdt_cpu_notifier); cpu_notifier_register_done(); -out: - return ret; + return 0; } /** @@ -732,7 +627,7 @@ static void __exit octeon_wdt_cleanup(void) { int cpu; - misc_deregister(&octeon_wdt_miscdev); + watchdog_unregister_device(&octeon_wdt); cpu_notifier_register_begin(); __unregister_hotcpu_notifier(&octeon_wdt_cpu_notifier); diff --git a/drivers/watchdog/pnx4008_wdt.c b/drivers/watchdog/pnx4008_wdt.c index 55e220150103..b9c6049c3e78 100644 --- a/drivers/watchdog/pnx4008_wdt.c +++ b/drivers/watchdog/pnx4008_wdt.c @@ -216,7 +216,7 @@ static struct platform_driver platform_wdt_driver = { module_platform_driver(platform_wdt_driver); MODULE_AUTHOR("MontaVista Software, Inc. <source@mvista.com>"); -MODULE_AUTHOR("Wolfram Sang <w.sang@pengutronix.de>"); +MODULE_AUTHOR("Wolfram Sang <kernel@pengutronix.de>"); MODULE_DESCRIPTION("PNX4008 Watchdog Driver"); module_param(heartbeat, uint, 0); diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c index aa85618c4d03..aa03ca8f2d9b 100644 --- a/drivers/watchdog/qcom-wdt.c +++ b/drivers/watchdog/qcom-wdt.c @@ -20,9 +20,9 @@ #include <linux/reboot.h> #include <linux/watchdog.h> -#define WDT_RST 0x0 -#define WDT_EN 0x8 -#define WDT_BITE_TIME 0x24 +#define WDT_RST 0x38 +#define WDT_EN 0x40 +#define WDT_BITE_TIME 0x5C struct qcom_wdt { struct watchdog_device wdd; @@ -117,6 +117,8 @@ static int qcom_wdt_probe(struct platform_device *pdev) { struct qcom_wdt *wdt; struct resource *res; + struct device_node *np = pdev->dev.of_node; + u32 percpu_offset; int ret; wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL); @@ -124,6 +126,14 @@ static int qcom_wdt_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + /* We use CPU0's DGT for the watchdog */ + if (of_property_read_u32(np, "cpu-offset", &percpu_offset)) + percpu_offset = 0; + + res->start += percpu_offset; + res->end += percpu_offset; + wdt->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(wdt->base)) return PTR_ERR(wdt->base); @@ -203,9 +213,8 @@ static int qcom_wdt_remove(struct platform_device *pdev) } static const struct of_device_id qcom_wdt_of_table[] = { - { .compatible = "qcom,kpss-wdt-msm8960", }, - { .compatible = "qcom,kpss-wdt-apq8064", }, - { .compatible = "qcom,kpss-wdt-ipq8064", }, + { .compatible = "qcom,kpss-timer" }, + { .compatible = "qcom,scss-timer" }, { }, }; MODULE_DEVICE_TABLE(of, qcom_wdt_of_table); diff --git a/drivers/watchdog/stmp3xxx_rtc_wdt.c b/drivers/watchdog/stmp3xxx_rtc_wdt.c index a62b1b6decf4..e7f0d5b60d3d 100644 --- 
a/drivers/watchdog/stmp3xxx_rtc_wdt.c +++ b/drivers/watchdog/stmp3xxx_rtc_wdt.c @@ -1,7 +1,7 @@ /* * Watchdog driver for the RTC based watchdog in STMP3xxx and i.MX23/28 * - * Author: Wolfram Sang <w.sang@pengutronix.de> + * Author: Wolfram Sang <kernel@pengutronix.de> * * Copyright (C) 2011-12 Wolfram Sang, Pengutronix * @@ -129,4 +129,4 @@ module_platform_driver(stmp3xxx_wdt_driver); MODULE_DESCRIPTION("STMP3XXX RTC Watchdog Driver"); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Wolfram Sang <w.sang@pengutronix.de>"); +MODULE_AUTHOR("Wolfram Sang <kernel@pengutronix.de>"); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index a270004c9605..7cd226da15fe 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -276,4 +276,8 @@ config XEN_AUTO_XLATE help Support for auto-translated physmap guests. +config XEN_ACPI + def_bool y + depends on X86 && ACPI + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 40edd1cbb60d..e293bc507cbc 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_efi.o += -fshort-wchar dom0-$(CONFIG_PCI) += pci.o dom0-$(CONFIG_USB_SUPPORT) += dbgp.o -dom0-$(CONFIG_ACPI) += acpi.o $(xen-pad-y) +dom0-$(CONFIG_XEN_ACPI) += acpi.o $(xen-pad-y) xen-pad-$(CONFIG_X86) += xen-acpi-pad.o dom0-$(CONFIG_X86) += pcpu.o obj-$(CONFIG_XEN_DOM0) += $(dom0-y) diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 07ef38325223..b7f51504f85a 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -204,8 +204,7 @@ static LIST_HEAD(scsiback_free_pages); static DEFINE_MUTEX(scsiback_mutex); static LIST_HEAD(scsiback_list); -/* Local pointer to allocated TCM configfs fabric module */ -static struct target_fabric_configfs *scsiback_fabric_configfs; +static const struct target_core_fabric_ops scsiback_ops; static void scsiback_get(struct vscsibk_info *info) { @@ -1902,7 +1901,7 @@ scsiback_make_tpg(struct se_wwn *wwn, tpg->tport = tport; tpg->tport_tpgt = tpgt; - ret = core_tpg_register(&scsiback_fabric_configfs->tf_ops, wwn, + ret = core_tpg_register(&scsiback_ops, wwn, &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); if (ret < 0) { kfree(tpg); @@ -1944,7 +1943,9 @@ static int scsiback_check_false(struct se_portal_group *se_tpg) return 0; } -static struct target_core_fabric_ops scsiback_ops = { +static const struct target_core_fabric_ops scsiback_ops = { + .module = THIS_MODULE, + .name = "xen-pvscsi", .get_fabric_name = scsiback_get_fabric_name, .get_fabric_proto_ident = scsiback_get_fabric_proto_ident, .tpg_get_wwn = scsiback_get_fabric_wwn, @@ -1991,62 +1992,10 @@ static struct target_core_fabric_ops scsiback_ops = { .fabric_make_nodeacl = scsiback_make_nodeacl, .fabric_drop_nodeacl = scsiback_drop_nodeacl, #endif -}; - -static int scsiback_register_configfs(void) -{ - struct target_fabric_configfs *fabric; - int ret; - - pr_debug("fabric module %s on %s/%s on "UTS_RELEASE"\n", - VSCSI_VERSION, utsname()->sysname, utsname()->machine); - /* - * Register the top level struct config_item_type with TCM core - */ - fabric = target_fabric_configfs_init(THIS_MODULE, "xen-pvscsi"); - if (IS_ERR(fabric)) - return PTR_ERR(fabric); - /* - * Setup fabric->tf_ops from our local scsiback_ops - */ - fabric->tf_ops = scsiback_ops; - /* - * Setup default attribute lists for various fabric->tf_cit_tmpl - */ - fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = scsiback_wwn_attrs; - fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = scsiback_tpg_attrs; - fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL; - 
fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = scsiback_param_attrs; - fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL; - fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL; - /* - * Register the fabric for use within TCM - */ - ret = target_fabric_configfs_register(fabric); - if (ret < 0) { - target_fabric_configfs_free(fabric); - return ret; - } - /* - * Setup our local pointer to *fabric - */ - scsiback_fabric_configfs = fabric; - pr_debug("Set fabric -> scsiback_fabric_configfs\n"); - return 0; -}; - -static void scsiback_deregister_configfs(void) -{ - if (!scsiback_fabric_configfs) - return; - - target_fabric_configfs_deregister(scsiback_fabric_configfs); - scsiback_fabric_configfs = NULL; - pr_debug("Cleared scsiback_fabric_configfs\n"); + .tfc_wwn_attrs = scsiback_wwn_attrs, + .tfc_tpg_base_attrs = scsiback_tpg_attrs, + .tfc_tpg_param_attrs = scsiback_param_attrs, }; static const struct xenbus_device_id scsiback_ids[] = { @@ -2078,6 +2027,9 @@ static int __init scsiback_init(void) if (!xen_domain()) return -ENODEV; + pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); + scsiback_cachep = kmem_cache_create("vscsiif_cache", sizeof(struct vscsibk_pend), 0, 0, scsiback_init_pend); if (!scsiback_cachep) @@ -2087,7 +2039,7 @@ static int __init scsiback_init(void) if (ret) goto out_cache_destroy; - ret = scsiback_register_configfs(); + ret = target_register_template(&scsiback_ops); if (ret) goto out_unregister_xenbus; @@ -2110,7 +2062,7 @@ static void __exit scsiback_exit(void) BUG(); gnttab_free_pages(1, &page); } - scsiback_deregister_configfs(); + target_unregister_template(&scsiback_ops); xenbus_unregister_driver(&scsiback_driver); kmem_cache_destroy(scsiback_cachep); } diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 8482f2d11606..31c010372660 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -247,7 +247,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) return v9fs_remote_get_acl(dentry, name, buffer, size, type); - acl = v9fs_get_cached_acl(dentry->d_inode, type); + acl = v9fs_get_cached_acl(d_inode(dentry), type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -285,7 +285,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index a345b2d659cc..bd456c668d39 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,7 +53,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) dentry, dentry); /* Don't cache negative dentries */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 1; return 0; } @@ -83,7 +83,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) goto out_valid; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 76c3b1ab6361..5cc00e56206e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -138,6 +138,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) 
&err); if (err) return err; + if (n == 0) + return 0; rdir->head = 0; rdir->tail = n; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3662f1d1d9cf..703342e309f5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -595,7 +595,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) dir, dentry, flags); v9ses = v9fs_inode2v9ses(dir); - inode = dentry->d_inode; + inode = d_inode(dentry); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); @@ -864,7 +864,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); err = 0; @@ -881,7 +881,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, } v9fs_invalidate_inode_attr(dir); - v9inode = V9FS_I(dentry->d_inode); + v9inode = V9FS_I(d_inode(dentry)); mutex_lock(&v9inode->v_mutex); if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && !v9inode->writeback_fid && @@ -908,7 +908,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, file->private_data = fid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(dentry->d_inode, file); + v9fs_cache_inode_set_cookie(d_inode(dentry), file); *opened |= FILE_CREATED; out: @@ -969,8 +969,8 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); v9ses = v9fs_inode2v9ses(old_inode); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) @@ -1061,7 +1061,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1072,8 +1072,8 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); - generic_fillattr(dentry->d_inode, stat); + v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb); + generic_fillattr(d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1095,7 +1095,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(dentry->d_inode, iattr); + retval = inode_change_ok(d_inode(dentry), iattr); if (retval) return retval; @@ -1128,20 +1128,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) /* Write all dirty data */ if (d_is_reg(dentry)) - filemap_write_and_wait(dentry->d_inode->i_mapping); + filemap_write_and_wait(d_inode(dentry)->i_mapping); retval = p9_client_wstat(fid, &wstat); if (retval < 0) return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(dentry->d_inode)) - truncate_setsize(dentry->d_inode, iattr->ia_size); + iattr->ia_size != i_size_read(d_inode(dentry))) + truncate_setsize(d_inode(dentry), iattr->ia_size); - v9fs_invalidate_inode_attr(dentry->d_inode); + v9fs_invalidate_inode_attr(d_inode(dentry)); - setattr_copy(dentry->d_inode, iattr); - mark_inode_dirty(dentry->d_inode); + setattr_copy(d_inode(dentry), iattr); + 
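The v9fs_dir_readdir() fix above treats a zero-length reply as end-of-directory instead of iterating again on an empty buffer. The same guard in a generic read loop (standalone C, data invented):

    #include <stdio.h>
    #include <string.h>

    /* A reader that returns 0 once the source is drained. */
    static size_t fake_read(char *buf, size_t len)
    {
        static const char *data = "abc";
        static size_t pos;
        size_t n = strlen(data) - pos;

        if (n > len)
            n = len;
        memcpy(buf, data + pos, n);
        pos += n;
        return n;
    }

    int main(void)
    {
        char buf[2];
        size_t n;

        while ((n = fake_read(buf, sizeof(buf))) != 0)  /* n == 0 -> stop */
            printf("got %zu byte(s): %.*s\n", n, (int)n, buf);
        return 0;
    }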
mark_inode_dirty(d_inode(dentry)); return 0; } @@ -1403,7 +1403,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); __putname(name); if (!retval) { - v9fs_refresh_inode(oldfid, old_dentry->d_inode); + v9fs_refresh_inode(oldfid, d_inode(old_dentry)); v9fs_invalidate_inode_attr(dir); } clunk_fid: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 6054c16b8fae..9861c7c951a6 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -265,7 +265,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); v9ses = v9fs_inode2v9ses(dir); @@ -481,7 +481,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -496,8 +496,8 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, dentry->d_inode); - generic_fillattr(dentry->d_inode, stat); + v9fs_stat2inode_dotl(st, d_inode(dentry)); + generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -557,7 +557,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) int retval; struct p9_fid *fid; struct p9_iattr_dotl p9attr; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -795,10 +795,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(fid)) return PTR_ERR(fid); - v9fs_refresh_inode_dotl(fid, old_dentry->d_inode); + v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); } - ihold(old_dentry->d_inode); - d_instantiate(dentry, old_dentry->d_inode); + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); return err; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 0afd0382822b..e99a338a4638 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -168,8 +168,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(st); goto release_sb; } - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, root->d_inode); + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode_dotl(st, d_inode(root)); kfree(st); } else { struct p9_wstat *st = NULL; @@ -179,8 +179,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, root->d_inode, sb); + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, d_inode(root), sb); p9stat_free(st); kfree(st); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index b9acadafa4a1..335055d828e4 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -298,7 +298,7 @@ out: int adfs_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; unsigned int ia_valid = attr->ia_valid; int error; diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 5022ac96aa40..a8f463c028ce 100644 --- 
a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -138,7 +138,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) static int affs_remove_link(struct dentry *dentry) { - struct inode *dir, *inode = dentry->d_inode; + struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL, *link_bh = NULL; u32 link_ino, ino; @@ -268,11 +268,11 @@ affs_remove_header(struct dentry *dentry) struct buffer_head *bh = NULL; int retval; - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); sb = dir->i_sb; retval = -ENOENT; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) goto done; @@ -471,10 +471,9 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) bool affs_nofilenametruncate(const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE); - } /* Check if the name is valid for a affs object. */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 9628003ccd2f..a022f4accd76 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -213,7 +213,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) int affs_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index ec8ca0efb960..181e05b46e72 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -251,7 +251,7 @@ int affs_unlink(struct inode *dir, struct dentry *dentry) { pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, - dentry->d_inode->i_ino, dentry); + d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); } @@ -320,7 +320,7 @@ int affs_rmdir(struct inode *dir, struct dentry *dentry) { pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, - dentry->d_inode->i_ino, dentry); + d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); } @@ -403,7 +403,7 @@ err: int affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, dentry); @@ -430,13 +430,13 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, return retval; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { retval = affs_remove_header(new_dentry); if (retval) return retval; } - bh = affs_bread(sb, old_dentry->d_inode->i_ino); + bh = affs_bread(sb, d_inode(old_dentry)->i_ino); if (!bh) return -EIO; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 4ec35e9130e1..e10e17788f06 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -505,7 +505,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, _enter("{%x:%u},%p{%pd},", vnode->fid.vid, vnode->fid.vnode, dentry, dentry); - ASSERTCMP(dentry->d_inode, ==, NULL); + ASSERTCMP(d_inode(dentry), ==, NULL); if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); @@ -563,8 +563,8 @@ success: _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", fid.vnode, fid.unique, - dentry->d_inode->i_ino, - dentry->d_inode->i_generation); + d_inode(dentry)->i_ino, + d_inode(dentry)->i_generation); return NULL; } @@ -586,9 +586,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int 
flags) if (flags & LOOKUP_RCU) return -ECHILD; - vnode = AFS_FS_I(dentry->d_inode); + vnode = AFS_FS_I(d_inode(dentry)); - if (dentry->d_inode) + if (d_really_is_positive(dentry)) _enter("{v={%x:%u} n=%pd fl=%lx},", vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); @@ -601,7 +601,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) /* lock down the parent dentry so we can peer at it */ parent = dget_parent(dentry); - dir = AFS_FS_I(parent->d_inode); + dir = AFS_FS_I(d_inode(parent)); /* validate the parent directory */ if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) @@ -623,9 +623,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) switch (ret) { case 0: /* the filename maps to something */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_bad; - if (is_bad_inode(dentry->d_inode)) { + if (is_bad_inode(d_inode(dentry))) { printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", dentry); goto out_bad; @@ -647,7 +647,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - dentry->d_inode->i_generation); + d_inode(dentry)->i_generation); spin_lock(&vnode->lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); spin_unlock(&vnode->lock); @@ -658,7 +658,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case -ENOENT: /* the filename is unknown */ _debug("%pd: dirent not found", dentry); - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto not_found; goto out_valid; @@ -703,9 +703,9 @@ static int afs_d_delete(const struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto zap; - if (dentry->d_inode && - (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) || - test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags))) + if (d_really_is_positive(dentry) && + (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(d_inode(dentry))->flags) || + test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(d_inode(dentry))->flags))) goto zap; _leave(" = 0 [keep]"); @@ -814,8 +814,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (ret < 0) goto rmdir_error; - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); clear_nlink(&vnode->vfs_inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); afs_discard_callback_on_delete(vnode); @@ -856,8 +856,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) goto error; } - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); /* make sure we have a callback promise on the victim */ ret = afs_validate(vnode, key); @@ -869,7 +869,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (ret < 0) goto remove_error; - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { /* if the file wasn't deleted due to excess hard links, the * fileserver will break the callback promise on the file - if * it had one - before it returns to us, and if it was deleted, @@ -879,7 +879,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) * or it was outstanding on a different server, then it won't * break it either... 
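The bulk of these filesystem hunks are one mechanical substitution: dentry->d_inode becomes d_inode(dentry), and NULL tests become d_really_is_positive()/d_really_is_negative(), so every "does this dentry have an inode?" question goes through a single helper that can later grow extra semantics. A toy model of the accessor pair (plain C, not the kernel's definitions):

    #include <stdio.h>
    #include <stddef.h>

    struct inode  { unsigned long i_ino; };
    struct dentry { struct inode *d_inode; };

    /* One spelling for the lookup, one for the positivity test. */
    static inline struct inode *d_inode(const struct dentry *d)
    {
        return d->d_inode;
    }

    static inline int d_really_is_positive(const struct dentry *d)
    {
        return d->d_inode != NULL;
    }

    int main(void)
    {
        struct inode ino = { .i_ino = 42 };
        struct dentry positive = { .d_inode = &ino };
        struct dentry negative = { .d_inode = NULL };

        if (d_really_is_positive(&positive))
            printf("positive dentry, ino=%lu\n", d_inode(&positive)->i_ino);
        if (!d_really_is_positive(&negative))
            printf("negative dentry, no inode\n");
        return 0;
    }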
*/ - vnode = AFS_FS_I(dentry->d_inode); + vnode = AFS_FS_I(d_inode(dentry)); if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) _debug("AFS_VNODE_DELETED"); if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) @@ -977,7 +977,7 @@ static int afs_link(struct dentry *from, struct inode *dir, struct key *key; int ret; - vnode = AFS_FS_I(from->d_inode); + vnode = AFS_FS_I(d_inode(from)); dvnode = AFS_FS_I(dir); _enter("{%x:%u},{%x:%u},{%pd}", @@ -1089,7 +1089,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct key *key; int ret; - vnode = AFS_FS_I(old_dentry->d_inode); + vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 8a1d38ef0fc2..e06f5a23352a 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -379,7 +379,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct inode *inode; - inode = dentry->d_inode; + inode = d_inode(dentry); _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); @@ -458,7 +458,7 @@ void afs_evict_inode(struct inode *inode) */ int afs_setattr(struct dentry *dentry, struct iattr *attr) { - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 938c5ab06d5a..ccd0b212e82a 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -134,7 +134,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) _enter("{%pd}", mntpt); - BUG_ON(!mntpt->d_inode); + BUG_ON(!d_inode(mntpt)); ret = -ENOMEM; devname = (char *) get_zeroed_page(GFP_KERNEL); @@ -145,7 +145,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) if (!options) goto error_no_options; - vnode = AFS_FS_I(mntpt->d_inode); + vnode = AFS_FS_I(d_inode(mntpt)); if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ static const char afs_root_cell[] = ":root.cell."; @@ -169,14 +169,14 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) } } else { /* read the contents of the AFS special symlink */ - loff_t size = i_size_read(mntpt->d_inode); + loff_t size = i_size_read(d_inode(mntpt)); char *buf; ret = -EINVAL; if (size > PAGE_SIZE - 1) goto error_no_page; - page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); + page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); goto error_no_page; diff --git a/fs/afs/super.c b/fs/afs/super.c index c4861557e385..1fb4a5129f7d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -529,7 +529,7 @@ static void afs_destroy_inode(struct inode *inode) static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct afs_volume_status vs; - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d10e619632ab..5b700ef1e59d 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -235,12 +235,12 @@ static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) { - return sbi->sb->s_root->d_inode->i_ino; + return d_inode(sbi->sb->s_root)->i_ino; } static inline int simple_positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } static inline void 
__autofs4_add_expiring(struct dentry *dentry) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 11dd118f75e2..1cebc3c52fa5 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -374,7 +374,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (dentry->d_inode && d_is_symlink(dentry)) { + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { DPRINTK("checking symlink %p %pd", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 1c55388ae633..a3ae0b2aeb5a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -71,7 +71,7 @@ void autofs4_kill_sb(struct super_block *sb) static int autofs4_show_options(struct seq_file *m, struct dentry *root) { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); - struct inode *root_inode = root->d_sb->s_root->d_inode; + struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) return 0; @@ -352,8 +352,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) inode->i_mode = mode; if (sb->s_root) { - inode->i_uid = sb->s_root->d_inode->i_uid; - inode->i_gid = sb->s_root->d_inode->i_gid; + inode->i_uid = d_inode(sb->s_root)->i_uid; + inode->i_gid = d_inode(sb->s_root)->i_gid; } inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_ino = get_next_ino(); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 7e44fdd03e2d..c6d7d3dbd52a 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -240,7 +240,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, spin_lock(&expiring->d_lock); /* We've already been dentry_iput or unlinked */ - if (!expiring->d_inode) + if (d_really_is_negative(expiring)) goto next; qstr = &expiring->d_name; @@ -371,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * having d_mountpoint() true, so there's no need to call back * to the daemon. */ - if (dentry->d_inode && d_is_symlink(dentry)) { + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -459,7 +459,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) return 0; if (d_mountpoint(dentry)) return 0; - inode = ACCESS_ONCE(dentry->d_inode); + inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) return -EISDIR; if (list_empty(&dentry->d_subdirs)) @@ -485,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * an incorrect ELOOP error return. 
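autofs4_d_manage() above swaps ACCESS_ONCE(dentry->d_inode) for d_inode_rcu(): under RCU walk the inode pointer is sampled once and the snapshot is then tested, never re-read. A rough C11 rendering of that single-load discipline (userspace sketch; the relaxed atomic load is only an approximation of the kernel's marked access):

    #include <stdio.h>
    #include <stdatomic.h>

    struct inode { int mode; };

    static _Atomic(struct inode *) d_inode_ptr;

    /* One load; callers must test the snapshot, not reload the pointer. */
    static struct inode *d_inode_rcu_like(void)
    {
        return atomic_load_explicit(&d_inode_ptr, memory_order_relaxed);
    }

    int main(void)
    {
        static struct inode ino = { .mode = 0120000 };  /* S_IFLNK-like */

        atomic_store(&d_inode_ptr, &ino);

        struct inode *snap = d_inode_rcu_like();
        if (snap && (snap->mode & 0170000) == 0120000)
            printf("symlink: would return -EISDIR\n");
        return 0;
    }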
*/ if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || - (dentry->d_inode && d_is_symlink(dentry))) + (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } spin_unlock(&sbi->fs_lock); @@ -625,8 +625,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) } dput(ino->dentry); - dentry->d_inode->i_size = 0; - clear_nlink(dentry->d_inode); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); dir->i_mtime = CURRENT_TIME; @@ -719,8 +719,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) atomic_dec(&p_ino->count); } dput(ino->dentry); - dentry->d_inode->i_size = 0; - clear_nlink(dentry->d_inode); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); if (dir->i_nlink) drop_nlink(dir); @@ -839,7 +839,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) */ int is_autofs4_dentry(struct dentry *dentry) { - return dentry && dentry->d_inode && + return dentry && d_really_is_positive(dentry) && dentry->d_op == &autofs4_dentry_operations && dentry->d_fsdata != NULL; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 1e8ea192be2b..de58cc7b8076 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -18,7 +18,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) struct autofs_info *ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; - nd_set_link(nd, dentry->d_inode->i_private); + nd_set_link(nd, d_inode(dentry)->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 2ad05ab93db8..35b755e79c2d 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -322,7 +322,7 @@ static int validate_request(struct autofs_wait_queue **wait, * continue on and create a new request. */ if (!IS_ROOT(dentry)) { - if (dentry->d_inode && d_unhashed(dentry)) { + if (d_really_is_positive(dentry) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; new = d_lookup(parent, &dentry->d_name); if (new) @@ -364,7 +364,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (pid == 0 || tgid == 0) return -ENOENT; - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { /* * A wait for a negative dentry is invalid for certain * cases. 
A direct or offset mount "always" has its mount diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 16e0a48bfccd..7943533c3868 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -471,7 +471,7 @@ static void * befs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct super_block *sb = dentry->d_sb; - struct befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; char *link; @@ -501,7 +501,7 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) static void * befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); nd_set_link(nd, befs_ino->i_data.symlink); return NULL; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 7a8182770649..3ec6113146c0 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -153,7 +153,7 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { - struct inode *inode = old->d_inode; + struct inode *inode = d_inode(old); struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; @@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(inode->i_sb); @@ -216,7 +216,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, int error = -ENOENT; old_bh = new_bh = NULL; - old_inode = old_dentry->d_inode; + old_inode = d_inode(old_dentry); if (S_ISDIR(old_inode->i_mode)) return -EINVAL; @@ -231,7 +231,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; error = -EPERM; - new_inode = new_dentry->d_inode; + new_inode = d_inode(new_dentry); new_bh = bfs_find_entry(new_dir, new_dentry->d_name.name, new_dentry->d_name.len, &new_de); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 9dcb05409ba7..78f005f37847 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -591,7 +591,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. 
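bm_register_write() above holds the root directory's i_mutex across the lookup, the d_really_is_positive() existence check, and the create, so two concurrent writers cannot both observe "absent" and register the same name. The same lock-lookup-create shape with a toy name table (compile with -pthread):

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <pthread.h>

    #define MAX_ENTRIES 16
    static const char *entries[MAX_ENTRIES];
    static pthread_mutex_t dir_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Lookup and create under one lock: no lost races, no duplicates. */
    static int register_entry(const char *name)
    {
        int err = 0, i;

        pthread_mutex_lock(&dir_lock);
        for (i = 0; i < MAX_ENTRIES; i++)
            if (entries[i] && strcmp(entries[i], name) == 0) {
                err = -EEXIST;
                goto out;
            }
        for (i = 0; i < MAX_ENTRIES; i++)
            if (!entries[i]) {
                entries[i] = name;
                goto out;
            }
        err = -ENOSPC;
    out:
        pthread_mutex_unlock(&dir_lock);
        return err;
    }

    int main(void)
    {
        printf("first:  %d\n", register_entry("wine"));
        printf("second: %d\n", register_entry("wine"));  /* -EEXIST */
        return 0;
    }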
*/ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); kill_node(e); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); break; default: @@ -675,14 +675,14 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out; err = -EEXIST; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out2; inode = bm_get_inode(sb, S_IFREG | 0644); @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index 897ee0503932..c7e4163ede87 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -152,7 +152,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct inode *inode = file->f_mapping->host; return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, - blkdev_get_block, NULL, NULL, 0); + blkdev_get_block, NULL, NULL, + DIO_SKIP_DIO_COUNT); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -1716,7 +1717,7 @@ struct block_device *lookup_bdev(const char *pathname) if (error) return ERR_PTR(error); - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4dabeb893b7c..df9932b00d08 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -87,7 +87,7 @@ BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -132,7 +132,7 @@ static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh) { diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index e386c29ef1f6..ec2ee477f8ba 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -66,7 +66,7 @@ BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh); void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f55721ff9385..9de772ee0031 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1206,7 +1206,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct 
ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; tmp = ulist_alloc(GFP_NOFS); @@ -1610,7 +1610,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index de5e4f2adfea..0ef5cc13fae2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,7 +66,11 @@ struct btrfs_inode { */ struct btrfs_key location; - /* Lock for counters */ + /* + * Lock for counters and all fields used to determine if the inode is in + * the log or not (last_trans, last_sub_trans, last_log_commit, + * logged_trans). + */ spinlock_t lock; /* the extent_tree has caches of all the extent mappings to disk */ @@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { + int ret = 0; + + spin_lock(&BTRFS_I(inode)->lock); if (BTRFS_I(inode)->logged_trans == generation && BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && @@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) */ smp_mb(); if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) - return 1; + ret = 1; } - return 0; + spin_unlock(&BTRFS_I(inode)->lock); + return ret; } #define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index d897ef803b3b..ce7dec88f4b8 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, - GFP_NOFS); + mapped_datav = kmalloc_array(bio->bi_vcnt, + sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; @@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root, mutex_unlock(&btrfsic_mutex); - if (is_vmalloc_addr(state)) - vfree(state); - else - kfree(state); + kvfree(state); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e9df8862012c..ce62324c78e7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->orig_bio = bio; nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); - cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) goto fail1; @@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; -static struct btrfs_compress_op *btrfs_compress_op[] = { +static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, }; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d181f70caae0..13a4dc0436c9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -77,7 +77,7 @@ struct btrfs_compress_op { size_t srclen, size_t destlen); }; 
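For illustration: the compression.c hunk above turns btrfs_compress_op[] into a const array of const pointers, the standard pattern for method tables whose slots are all fixed at build time — the table can then live in .rodata and is no longer a useful write target. A minimal self-contained sketch of the pattern (the names below are made up, not the btrfs definitions):

#include <stdio.h>

/* A method table whose contents are known at build time. */
struct compress_op {
	const char *name;
	int (*compress)(const void *src, void *dst, unsigned long len);
};

static int dummy_compress(const void *src, void *dst, unsigned long len)
{
	(void)src;
	(void)dst;
	return (int)len;		/* pretend nothing shrank */
}

static const struct compress_op zlib_ops = { "zlib", dummy_compress };
static const struct compress_op lzo_ops  = { "lzo",  dummy_compress };

/* 'const ... *const []': neither the slots nor the ops they point at
 * can be modified after this point. */
static const struct compress_op *const compress_ops[] = {
	&zlib_ops,
	&lzo_ops,
};

int main(void)
{
	char buf[8] = "abc";

	/* dispatch through the read-only table, as the kernel code does */
	printf("%s -> %d\n", compress_ops[0]->name,
	       compress_ops[0]->compress(buf, buf, sizeof(buf)));
	return 0;
}

The extern declarations in compression.h then just have to grow the matching const qualifiers, which is exactly what the next hunk does.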
-extern struct btrfs_compress_op btrfs_zlib_compress; -extern struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zlib_compress; +extern const struct btrfs_compress_op btrfs_lzo_compress; #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d67f32e648d..0f11ebc92f02 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); if (!tm_list) return -ENOMEM; @@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), flags); if (!tm_list) { ret = -ENOMEM; @@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) return 0; - tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) return 0; nritems = btrfs_header_nritems(eb); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), - GFP_NOFS); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); *last_ref = 1; } return 0; @@ -1678,7 +1677,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root, blocknr); + cur = btrfs_find_tree_block(root->fs_info, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else @@ -1943,7 +1942,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1997,7 +1996,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -2041,7 +2040,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2259,7 +2258,7 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, slot); blocksize = root->nodesize; - eb = btrfs_find_tree_block(root, search); + eb = btrfs_find_tree_block(root->fs_info, search); if (eb) { free_extent_buffer(eb); return; @@ -2319,7 +2318,7 @@ static noinline void reada_for_balance(struct btrfs_root 
*root, if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1); + eb = btrfs_find_tree_block(root->fs_info, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2332,7 +2331,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2); + eb = btrfs_find_tree_block(root->fs_info, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); @@ -2450,7 +2449,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - tmp = btrfs_find_tree_block(root, blocknr); + tmp = btrfs_find_tree_block(root->fs_info, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -3126,7 +3125,8 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -3137,7 +3137,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); + tree_mod_log_set_node_key(fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -3151,7 +3151,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, * This function isn't completely safe. 
It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; @@ -3173,7 +3174,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } /* @@ -3692,7 +3693,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(trans, root, left); + clean_tree_block(trans, root->fs_info, left); btrfs_mark_buffer_dirty(right); @@ -3704,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); + clean_tree_block(trans, root->fs_info, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3928,10 +3929,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -4168,6 +4169,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int wret; int split; @@ -4271,10 +4273,10 @@ again: btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, + write_extent_buffer(right, fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + write_extent_buffer(right, fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); @@ -4297,7 +4299,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; @@ -4615,7 +4617,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } item = btrfs_item_nr(slot); @@ -4716,7 +4718,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4888,7 +4890,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, level + 1); + fixup_low_keys(root->fs_info, path, 
&disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4981,7 +4983,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(trans, root, leaf); + clean_tree_block(trans, root->fs_info, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -4990,7 +4992,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9c89cae39ee..6f364e1d8d3d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1061,6 +1061,12 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +#define BTRFS_QGROUP_LEVEL_SHIFT 48 +static inline u64 btrfs_qgroup_level(u64 qgroupid) +{ + return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; +} + /* * is subvolume quota turned on? */ @@ -1256,6 +1262,20 @@ struct btrfs_caching_control { atomic_t count; }; +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache { /* For dirty block groups */ struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; }; /* delayed seq elem */ @@ -1329,6 +1352,8 @@ struct seq_list { u64 seq; }; +#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -1472,6 +1497,12 @@ struct btrfs_fs_info { struct mutex chunk_mutex; struct mutex volume_mutex; + /* + * this is taken to make sure we don't set block groups ro after + * the free space cache has been allocated on them + */ + struct mutex ro_block_group_mutex; + /* this is used during read/modify/write to make sure * no two ios are trying to mod the same stripe at the same * time @@ -1513,6 +1544,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + struct rw_semaphore delayed_iput_sem; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -3295,6 +3327,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) } /* extent-tree.c */ + +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes); + static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { @@ -3385,6 +3420,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int no_quota); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, @@ -3417,7 +3454,7 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 
bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); @@ -3486,7 +3524,8 @@ int btrfs_previous_item(struct btrfs_root *root, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); @@ -4180,7 +4219,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || - (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && + !btrfs_qgroup_level(rootid))) return 1; return 0; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 82f0c7c95474..cde698a07d21 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1383,7 +1383,7 @@ out: static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int nr) + struct btrfs_fs_info *fs_info, int nr) { struct btrfs_async_delayed_work *async_work; @@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; - btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(fs_info->delayed_workers, &async_work->work); return 0; } @@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + struct btrfs_fs_info *fs_info = root->fs_info; delayed_root = btrfs_get_delayed_root(root); @@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) seq = atomic_read(&delayed_root->items_seq); - ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); + ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); if (ret) return; @@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) return; } - btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); + btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } /* Will return 0 or -ENOMEM */ diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6d16bea94e1c..8f8ed7d20bac 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -489,11 +489,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, * existing and update must have the same bytenr */ static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_node *existing, +update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; + int old_ref_mod; existing_ref = btrfs_delayed_node_to_head(existing); ref = 
btrfs_delayed_node_to_head(existing); ref = btrfs_delayed_node_to_head(update); @@ -541,7 +543,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ + old_ref_mod = existing_ref->total_ref_mod; existing->ref_mod += update->ref_mod; + existing_ref->total_ref_mod += update->ref_mod; + + /* + * If we are going from a positive ref mod to a negative or vice + * versa we need to make sure to adjust pending_csums accordingly. + */ + if (existing_ref->is_data) { + if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + delayed_refs->pending_csums -= existing->num_bytes; + if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + delayed_refs->pending_csums += existing->num_bytes; + } spin_unlock(&existing_ref->lock); } @@ -605,6 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref->is_data = is_data; head_ref->ref_root = RB_ROOT; head_ref->processing = 0; + head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -614,7 +630,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - update_existing_head_ref(&existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -622,6 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (is_data && count_mod < 0) + delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a764e2340d48..5eb0892396d0 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -88,6 +88,14 @@ struct btrfs_delayed_ref_head { struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; + + /* + * This is used to track the final ref_mod from all the refs associated + * with this head ref, this is not adjusted as delayed refs are run, + * this is meant to track if we need to do the csum accounting or not. 
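As an aside, the pending_csums bookkeeping added above is easy to model: a data head contributes num_bytes to pending_csums exactly while its accumulated total_ref_mod is negative (a net deletion, which will eventually need csum items removed), so only sign crossings adjust the counter. A tiny userspace sketch of just that rule, with stand-in types rather than the kernel structures:

#include <assert.h>
#include <stdint.h>

/* Minimal stand-in for the delayed-ref head (illustrative only). */
struct head {
	int is_data;
	int total_ref_mod;
	uint64_t num_bytes;
};

/* Mirror of the update path: pending_csums changes only when the
 * accumulated ref mod of a data head changes sign. */
static void apply_ref_mod(struct head *h, int ref_mod, uint64_t *pending_csums)
{
	int old = h->total_ref_mod;

	h->total_ref_mod += ref_mod;
	if (!h->is_data)
		return;
	if (h->total_ref_mod >= 0 && old < 0)
		*pending_csums -= h->num_bytes;
	if (h->total_ref_mod < 0 && old >= 0)
		*pending_csums += h->num_bytes;
}

int main(void)
{
	struct head h = { .is_data = 1, .total_ref_mod = -1, .num_bytes = 4096 };
	uint64_t pending = 4096;	/* head was created with count_mod < 0 */

	apply_ref_mod(&h, +1, &pending);	/* back to net zero: no csum deletion */
	assert(pending == 0);
	apply_ref_mod(&h, -1, &pending);	/* net deletion again */
	assert(pending == 4096);
	return 0;
}

The __btrfs_run_delayed_refs() hunk in extent-tree.c further down relies on the same invariant when it subtracts num_bytes for data heads that finish with total_ref_mod < 0.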
+ */ + int total_ref_mod; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -138,6 +146,8 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long num_heads_ready; + u64 pending_csums; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5ec03d999c37..0573848c7333 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -670,8 +670,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(btrfs_device_get_total_bytes(srcdev), 1000)); + args->status.progress_1000 = div_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -806,7 +806,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_dev_replace_status(fs_info, status_args); progress = status_args->status.progress_1000; kfree(status_args); - do_div(progress, 10); + progress = div_u64(progress, 10); printk_in_rcu(KERN_INFO "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", dev_replace->srcdev->missing ? "<missing disk>" : diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 639f2663ed3f..2ef9a4b72d06 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -54,7 +54,7 @@ #include <asm/cpufeature.h> #endif -static struct extent_io_ops btree_extent_io_ops; +static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, @@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result) * compute the csum for a btree block, and either verify it or write it * into the csum field of the block. 
*/ -static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, +static int csum_tree_block(struct btrfs_fs_info *fs_info, + struct extent_buffer *buf, int verify) { - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, offset += cur_len; } if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + result = kzalloc(csum_size, GFP_NOFS); if (!result) return 1; } else { @@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, printk_ratelimited(KERN_WARNING "BTRFS: %s checksum verify failed on %llu wanted %X found %X " "level %d\n", - root->fs_info->sb->s_id, buf->start, + fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); @@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb) if (memcmp(raw_disk_sb, result, csum_size)) ret = 1; - - if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING - "BTRFS: super block crcs don't match, older mkfs detected\n"); - ret = 0; - } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { @@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * we only fill in the checksum field in the first page of a multi-page block */ -static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) { u64 start = page_offset(page); u64 found_start; @@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) found_start = btrfs_header_bytenr(eb); if (WARN_ON(found_start != start || !PageUptodate(page))) return 0; - csum_tree_block(root, eb, 0); + csum_tree_block(fs_info, eb, 0); return 0; } -static int check_tree_block_fsid(struct btrfs_root *root, +static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; @@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; goto err; } - if (check_tree_block_fsid(root, eb)) { + if (check_tree_block_fsid(root->fs_info, eb)) { printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; @@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(root, eb, 1); + ret = csum_tree_block(root->fs_info, eb, 1); if (ret) { ret = -EIO; goto err; @@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root, bvec->bv_page); + ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) break; } @@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, return 0; } -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { - return find_extent_buffer(root->fs_info, bytenr); + return 
find_extent_buffer(fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, @@ -1165,11 +1160,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, } -void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, struct extent_buffer *buf) { - struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -2146,6 +2140,267 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) } } +static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) +{ + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + fs_info->scrub_workers_refcnt = 0; +} + +static void btrfs_init_balance(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); +} + +static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(fs_info->btree_inode, 1); + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); + btrfs_insert_inode_hash(fs_info->btree_inode); +} + +static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +{ + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); + init_waitqueue_head(&fs_info->replace_wait); +} + +static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; + mutex_init(&fs_info->qgroup_rescan_lock); +} + +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int max_active = fs_info->thread_pool_size; + unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + + 
fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); + + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); + + /* + * a higher idle thresh on the submit workers makes it much more + * likely that bios will be sent down in a sane order to the + * devices + */ + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->extent_workers && + fs_info->qgroup_rescan_workers)) { + return -ENOMEM; + } + + return 0; +} + +static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *log_tree_root; + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "BTRFS: log replay required " + "on RO media\n"); + return -EIO; + } + + log_tree_root = btrfs_alloc_root(fs_info); + if (!log_tree_root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, log_tree_root, fs_info, + BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + fs_info->generation + 1); + if (!log_tree_root->node || + !extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return -EIO; + } + /* returns with 
log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return ret; + } + + if (fs_info->sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + if (ret) + return ret; + } + + return 0; +} + +static int btrfs_read_roots(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + struct btrfs_root *root; + struct btrfs_key location; + int ret; + + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); + + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; + + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(root)) { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->quota_enabled = 1; + fs_info->pending_quota_state = 1; + fs_info->quota_root = root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + return ret; + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->uuid_root = root; + } + + return 0; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -2160,21 +2415,12 @@ int open_ctree(struct super_block *sb, struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; - struct btrfs_root *extent_root; - struct btrfs_root *csum_root; struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; int max_active; - int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - bool create_uuid_tree; - bool check_uuid_tree; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); @@ -2241,11 +2487,12 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); - mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); + init_rwsem(&fs_info->delayed_iput_sem); init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2276,7 +2523,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); + 
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2294,55 +2541,18 @@ int open_ctree(struct super_block *sb, } btrfs_init_delayed_root(fs_info->delayed_root); - mutex_init(&fs_info->scrub_lock); - atomic_set(&fs_info->scrubs_running, 0); - atomic_set(&fs_info->scrub_pause_req, 0); - atomic_set(&fs_info->scrubs_paused, 0); - atomic_set(&fs_info->scrub_cancel_req, 0); - init_waitqueue_head(&fs_info->replace_wait); - init_waitqueue_head(&fs_info->scrub_pause_wait); - fs_info->scrub_workers_refcnt = 0; + btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; #endif - - spin_lock_init(&fs_info->balance_lock); - mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); - atomic_set(&fs_info->balance_pause_req, 0); - atomic_set(&fs_info->balance_cancel_req, 0); - fs_info->balance_ctl = NULL; - init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; - fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - set_nlink(fs_info->btree_inode, 1); - /* - * we set the i_size on the btree inode to the max possible int. - * the real end of the address space is determined by all of - * the devices in the system - */ - fs_info->btree_inode->i_size = OFFSET_MAX; - fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - - RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); - extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, - fs_info->btree_inode->i_mapping); - BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); - - BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - - BTRFS_I(fs_info->btree_inode)->root = tree_root; - memset(&BTRFS_I(fs_info->btree_inode)->location, 0, - sizeof(struct btrfs_key)); - set_bit(BTRFS_INODE_DUMMY, - &BTRFS_I(fs_info->btree_inode)->runtime_flags); - btrfs_insert_inode_hash(fs_info->btree_inode); + btrfs_init_btree_inode(fs_info, tree_root); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2363,26 +2573,14 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); - fs_info->dev_replace.lock_owner = 0; - atomic_set(&fs_info->dev_replace.nesting_level, 0); - mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - mutex_init(&fs_info->dev_replace.lock_management_lock); - mutex_init(&fs_info->dev_replace.lock); - spin_lock_init(&fs_info->qgroup_lock); - mutex_init(&fs_info->qgroup_ioctl_lock); - fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_op_tree = RB_ROOT; - INIT_LIST_HEAD(&fs_info->dirty_qgroups); - fs_info->qgroup_seq = 1; - fs_info->quota_enabled = 0; - fs_info->pending_quota_state = 0; - fs_info->qgroup_ulist = NULL; - mutex_init(&fs_info->qgroup_rescan_lock); + btrfs_init_dev_replace_locks(fs_info); + btrfs_init_qgroup(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 
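Both shift conversions in this patch (the NSEC_PER_SEC >> 6 at the top of the open_ctree() hunk above, and the avg >> 2 in extent-tree.c further down) are safe because the divisors are powers of two: a right shift of an unsigned 64-bit value divides exactly, and it avoids a 64-bit division helper on 32-bit builds. A quick standalone check, assuming only the usual value of NSEC_PER_SEC:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL	/* same value the kernel headers use */

int main(void)
{
	uint64_t avg = 3 * 123456789ULL + 42;

	/* a right shift by n divides an unsigned value by 2^n exactly */
	assert((NSEC_PER_SEC >> 6) == NSEC_PER_SEC / 64);
	assert((avg >> 2) == avg / 4);

	printf("NSEC_PER_SEC / 64 = %llu\n",
	       (unsigned long long)(NSEC_PER_SEC >> 6));
	return 0;
}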
btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2554,75 +2752,9 @@ int open_ctree(struct super_block *sb, max_active = fs_info->thread_pool_size; - fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); - - fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - - fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - - fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); - - /* - * a higher idle thresh on the submit workers makes it much more - * likely that bios will be send down in a sane order to the - * devices - */ - fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, - min_t(u64, fs_devices->num_devices, - max_active), 64); - - fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); - - /* - * endios are largely parallel and should have a very - * low idle thresh - */ - fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); - fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); - fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); - fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); - fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); - fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); - fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); - fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); - fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); - fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); - - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers && fs_info->flush_workers && - fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && - fs_info->endio_repair_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && - fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && - fs_info->qgroup_rescan_workers)) { - err = -ENOMEM; + ret = btrfs_init_workqueues(fs_info, fs_devices); + if (ret) { + err = ret; goto fail_sb_buffer; } @@ -2688,7 +2820,7 @@ int open_ctree(struct super_block *sb, * keep the device that is marked to be the target device for the * dev_replace procedure */ - btrfs_close_extra_devices(fs_info, fs_devices, 0); + btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_ERR "BTRFS: failed to read devices on %s\n", @@ -2714,61 +2846,9 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); - location.objectid = BTRFS_EXTENT_TREE_OBJECTID; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = 0; - - extent_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(extent_root)) { - ret = PTR_ERR(extent_root); - goto recovery_tree_root; - } - 
set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); - fs_info->extent_root = extent_root; - - location.objectid = BTRFS_DEV_TREE_OBJECTID; - dev_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(dev_root)) { - ret = PTR_ERR(dev_root); - goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); - fs_info->dev_root = dev_root; - btrfs_init_devices_late(fs_info); - - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - csum_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(csum_root)) { - ret = PTR_ERR(csum_root); + ret = btrfs_read_roots(fs_info, tree_root); + if (ret) goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); - fs_info->csum_root = csum_root; - - location.objectid = BTRFS_QUOTA_TREE_OBJECTID; - quota_root = btrfs_read_tree_root(tree_root, &location); - if (!IS_ERR(quota_root)) { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state); - fs_info->quota_enabled = 1; - fs_info->pending_quota_state = 1; - fs_info->quota_root = quota_root; - } - - location.objectid = BTRFS_UUID_TREE_OBJECTID; - uuid_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - if (ret != -ENOENT) - goto recovery_tree_root; - create_uuid_tree = true; - check_uuid_tree = false; - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); - fs_info->uuid_root = uuid_root; - create_uuid_tree = false; - check_uuid_tree = - generation != btrfs_super_uuid_tree_generation(disk_super); - } fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2792,7 +2872,7 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_info, fs_devices, 1); + btrfs_close_extra_devices(fs_devices, 1); ret = btrfs_sysfs_add_one(fs_info); if (ret) { @@ -2806,7 +2886,7 @@ retry_root_backup: goto fail_sysfs; } - ret = btrfs_read_block_groups(extent_root); + ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); goto fail_sysfs; @@ -2864,48 +2944,11 @@ retry_root_backup: /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0) { - u64 bytenr = btrfs_super_log_root(disk_super); - - if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); - err = -EIO; - goto fail_qgroup; - } - - log_tree_root = btrfs_alloc_root(fs_info); - if (!log_tree_root) { - err = -ENOMEM; - goto fail_qgroup; - } - - __setup_root(nodesize, sectorsize, stripesize, - log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - - log_tree_root->node = read_tree_block(tree_root, bytenr, - generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); - goto fail_qgroup; - } - /* returns with log_tree_root freed on success */ - ret = btrfs_recover_log_trees(log_tree_root); + ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { - btrfs_error(tree_root->fs_info, ret, - "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + err = ret; goto fail_qgroup; } - - if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - if (ret) - goto fail_qgroup; - } } ret = btrfs_find_orphan_roots(tree_root); @@ -2966,7 +3009,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (create_uuid_tree) { + if (!fs_info->uuid_root) { pr_info("BTRFS: creating 
UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { @@ -2975,8 +3018,9 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (check_uuid_tree || - btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { + } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + fs_info->generation != + btrfs_super_uuid_tree_generation(disk_super)) { pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { @@ -3668,7 +3712,7 @@ void close_ctree(struct btrfs_root *root) if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - btrfs_err(root->fs_info, "commit super ret %d", ret); + btrfs_err(fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) @@ -3680,10 +3724,10 @@ void close_ctree(struct btrfs_root *root) fs_info->closing = 2; smp_mb(); - btrfs_free_qgroup_config(root->fs_info); + btrfs_free_qgroup_config(fs_info); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { - btrfs_info(root->fs_info, "at unmount delalloc count %lld", + btrfs_info(fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } @@ -3723,7 +3767,7 @@ void close_ctree(struct btrfs_root *root) btrfs_free_stripe_hash_table(fs_info); - btrfs_free_block_rsv(root, root->orphan_block_rsv); + __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; lock_chunks(root); @@ -4134,7 +4178,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - eb = btrfs_find_tree_block(root, start); + eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; if (!eb) continue; @@ -4285,7 +4329,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } -static struct extent_io_ops btree_extent_io_ops = { +static const struct extent_io_ops btree_extent_io_ops = { .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 27d44c0fd236..d4cbfeeeedd4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); + struct btrfs_fs_info *fs_info, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 37d164540c3a..8d052209f473 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -152,7 +152,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); struct btrfs_root *root = 
BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -220,8 +220,8 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = child->d_inode; - struct inode *dir = parent->d_inode; + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8b353ad02f03..1eef4ee01d1a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * list before we release it. */ if (btrfs_delayed_ref_is_head(ref)) { + if (locked_ref->is_data && + locked_ref->total_ref_mod < 0) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= ref->num_bytes; + spin_unlock(&delayed_refs->lock); + } btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; } @@ -2561,8 +2567,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ spin_lock(&delayed_refs->lock); avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - avg = div64_u64(avg, 4); - fs_info->avg_delayed_ref_runtime = avg; + fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ spin_unlock(&delayed_refs->lock); } return 0; @@ -2624,7 +2629,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) * We don't ever fill up leaves all the way so multiply by 2 just to be * closer to what we're really going to want to ouse. */ - return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); + return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +/* + * Takes the number of bytes to be csumm'ed and figures out how many leaves it + * would require to store the csums for that many bytes. + */ +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) +{ + u64 csum_size; + u64 num_csums_per_leaf; + u64 num_csums; + + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = div64_u64(csum_size, + (u64)btrfs_super_csum_size(root->fs_info->super_copy)); + num_csums = div64_u64(csum_bytes, root->sectorsize); + num_csums += num_csums_per_leaf - 1; + num_csums = div64_u64(num_csums, num_csums_per_leaf); + return num_csums; } int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, @@ -2632,7 +2656,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, { struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 num_bytes; + u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; + u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; num_bytes = btrfs_calc_trans_metadata_size(root, 1); @@ -2640,17 +2666,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, if (num_heads > 1) num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; + num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; + num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, + num_dirty_bgs); global_rsv = &root->fs_info->global_block_rsv; /* * If we can't allocate any more chunks lets make sure we have _lots_ of * wiggle room since running delayed refs can create more delayed refs. 
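The new btrfs_csum_bytes_to_leaves() above is a plain round-up division chain, so it can be sanity-checked outside the kernel. The sketch below mirrors the arithmetic with illustrative geometry — 16K leaves, 25 bytes of per-item overhead, 4-byte checksums, 4K sectors; the kernel reads the real values from the superblock and root, so treat all of these numbers as assumptions:

#include <stdint.h>
#include <stdio.h>

/*
 * How many leaves are needed to hold the checksum items covering
 * csum_bytes of data? Mirrors the kernel arithmetic; all geometry is
 * passed in rather than read from disk.
 */
static uint64_t csum_bytes_to_leaves(uint64_t csum_bytes,
				     uint64_t leaf_data_size,
				     uint64_t item_overhead,
				     uint64_t csum_size,
				     uint64_t sectorsize)
{
	uint64_t csums_per_leaf = (leaf_data_size - item_overhead) / csum_size;
	uint64_t num_csums = csum_bytes / sectorsize;

	/* round up: a partial leaf still costs a whole leaf */
	return (num_csums + csums_per_leaf - 1) / csums_per_leaf;
}

int main(void)
{
	/* 1GiB of data with the illustrative geometry above */
	uint64_t leaves = csum_bytes_to_leaves(1ULL << 30, 16256, 25, 4, 4096);

	printf("~%llu leaves of csums for 1GiB of data\n",
	       (unsigned long long)leaves);
	return 0;
}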
*/ - if (global_rsv->space_info->full) + if (global_rsv->space_info->full) { + num_dirty_bgs_bytes <<= 1; num_bytes <<= 1; + } spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes) + if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) ret = 1; spin_unlock(&global_rsv->lock); return ret; @@ -3193,7 +3224,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct inode *inode = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - int num_pages = 0; + u64 num_pages = 0; int retries = 0; int ret = 0; @@ -3267,7 +3298,7 @@ again: if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) goto out_put; } @@ -3293,14 +3324,14 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages, num_pages); if (ret) goto out_put; @@ -3351,16 +3382,156 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, return 0; } -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, +/* + * transaction commit does final block group cache writeback during a + * critical section where nothing is allowed to change the FS. This is + * required in order for the cache to actually match the block group, + * but can introduce a lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group + * cache IO. There's a chance we'll have to redo some of it if the + * block group changes again during the commit, but it greatly reduces + * the commit latency by getting rid of the easy block groups while + * we're still allowing others to join the commit. + */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; - struct btrfs_path *path; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (!list_empty(&cur_trans->dirty_bgs)) { + list_splice_init(&cur_trans->dirty_bgs, &dirty); + } + spin_unlock(&cur_trans->dirty_bgs_lock); - if (list_empty(&cur_trans->dirty_bgs)) +again: + if (list_empty(&dirty)) { + btrfs_free_path(path); return 0; + } + + /* + * make sure all the block groups on our dirty list actually + * exist + */ + btrfs_create_pending_block_groups(trans, root); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + while (!list_empty(&dirty)) { + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + + /* + * cache_write_mutex is here only to save us from balance + * deleting this block group while we are writing out the + * cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + + /* + * this can happen if something re-dirties a block + * group that is already under IO. 
Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide + * if it should update the cache_state. Don't delete + * until after we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * the cache_write_mutex is protecting + * the io_list + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) + ret = write_one_cache_group(trans, root, path, cache); + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* if it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + + if (ret) + break; + } + + /* + * go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + + btrfs_free_path(path); + return ret; +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3376,16 +3547,61 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); + + /* + * this can happen if cache_save_setup re-dirties a block + * group that is already under IO.
Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + /* + * don't remove from the dirty list until after we've waited + * on any pending IO + */ list_del_init(&cache->dirty_list); - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - cache_save_setup(cache, trans, path); + should_put = 1; + + cache_save_setup(cache, trans, path); + if (!ret) - ret = btrfs_run_delayed_refs(trans, root, - (unsigned long) -1); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } if (!ret) ret = write_one_cache_group(trans, root, path, cache); + + /* if it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + } + + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); } @@ -3635,19 +3851,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 used; - int ret = 0, committed = 0, alloc_chunk = 1; + int ret = 0; + int need_commit = 2; + int have_pinned_space; /* make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, root->sectorsize); if (btrfs_is_free_space_inode(inode)) { - committed = 1; + need_commit = 0; ASSERT(current->journal_info); } @@ -3669,7 +3887,7 @@ again: * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ - if (!data_sinfo->full && alloc_chunk) { + if (!data_sinfo->full) { u64 alloc_target; data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; @@ -3697,8 +3915,10 @@ alloc: if (ret < 0) { if (ret != -ENOSPC) return ret; - else + else { + have_pinned_space = 1; goto commit_trans; + } } if (!data_sinfo) @@ -3709,26 +3929,39 @@ alloc: /* * If we don't have enough pinned space to deal with this - * allocation don't bother committing the transaction. + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction.
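 *
 * Illustrative numbers (not from the patch) for the comparison in the
 * hunk that follows; percpu_counter_compare() returns a sign the way
 * memcmp() does:
 *
 *	shortfall = used + bytes - total_bytes
 *	          = 9GiB + 2GiB - 10GiB = 1GiB
 *
 * have_pinned_space >= 0 then means total_bytes_pinned >= 1GiB, i.e. a
 * commit is only worth trying when it could actually return enough
 * pinned space to cover the shortfall.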
*/ - if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, - bytes) < 0) - committed = 1; + have_pinned_space = percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes); spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ commit_trans: - if (!committed && + if (need_commit && !atomic_read(&root->fs_info->open_ioctl_trans)) { - committed = 1; + need_commit--; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; + if (have_pinned_space >= 0 || + trans->transaction->have_free_bgs || + need_commit > 0) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + /* + * make sure that all running delayed iputs are + * done + */ + down_write(&root->fs_info->delayed_iput_sem); + up_write(&root->fs_info->delayed_iput_sem); + goto again; + } else { + btrfs_end_transaction(trans, root); + } } trace_btrfs_space_reservation(root->fs_info, @@ -3736,12 +3969,16 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } + ret = btrfs_qgroup_reserve(root, write_bytes); + if (ret) + goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); +out: spin_unlock(&data_sinfo->lock); - return 0; + return ret; } /* @@ -4298,8 +4535,13 @@ out: static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, struct btrfs_fs_info *fs_info, u64 used) { - return (used >= div_factor_fine(space_info->total_bytes, 98) && - !btrfs_fs_closing(fs_info) && + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if (space_info->bytes_used >= thresh) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } @@ -4354,10 +4596,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (!btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) return; - } while (flush_state <= COMMIT_TRANS); - - if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) - queue_work(system_unbound_wq, work); + } while (flush_state < COMMIT_TRANS); } void btrfs_init_async_reclaim_work(struct work_struct *work) @@ -4700,6 +4939,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) +{ + kfree(rsv); +} + int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) @@ -4812,10 +5056,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * csum_size * 2; - num_bytes += div64_u64(data_used + meta_used, 50); + num_bytes += div_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div_u64(meta_used, 3); return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } @@ -4998,8 +5242,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -5066,30 +5308,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, int reserve) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 csum_size; - int num_csums_per_leaf; - 
int num_csums; - int old_csums; + u64 old_csums, num_csums; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && BTRFS_I(inode)->csum_bytes == 0) return 0; - old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); if (reserve) BTRFS_I(inode)->csum_bytes += num_bytes; else BTRFS_I(inode)->csum_bytes -= num_bytes; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); - num_csums_per_leaf = (int)div64_u64(csum_size, - sizeof(struct btrfs_csum_item) + - sizeof(struct btrfs_disk_key)); - num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); - num_csums = num_csums + num_csums_per_leaf - 1; - num_csums = num_csums / num_csums_per_leaf; - - old_csums = old_csums + num_csums_per_leaf - 1; - old_csums = old_csums / num_csums_per_leaf; + num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); /* No change, no need to reserve more */ if (old_csums == num_csums) @@ -5163,8 +5393,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5172,8 +5401,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, num_bytes + - nr_extents * root->nodesize); + btrfs_qgroup_free(root, nr_extents * root->nodesize); goto out_fail; } @@ -5290,10 +5518,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); - if (root->fs_info->quota_enabled) { - btrfs_qgroup_free(root, num_bytes + - dropped * root->nodesize); - } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); @@ -5318,7 +5542,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); if (ret) return ret; @@ -5390,14 +5614,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5446,6 +5662,16 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->unused_bgs_lock); } } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->transaction->num_dirty_bgs++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; @@ -6956,15 +7182,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, return -ENOSPC; } - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_discard_extent(root, start, len, NULL); - if (pin) pin_down_extent(root, cache, 
start, len, 1); else { + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -7095,9 +7321,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { + btrfs_free_path(path); btrfs_free_and_pin_reserved_extent(root, ins->objectid, root->nodesize); - btrfs_free_path(path); return ret; } @@ -7217,7 +7443,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); @@ -7815,7 +8041,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr); + next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); if (!next) @@ -8016,7 +8242,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(trans, root, eb); + clean_tree_block(trans, root->fs_info, eb); } if (eb == root->node) { @@ -8533,10 +8759,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); +again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. 
If it already started, + * back off and let this transaction commit + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (trans->transaction->dirty_bg_run) { + u64 transid = trans->transid; + + mutex_unlock(&root->fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans, root); + + ret = btrfs_wait_for_commit(root, transid); + if (ret) + return ret; + goto again; + } + + ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8551,6 +8797,7 @@ out: alloc_flags = update_block_group_flags(root, cache->flags); check_system_chunk(trans, root, alloc_flags); } + mutex_unlock(&root->fs_info->ro_block_group_mutex); btrfs_end_transaction(trans, root); return ret; @@ -8720,7 +8967,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) min_free <<= 1; } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; - do_div(min_free, dev_min); + min_free = div64_u64(min_free, dev_min); } /* We need to do this so that we can look at pending chunks */ @@ -8992,6 +9239,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9355,7 +9603,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ inode = lookup_free_space_inode(tree_root, block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * make sure our free space cache IO is done before removing the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); if (ret) { @@ -9448,18 +9727,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&trans->transaction->dirty_bgs_lock); if (!list_empty(&block_group->dirty_list)) { - list_del_init(&block_group->dirty_list); - btrfs_put_block_group(block_group); + WARN_ON(1); + } + if (!list_empty(&block_group->io_list)) { + WARN_ON(1); } spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->key.offset); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->key.offset); + WARN_ON(block_group->space_info->disk_total + < block_group->key.offset * factor); + } block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; + 
spin_unlock(&block_group->space_info->lock); memcpy(&key, &block_group->key, sizeof(key)); @@ -9647,8 +9937,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + space_info->bytes_pinned -= block_group->pinned; + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add(&space_info->total_bytes_pinned, + -block_group->pinned); block_group->pinned = 0; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d688cfe5d496..782f3bc4651d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4514,8 +4514,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } ret = fiemap_fill_next_extent(fieinfo, em_start, disko, em_len, flags); - if (ret) + if (ret) { + if (ret == 1) + ret = 0; goto out_free; + } } out_free: free_extent_map(em); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 695b0ccfb755..c668f36898d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -97,7 +97,7 @@ struct extent_io_tree { u64 dirty_bytes; int track_uptodate; spinlock_t lock; - struct extent_io_ops *ops; + const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 84a2d1868271..58ece6558430 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, - GFP_NOFS); + btrfs_bio->csum_allocated = kmalloc_array(nblocks, + csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); return -ENOMEM; @@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root, btrfs_truncate_item(root, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(root, path, key); + btrfs_set_item_key_safe(root->fs_info, path, key); } else { BUG(); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index faa7d390841b..b072e17479aa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) defrag = rb_entry(node, struct inode_defrag, rb_node); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - if (need_resched()) { - spin_unlock(&fs_info->defrag_inodes_lock); - cond_resched(); - spin_lock(&fs_info->defrag_inodes_lock); - } + cond_resched_lock(&fs_info->defrag_inodes_lock); node = rb_first(&fs_info->defrag_inodes); } @@ -868,7 +864,7 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -1126,7 +1122,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1160,7 +1156,7 @@ again: trans->transid); 
path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); - pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes); + ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); if (ret == -ENOSPC && (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) { @@ -1635,8 +1631,8 @@ again: btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { - u64 lockstart = round_down(pos, root->sectorsize); - u64 lockend = lockstart + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + (dirty_pages << PAGE_CACHE_SHIFT) - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, @@ -1809,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occurred. */ + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; + spin_unlock(&BTRFS_I(inode)->lock); if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0) @@ -1864,7 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; @@ -2162,7 +2160,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(root, path, &key); + btrfs_set_item_key_safe(root->fs_info, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2545,7 +2543,6 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2570,14 +2567,9 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation.
*/ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); if (ret) return ret; - if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); - if (ret) - goto out_reserve_fail; - } mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); @@ -2667,23 +2659,35 @@ static long btrfs_fallocate(struct file *file, int mode, 1 << inode->i_blkbits, offset + len, &alloc_hint); - - if (ret < 0) { - free_extent_map(em); - break; - } } else if (actual_end > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + /* * We didn't need to allocate any more space, but we * still extended the size of the file so we need to - * update i_size. + * update i_size and the inode item. */ - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, + NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, + root); + } } free_extent_map(em); + if (ret < 0) + break; cur_offset = last_byte; if (cur_offset >= alloc_end) { @@ -2695,9 +2699,6 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, alloc_end - alloc_start); -out_reserve_fail: /* Let go of our reservation. 
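 *
 * (The fallocate hunk above now persists the enlarged size instead of
 * only updating it in memory; condensed from the diff, with the error
 * handling trimmed:
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	inode->i_ctime = CURRENT_TIME;
 *	i_size_write(inode, actual_end);
 *	btrfs_ordered_update_i_size(inode, actual_end, NULL);
 *	btrfs_update_inode(trans, root, inode);
 *	btrfs_end_transaction(trans, root);
 *
 * Without the btrfs_update_inode() step the new i_size lived only in
 * memory until some later inode write-back.)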
*/ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a71978578fa7..81fa75a8e1f3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, } mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_mask(inode->i_mapping) & + ~(GFP_NOFS & ~__GFP_HIGHMEM)); return inode; } @@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root, key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { btrfs_release_path(path); return ret; } + leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); @@ -225,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode) { int ret = 0; + struct btrfs_path *path = btrfs_alloc_path(); + + if (!path) { + ret = -ENOMEM; + goto fail; + } + + if (block_group) { + mutex_lock(&trans->transaction->cache_write_mutex); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + } + + /* + * now that we've truncated the cache away, it's no longer + * set up or written + */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + } + btrfs_free_path(path); btrfs_i_size_write(inode, 0); truncate_pagecache(inode, 0); @@ -235,15 +264,23 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, /* * We don't need an orphan item because truncating the free space cache * will never be split across transactions.
+ * We don't need to check for -EAGAIN because we're a free space + * cache inode */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); if (ret) { + mutex_unlock(&trans->transaction->cache_write_mutex); btrfs_abort_transaction(trans, root, ret); return ret; } ret = btrfs_update_inode(trans, root, inode); + + if (block_group) + mutex_unlock(&trans->transaction->cache_write_mutex); + +fail: if (ret) btrfs_abort_transaction(trans, root, ret); @@ -269,18 +306,7 @@ static int readahead_cache(struct inode *inode) return 0; } -struct io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_root *root; - unsigned long size; - int index; - int num_pages; - unsigned check_crcs:1; -}; - -static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, struct btrfs_root *root, int write) { int num_pages; @@ -296,45 +322,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) return -ENOSPC; - memset(io_ctl, 0, sizeof(struct io_ctl)); + memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); - io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; io_ctl->num_pages = num_pages; io_ctl->root = root; io_ctl->check_crcs = check_crcs; + io_ctl->inode = inode; return 0; } -static void io_ctl_free(struct io_ctl *io_ctl) +static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { kfree(io_ctl->pages); + io_ctl->pages = NULL; } -static void io_ctl_unmap_page(struct io_ctl *io_ctl) +static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) { if (io_ctl->cur) { - kunmap(io_ctl->page); io_ctl->cur = NULL; io_ctl->orig = NULL; } } -static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) { ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; - io_ctl->cur = kmap(io_ctl->page); + io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_CACHE_SIZE; if (clear) memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); } -static void io_ctl_drop_pages(struct io_ctl *io_ctl) +static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) { int i; @@ -349,7 +376,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, int uptodate) { struct page *page; @@ -383,7 +410,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, return 0; } -static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *val; @@ -406,7 +433,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) io_ctl->cur += sizeof(u64); } -static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *gen; @@ -435,7 +462,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) return 0; } -static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp; u32 crc = ~(u32)0; @@ -453,13 +480,12 @@ static void io_ctl_set_crc(struct 
io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; *tmp = crc; - kunmap(io_ctl->pages[0]); } -static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp, val; u32 crc = ~(u32)0; @@ -473,10 +499,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; val = *tmp; - kunmap(io_ctl->pages[0]); io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, @@ -492,7 +517,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) return 0; } -static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, +static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, void *bitmap) { struct btrfs_free_space_entry *entry; @@ -522,7 +547,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, return 0; } -static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) { if (!io_ctl->cur) return -ENOSPC; @@ -545,7 +570,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) return 0; } -static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) { /* * If we're not on the boundary we know we've modified the page and we @@ -562,7 +587,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) } } -static int io_ctl_read_entry(struct io_ctl *io_ctl, +static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; @@ -589,7 +614,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, return 0; } -static int io_ctl_read_bitmap(struct io_ctl *io_ctl, +static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry) { int ret; @@ -648,7 +673,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct io_ctl io_ctl; + struct btrfs_io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; LIST_HEAD(bitmaps); @@ -877,7 +902,7 @@ out: } static noinline_for_stack -int write_cache_extent_entries(struct io_ctl *io_ctl, +int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, int *entries, int *bitmaps, @@ -885,6 +910,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, { int ret; struct btrfs_free_cluster *cluster = NULL; + struct btrfs_free_cluster *cluster_locked = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); struct btrfs_trim_range *trim_entry; @@ -896,6 +922,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, } if (!node && cluster) { + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); node = rb_first(&cluster->root); cluster = NULL; } @@ -919,9 +947,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, node = rb_next(node); if (!node && cluster) { node = rb_first(&cluster->root); + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); cluster = NULL; } } + if (cluster_locked) { + spin_unlock(&cluster_locked->lock); + cluster_locked = 
NULL; + } /* * Make sure we don't miss any range that was removed from our rbtree @@ -939,6 +973,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, return 0; fail: + if (cluster_locked) + spin_unlock(&cluster_locked->lock); return -ENOSPC; } @@ -1000,7 +1036,7 @@ fail: static noinline_for_stack int write_pinned_extent_entries(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, - struct io_ctl *io_ctl, + struct btrfs_io_ctl *io_ctl, int *entries) { u64 start, extent_start, extent_end, len; @@ -1050,7 +1086,7 @@ write_pinned_extent_entries(struct btrfs_root *root, } static noinline_for_stack int -write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { struct list_head *pos, *n; int ret; @@ -1083,10 +1119,7 @@ static int flush_dirty_cache(struct inode *inode) } static void noinline_for_stack -cleanup_write_cache_enospc(struct inode *inode, - struct io_ctl *io_ctl, - struct extent_state **cached_state, - struct list_head *bitmap_list) +cleanup_bitmap_list(struct list_head *bitmap_list) { struct list_head *pos, *n; @@ -1095,12 +1128,85 @@ cleanup_write_cache_enospc(struct inode *inode, list_entry(pos, struct btrfs_free_space, list); list_del_init(&entry->list); } +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct btrfs_io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, cached_state, GFP_NOFS); } +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset) +{ + int ret; + struct inode *inode = io_ctl->inode; + + if (!inode) + return 0; + + if (block_group) + root = root->fs_info->tree_root; + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + io_ctl->entries, io_ctl->bitmaps); +out: + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + if (block_group) { +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); +#endif + } + } + btrfs_update_inode(trans, root, inode); + + if (block_group) { + /* the dirty list is protected by the dirty_bgs_lock */ + spin_lock(&trans->transaction->dirty_bgs_lock); + + /* the disk_cache_state is protected by the block group lock */ + spin_lock(&block_group->lock); + + /* + * only mark this as written if we didn't get put back on + * the dirty list while waiting for IO. 
Otherwise our + * cache state won't be right, and we won't get written again + */ + if (!ret && list_empty(&block_group->dirty_list)) + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + else if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + + spin_unlock(&block_group->lock); + spin_unlock(&trans->transaction->dirty_bgs_lock); + io_ctl->inode = NULL; + iput(inode); + } + + return ret; + +} + /** * __btrfs_write_out_cache - write out cached info to an inode * @root - the root the inode belongs to @@ -1117,20 +1223,22 @@ cleanup_write_cache_enospc(struct inode *inode, static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 offset) { struct extent_state *cached_state = NULL; - struct io_ctl io_ctl; LIST_HEAD(bitmap_list); int entries = 0; int bitmaps = 0; int ret; + int must_iput = 0; if (!i_size_read(inode)) return -1; - ret = io_ctl_init(&io_ctl, inode, root, 1); + WARN_ON(io_ctl->pages); + ret = io_ctl_init(io_ctl, inode, root, 1); if (ret) return -1; @@ -1143,55 +1251,57 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, up_write(&block_group->data_rwsem); BTRFS_I(inode)->generation = 0; ret = 0; + must_iput = 1; goto out; } spin_unlock(&block_group->lock); } /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(&io_ctl, inode, 0); + io_ctl_prepare_pages(io_ctl, inode, 0); lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state); - io_ctl_set_generation(&io_ctl, trans->transid); + io_ctl_set_generation(io_ctl, trans->transid); mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ - ret = write_cache_extent_entries(&io_ctl, ctl, + spin_lock(&ctl->tree_lock); + ret = write_cache_extent_entries(io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + if (ret) + goto out_nospc_locked; /* * Some spaces that are freed in the current transaction are pinned, * they will be added into free space cache after the transaction is * committed, we shouldn't lose them. + * + * If this changes while we are working we'll get added back to + * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); + if (ret) + goto out_nospc_locked; /* * At last, we write out all the bitmaps and keep cache_writeout_mutex * locked while doing it because a concurrent trim can be manipulating * or freeing the bitmap. */ - ret = write_bitmap_entries(&io_ctl, &bitmap_list); + ret = write_bitmap_entries(io_ctl, &bitmap_list); + spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); + io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. 
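 *
 * From here on __btrfs_write_out_cache() only starts the IO and stashes
 * entries/bitmaps in the io_ctl; btrfs_wait_cache_io() finishes the job
 * later. The pairing, as the inode-map caller further down in this
 * patch uses it:
 *
 *	memset(&io_ctl, 0, sizeof(io_ctl));
 *	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
 *				      trans, path, 0);
 *	if (!ret)
 *		ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl,
 *					  path, 0);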
*/ - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), &cached_state); if (ret) goto out_nospc; @@ -1202,30 +1312,44 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * Release the pages and unlock the extent, we will flush * them out later */ - io_ctl_drop_pages(&io_ctl); + io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - /* Flush the dirty pages in the cache file. */ - ret = flush_dirty_cache(inode); + /* + * at this point the pages are under IO and we're happy. + * The caller is responsible for waiting on them and updating + * the cache and the inode + */ + io_ctl->entries = entries; + io_ctl->bitmaps = bitmaps; + + ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); if (ret) goto out; - /* Update the cache item to tell everyone this cache file is valid. */ - ret = update_cache_item(trans, root, inode, path, offset, - entries, bitmaps); + return 0; + out: - io_ctl_free(&io_ctl); + io_ctl->inode = NULL; + io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); return ret; +out_nospc_locked: + cleanup_bitmap_list(&bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + out_nospc: - cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list); if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -1241,7 +1365,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; - enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1250,34 +1373,34 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); return 0; } - - if (block_group->delalloc_bytes) { - block_group->disk_cache_state = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - return 0; - } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, + &block_group->io_ctl, trans, path, block_group->key.objectid); if (ret) { - dcs = BTRFS_DC_ERROR; - ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->key.objectid); #endif + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + + block_group->io_ctl.inode = NULL; + iput(inode); } - spin_lock(&block_group->lock); - block_group->disk_cache_state = dcs; - spin_unlock(&block_group->lock); - iput(inode); + /* + * if ret == 0 the caller is expected to call btrfs_wait_cache_io + * to wait for IO and put the inode + */ + return ret; } @@ -1298,11 +1421,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u64 bytes_per_bitmap; + u32 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); 
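	/*
	 * The divide-then-multiply pair here rounds the offset down to its
	 * bitmap boundary. With illustrative values (not from the patch) of
	 * unit = 4096 and BITS_PER_BITMAP = 32768, bytes_per_bitmap is
	 * 128MiB, so (taking ctl->start = 0) an offset of 200MiB yields
	 * bitmap_start = 128MiB.
	 */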
bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1521,10 +1644,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max(max_bitmaps, 1); + max_bitmaps = max_t(u32, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1537,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * - div64_u64(size, 1024 * 1024 * 1024); + div_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1552,14 +1675,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) } /* - * we want the extent entry threshold to always be at most 1/2 the maxw + * we want the extent entry threshold to always be at most 1/2 the max * bytes we can have, or whatever is less than that. */ extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); ctl->extents_thresh = - div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -1673,7 +1796,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, */ if (*bytes >= align) { tmp = entry->offset - ctl->start + align - 1; - do_div(tmp, align); + tmp = div64_u64(tmp, align); tmp = tmp * align + ctl->start; align_off = tmp - entry->offset; } else { @@ -2402,11 +2525,8 @@ static void __btrfs_remove_free_space_cache_locked( } else { free_bitmap(ctl, info); } - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } } @@ -2431,11 +2551,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); spin_unlock(&ctl->tree_lock); @@ -3346,11 +3463,17 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; int ret; + struct btrfs_io_ctl io_ctl; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); + memset(&io_ctl, 0, sizeof(io_ctl)); + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, + trans, path, 0); + if (!ret) + ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + if (ret) { btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 88b2238a0aed..a16a029ad3b1 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -48,6 +48,8 @@ struct btrfs_free_space_op { struct btrfs_free_space *info); }; +struct btrfs_io_ctl; + struct inode *lookup_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache 
*block_group, struct btrfs_path *path); @@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset); int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); - struct inode *lookup_free_ino_inode(struct btrfs_root *root, struct btrfs_path *path); int create_free_ino_inode(struct btrfs_root *root, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 74faea3a516e..f6a596d5a637 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -456,7 +456,7 @@ again: } if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 43192e10cc43..ada4d24ed11b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -59,6 +59,7 @@ #include "backref.h" #include "hash.h" #include "props.h" +#include "qgroup.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -470,7 +471,7 @@ again: */ if (inode_need_compress(inode)) { WARN_ON(pages); - pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ goto cont; @@ -752,7 +753,6 @@ retry: } goto out_free; } - /* * here we're doing allocation and writeback of the * compressed pages @@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) if (empty) return; + down_read(&fs_info->delayed_iput_sem); + spin_lock(&fs_info->delayed_iput_lock); list_splice_init(&fs_info->delayed_iputs, &list); spin_unlock(&fs_info->delayed_iput_lock); @@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) iput(delayed->inode); kfree(delayed); } + + up_read(&root->fs_info->delayed_iput_sem); } /* @@ -4016,16 +4020,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int ret; trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (ret) goto out; @@ -4124,7 +4128,7 @@ out: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; @@ -4151,7 +4155,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, dir, 
dentry->d_inode, + err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (!err) btrfs_i_size_write(inode, 0); @@ -4162,6 +4166,21 @@ out: return err; } +static int truncate_space_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytes_deleted) +{ + int ret; + + bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, + bytes_deleted, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->bytes_reserved += bytes_deleted; + return ret; + +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4197,9 +4216,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int ret; int err = 0; u64 ino = btrfs_ino(inode); + u64 bytes_deleted = 0; + bool be_nice = 0; + bool should_throttle = 0; + bool should_end = 0; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + /* + * for non-free space inodes and ref cows, we want to back off from + * time to time + */ + if (!btrfs_is_free_space_inode(inode) && + test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + be_nice = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4229,6 +4260,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + /* + * with a 16K leaf size and 128MB extents, you can actually queue + * up a huge file in a single leaf. Most of the time that + * bytes_deleted is > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (btrfs_should_end_transaction(trans, root)) { + err = -EAGAIN; + goto error; + } + } + + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) { @@ -4371,22 +4415,39 @@ delete: } else { break; } + should_throttle = 0; + if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); + bytes_deleted += extent_num_bytes; ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset, 0); BUG_ON(ret); + if (btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_async_run_delayed_refs(root, + trans->delayed_ref_updates * 2, 0); + if (be_nice) { + if (truncate_space_check(trans, root, + extent_num_bytes)) { + should_end = 1; + } + if (btrfs_should_throttle_delayed_refs(trans, + root)) { + should_throttle = 1; + } + } } if (found_type == BTRFS_INODE_ITEM_KEY) break; if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot) { + path->slots[0] != pending_del_slot || + should_throttle || should_end) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4399,6 +4460,23 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); + if (should_throttle) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } + /* + * if we failed to refill our space rsv, bail out + * and let the transaction restart + */ + if (should_end) { + err = -EAGAIN; + goto error; + } goto search_again; } else { path->slots[0]--; @@ -4415,7 +4493,18 @@ error: if (last_size != (u64)-1 && root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); + btrfs_free_path(path); + + if (be_nice && bytes_deleted > 32 * 
1024 * 1024) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } return err; } @@ -4826,7 +4915,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; int err; @@ -4924,6 +5013,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; + int steal_from_global = 0; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); int ret; @@ -4991,9 +5081,20 @@ void btrfs_evict_inode(struct inode *inode) * hard as possible to get this to work. */ if (ret) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); + steal_from_global++; + else + steal_from_global = 0; + ret = 0; - if (ret) { + /* + * steal_from_global == 0: we reserved stuff, hooray! + * steal_from_global == 1: we didn't reserve stuff, boo! + * steal_from_global == 2: we've committed, still not a lot of + * room but maybe we'll have room in the global reserve this + * time. + * steal_from_global == 3: abandon all hope! + */ + if (steal_from_global > 2) { btrfs_warn(root->fs_info, "Could not get space for a delete, will truncate on mount %d", ret); @@ -5009,10 +5110,40 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + /* + * We can't just steal from the global reserve, we need to make + * sure there is room to do it, if not we need to commit and try + * again. + */ + if (steal_from_global) { + if (!btrfs_check_space_for_delayed_refs(trans, root)) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, + min_size); + else + ret = -ENOSPC; + } + + /* + * Couldn't steal from the global reserve, we have too much + * pending stuff built up, commit the transaction and try it + * again.
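 *
 * Walking one failing pass through this ladder (an illustrative trace,
 * condensed from the surrounding code): the reservation refill fails
 * (steal_from_global 0 -> 1); there is no room to migrate from the
 * global rsv either, so ret is -ENOSPC, the transaction is committed
 * and the loop retries (1 -> 2), hopefully with pinned space returned
 * by the commit; only after a third failure (steal_from_global > 2)
 * does eviction give up and leave the truncation for the next mount.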
+ */ + if (ret) { + ret = btrfs_commit_transaction(trans, root); + if (ret) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + continue; + } else { + steal_from_global = 0; + } + trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -ENOSPC) + if (ret != -ENOSPC && ret != -EAGAIN) break; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -5416,10 +5547,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) - inode = dentry->d_parent->d_inode; + inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root; @@ -6226,7 +6357,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); u64 index; int err; int drop_inode = 0; @@ -8129,7 +8260,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset)) return 0; - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb__after_atomic(); /* @@ -8169,7 +8300,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { - inode_dio_done(inode); + inode_dio_end(inode); flags = DIO_LOCKING | DIO_SKIP_HOLES; wakeup = false; } @@ -8188,7 +8319,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } out: if (wakeup) - inode_dio_done(inode); + inode_dio_end(inode); if (relock) mutex_lock(&inode->i_mutex); @@ -8581,7 +8712,7 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -ENOSPC) { + if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } @@ -8875,7 +9006,7 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { u64 delalloc_bytes; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); @@ -8896,8 +9027,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); struct timespec ctime = CURRENT_TIME; u64 index = 0; u64 root_objectid; @@ -9009,7 +9140,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.len); } else { ret = __btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, + d_inode(old_dentry), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) @@ -9033,12 +9164,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, dest, new_dir, - new_dentry->d_inode, + d_inode(new_dentry), new_dentry->d_name.name, new_dentry->d_name.len); } if (!ret 
&& new_inode->i_nlink == 0) - ret = btrfs_orphan_add(trans, new_dentry->d_inode); + ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_fail; @@ -9451,6 +9582,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 74609b931ba5..b05653f182c2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -456,6 +456,13 @@ static noinline int create_subvol(struct inode *dir, if (ret) return ret; + /* + * Don't create a subvolume whose level is not zero. Otherwise qgroup + * will be screwed up, since it assumes a subvolume qgroup's level to + * be 0. + */ + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * The same as the snapshot creation, please see the comment @@ -717,7 +724,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; @@ -761,10 +768,10 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) { int error; - if (!victim->d_inode) + if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(victim->d_parent->d_inode != dir); + BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); @@ -772,8 +779,8 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || + IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -792,7 +799,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -810,7 +817,7 @@ static noinline int btrfs_mksubvol(struct path *parent, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = parent->dentry->d_inode; + struct inode *dir = d_inode(parent->dentry); struct dentry *dentry; int error; @@ -824,7 +831,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out_dput; error = btrfs_may_create(dir, dentry); @@ -1564,7 +1571,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - do_div(new_size, root->sectorsize); + new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", @@ -2294,7 +2301,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, { struct dentry *parent = file->f_path.dentry; struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct
btrfs_root *dest = NULL; @@ -2333,12 +2340,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_unlock_dir; } - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { err = -ENOENT; goto out_dput; } - inode = dentry->d_inode; + inode = d_inode(dentry); dest = BTRFS_I(inode)->root; if (!capable(CAP_SYS_ADMIN)) { /* @@ -2897,6 +2904,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, if (src == dst) return -EINVAL; + if (len == 0) + return 0; + btrfs_double_lock(src, loff, dst, dst_loff, len); ret = extent_same_check_offsets(src, loff, len); @@ -3039,7 +3049,7 @@ out: static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 disko) { - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist *roots; struct ulist_iterator uiter; struct ulist_node *root_node = NULL; @@ -3202,6 +3212,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { + u64 next_key_min_offset = key.offset + 1; + /* * note the key will change type as we walk through the * tree. @@ -3282,7 +3294,7 @@ process_slot: } else if (key.offset >= off + len) { break; } - + next_key_min_offset = key.offset + datal; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), @@ -3497,7 +3509,7 @@ process_slot: break; } btrfs_release_path(path); - key.offset++; + key.offset = next_key_min_offset; } ret = 0; @@ -3626,6 +3638,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; + if (len == 0) { + ret = 0; + goto out_unlock; + } + /* verify the end result is block aligned */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || !IS_ALIGNED(destoff, bs)) @@ -4624,6 +4641,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) sa->src, sa->dst); } + /* update qgroup status and info */ + err = btrfs_run_qgroups(trans, root->fs_info); + if (err < 0) + btrfs_error(root->fs_info, ret, + "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -4669,8 +4691,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) /* FIXME: check if the IDs really exist */ if (sa->create) { - ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, - NULL); + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid); } else { ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 617553cdb7d3..a2f051347731 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -434,7 +434,7 @@ out: return ret; } -struct btrfs_compress_op btrfs_lzo_compress = { +const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h index b7816cefbd13..1b10a3cd1195 100644 --- a/fs/btrfs/math.h +++ b/fs/btrfs/math.h @@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor) if (factor == 10) return num; num *= factor; - do_div(num, 10); - return num; + return div_u64(num, 10); } static inline u64 div_factor_fine(u64 num, int factor) @@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor) if (factor == 100) return num; num *= factor; - do_div(num, 100); - return num; + return div_u64(num, 100); } #endif diff 
--git a/fs/btrfs/props.c b/fs/btrfs/props.c index 129b1dd28527..dca137b04095 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode) return NULL; } + + diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 058c79eecbfb..3d6546581bb9 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -644,9 +644,8 @@ out: } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 qgroupid, - u64 flags, u64 max_rfer, u64 max_excl, - u64 rsv_rfer, u64 rsv_excl) + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) { struct btrfs_path *path; struct btrfs_key key; @@ -657,7 +656,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, key.objectid = 0; key.type = BTRFS_QGROUP_LIMIT_KEY; - key.offset = qgroupid; + key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) @@ -673,11 +672,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); - btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); - btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); - btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); - btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); - btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); btrfs_mark_buffer_dirty(l); @@ -967,6 +966,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -982,7 +982,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, list_del(&quota_root->dirty_list); btrfs_tree_lock(quota_root->node); - clean_tree_block(trans, tree_root, quota_root->node); + clean_tree_block(trans, tree_root->fs_info, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); @@ -1001,6 +1001,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } +/* + * The easy accounting: if we are adding/removing the only ref for an extent, + * then this qgroup and all of the parent qgroups get their reference and + * exclusive counts adjusted. + * + * Caller should hold fs_info->qgroup_lock.
+ */ +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 ref_root, + u64 num_bytes, int sign) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + return ret; +} + + +/* + * Quick path for updating qgroup with only excl refs. + * + * In that case, just updating all parents will be enough. + * Otherwise we need to do a full rescan. + * Caller should also hold fs_info->qgroup_lock. + * + * Return 0 for a quick update, return >0 when a full rescan is needed + * and the INCONSISTENT flag is set. + * Return < 0 for other errors.
+ */ +static int quick_update_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 src, u64 dst, + int sign) +{ + struct btrfs_qgroup *qgroup; + int ret = 1; + int err = 0; + + qgroup = find_qgroup_rb(fs_info, src); + if (!qgroup) + goto out; + if (qgroup->excl == qgroup->rfer) { + ret = 0; + err = __qgroup_excl_accounting(fs_info, tmp, dst, + qgroup->excl, sign); + if (err < 0) { + ret = err; + goto out; + } + } +out: + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { @@ -1008,8 +1112,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + + /* Check the level of src and dst first */ + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + return -EINVAL; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1043,23 +1156,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, spin_lock(&fs_info->qgroup_lock); ret = add_relation_rb(quota_root->fs_info, src, dst); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + ret = quick_update_accounting(fs_info, tmp, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + ulist_free(tmp); return ret; } -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, +int __del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; int err; - mutex_lock(&fs_info->qgroup_ioctl_lock); + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; @@ -1088,14 +1211,27 @@ exist: spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); out: + ulist_free(tmp); + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + ret = __del_qgroup_relation(trans, fs_info, src, dst); mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; } int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) + struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -1133,6 +1269,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); @@ -1147,15 +1284,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } else { - /* check if there are no relations to this qgroup */ - if (!list_empty(&qgroup->groups) || - !list_empty(&qgroup->members)) { + /* check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { ret = -EBUSY; goto out; } } ret = del_qgroup_item(trans, quota_root, qgroupid); + while (!list_empty(&qgroup->groups)) { + list = 
list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + ret = __del_qgroup_relation(trans, fs_info, + qgroupid, + list->group->qgroupid); + if (ret) + goto out; + } + spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); @@ -1184,23 +1330,27 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } - ret = update_qgroup_limit_item(trans, quota_root, qgroupid, - limit->flags, limit->max_rfer, - limit->max_excl, limit->rsv_rfer, - limit->rsv_excl); + + spin_lock(&fs_info->qgroup_lock); + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) + qgroup->max_rfer = limit->max_rfer; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) + qgroup->max_excl = limit->max_excl; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) + qgroup->rsv_rfer = limit->rsv_rfer; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) + qgroup->rsv_excl = limit->rsv_excl; + qgroup->lim_flags |= limit->flags; + + spin_unlock(&fs_info->qgroup_lock); + + ret = update_qgroup_limit_item(trans, quota_root, qgroup); if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_info(fs_info, "unable to update quota limit for %llu", qgroupid); } - spin_lock(&fs_info->qgroup_lock); - qgroup->lim_flags = limit->flags; - qgroup->max_rfer = limit->max_rfer; - qgroup->max_excl = limit->max_excl; - qgroup->rsv_rfer = limit->rsv_rfer; - qgroup->rsv_excl = limit->rsv_excl; - spin_unlock(&fs_info->qgroup_lock); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -1256,14 +1406,14 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1, return -1; if (oper1->bytenr > oper2->bytenr) return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; if (oper1->ref_root < oper2->ref_root) return -1; if (oper1->ref_root > oper2->ref_root) return 1; + if (oper1->seq < oper2->seq) + return -1; + if (oper1->seq > oper2->seq) + return 1; if (oper1->type < oper2->type) return -1; if (oper1->type > oper2->type) @@ -1372,19 +1522,10 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, return 0; } -/* - * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and - * exclusive counts adjusted. 
- */ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_operation *oper) { - struct btrfs_qgroup *qgroup; struct ulist *tmp; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; int sign = 0; int ret = 0; @@ -1395,9 +1536,7 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->qgroup_lock); if (!fs_info->quota_root) goto out; - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; + switch (oper->type) { case BTRFS_QGROUP_OPER_ADD_EXCL: sign = 1; @@ -1408,43 +1547,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, default: ASSERT(0); } - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } - - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = u64_to_ptr(unode->aux); - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - qgroup_dirty(fs_info, qgroup); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } - } - ret = 0; + ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, + oper->num_bytes, sign); out: spin_unlock(&fs_info->qgroup_lock); ulist_free(tmp); @@ -1845,7 +1949,7 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, struct ulist *roots = NULL; struct ulist *qgroups, *tmp; struct btrfs_qgroup *qgroup; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); u64 seq; int old_roots = 0; int new_roots = 0; @@ -1967,7 +2071,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, int err; struct btrfs_qgroup *qg; u64 root_obj = 0; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); parents = ulist_alloc(GFP_NOFS); if (!parents) @@ -2156,6 +2260,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + ret = update_qgroup_limit_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; spin_lock(&fs_info->qgroup_lock); } if (fs_info->quota_enabled) @@ -2219,6 +2327,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } + + if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { + ret = -EINVAL; + goto out; + } ++i_qgroups; } } @@ -2230,17 +2343,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (ret) goto out; - if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { - ret = update_qgroup_limit_item(trans, quota_root, objectid, - inherit->lim.flags, - inherit->lim.max_rfer, - inherit->lim.max_excl, - inherit->lim.rsv_rfer, - inherit->lim.rsv_excl); - if (ret) - goto out; - } 
- if (srcid) { struct btrfs_root *srcroot; struct btrfs_key srckey; @@ -2286,6 +2388,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, goto unlock; } + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + dstgroup->lim_flags = inherit->lim.flags; + dstgroup->max_rfer = inherit->lim.max_rfer; + dstgroup->max_excl = inherit->lim.max_excl; + dstgroup->rsv_rfer = inherit->lim.rsv_rfer; + dstgroup->rsv_excl = inherit->lim.rsv_excl; + + ret = update_qgroup_limit_item(trans, quota_root, dstgroup); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + dstgroup->qgroupid); + goto unlock; + } + } + if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) @@ -2302,6 +2420,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; + + /* inherit the limit info */ + dstgroup->lim_flags = srcgroup->lim_flags; + dstgroup->max_rfer = srcgroup->max_rfer; + dstgroup->max_excl = srcgroup->max_excl; + dstgroup->rsv_rfer = srcgroup->rsv_rfer; + dstgroup->rsv_excl = srcgroup->rsv_excl; + qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); } @@ -2358,12 +2484,6 @@ out: return ret; } -/* - * reserve some space for a qgroup and all its parents. The reservation takes - * place with start_transaction or dealloc_reserve, similar to ENOSPC - * accounting. If not enough space is available, EDQUOT is returned. - * We assume that the requested space is new for all qgroups. - */ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; @@ -2513,7 +2633,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) /* * returns < 0 on error, 0 when more leafs are to be scanned. - * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. + * returns 1 when done. */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, @@ -2522,7 +2642,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, { struct btrfs_key found; struct ulist *roots = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; u64 seq; int new_roots; @@ -2618,6 +2738,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -2660,7 +2781,7 @@ out: mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err == 2 && + if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } else if (err < 0) { @@ -2668,13 +2789,33 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); + /* + * only update status, since the previous part has already updated the + * qgroup info.
+ */ + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_err(fs_info, + "fail to start transaction for status update: %d\n", + err); + goto done; + } + ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d\n", err); + } + btrfs_end_transaction(trans, fs_info->quota_root); + if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", - err == 2 ? " (inconsistency flag cleared)" : ""); + err > 0 ? " (inconsistency flag cleared)" : ""); } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } +done: complete_all(&fs_info->qgroup_rescan_completion); } @@ -2709,7 +2850,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); goto err; } - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 18cc68ca3090..c5242aa9a4b2 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -70,8 +70,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); + struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5264858ed768..fa72068bd256 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) { - if (is_vmalloc_addr(x)) - vfree(x); - else - kfree(x); - } + if (x) + kvfree(x); return 0; } @@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) if (!info->stripe_hash_table) return; btrfs_clear_rbio_cache(info); - if (is_vmalloc_addr(info->stripe_hash_table)) - vfree(info->stripe_hash_table); - else - kfree(info->stripe_hash_table); + kvfree(info->stripe_hash_table); info->stripe_hash_table = NULL; } @@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int err; int i; - pointers = kzalloc(rbio->real_stripes * sizeof(void *), - GFP_NOFS); + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = -ENOMEM; goto cleanup_io; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d83085381bcc..74b24b01d574 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3027,7 +3027,7 @@ int prealloc_file_extent_cluster(struct inode *inode, mutex_lock(&inode->i_mutex); ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start); + 1 - cluster->start, 0); if (ret) goto out; @@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc, } static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct inode *inode, u64 ino) + struct btrfs_block_group_cache *block_group, + struct inode *inode, + u64 ino) { struct btrfs_key key; struct btrfs_root *root = fs_info->tree_root; @@ -3463,7 +3465,7 @@ truncate: goto out; } - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); btrfs_end_transaction(trans, root); 
btrfs_btree_balance_dirty(root); @@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc, */ if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { ret = delete_block_group_cache(rc->extent_root->fs_info, + rc->block_group, NULL, ref_objectid); if (ret != -ENOENT) return ret; @@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, inode, 0); + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); else ret = PTR_ERR(inode); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ec57687c9a4d..ab5811545a98 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * the statistics. */ - sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * - sizeof(*sblocks_for_recheck), - GFP_NOFS); + sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, + sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - int offset; + u32 offset; int nsectors; int sectorsize = sparity->sctx->dev_root->sectorsize; @@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - offset = (int)do_div(start, sparity->stripe_len); + start = div_u64_rem(start, sparity->stripe_len, &offset); offset /= sectorsize; nsectors = (int)len / sectorsize; @@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num, int j = 0; u64 stripe_nr; u64 last_offset; - int stripe_index; - int rot; + u32 stripe_index; + u32 rot; last_offset = (physical - map->stripes[num].physical) * nr_data_stripes(map); @@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = *offset; - do_div(stripe_nr, map->stripe_len); - do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ - rot = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -2995,10 +2993,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nstripes = length; physical = map->stripes[num].physical; offset = 0; - do_div(nstripes, map->stripe_len); + nstripes = div_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; @@ -3563,7 +3560,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { int ret = 0; - int flags = WQ_FREEZABLE | WQ_UNBOUND; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d6033f540cc7..a1216f9b4917 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3067,48 +3067,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } -static int path_loop(struct send_ctx *sctx, 
struct fs_path *name, - u64 ino, u64 gen, u64 *ancestor_ino) -{ - int ret = 0; - u64 parent_inode = 0; - u64 parent_gen = 0; - u64 start_ino = ino; - - *ancestor_ino = 0; - while (ino != BTRFS_FIRST_FREE_OBJECTID) { - fs_path_reset(name); - - if (is_waiting_for_rm(sctx, ino)) - break; - if (is_waiting_for_move(sctx, ino)) { - if (*ancestor_ino == 0) - *ancestor_ino = ino; - ret = get_first_ref(sctx->parent_root, ino, - &parent_inode, &parent_gen, name); - } else { - ret = __get_cur_name_and_parent(sctx, ino, gen, - &parent_inode, - &parent_gen, name); - if (ret > 0) { - ret = 0; - break; - } - } - if (ret < 0) - break; - if (parent_inode == start_ino) { - ret = 1; - if (*ancestor_ino == 0) - *ancestor_ino = ino; - break; - } - ino = parent_inode; - gen = parent_gen; - } - return ret; -} - static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3120,7 +3078,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; int ret; - u64 ancestor = 0; name = fs_path_alloc(); from_path = fs_path_alloc(); @@ -3152,22 +3109,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; sctx->send_progress = sctx->cur_ino + 1; - ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); - if (ret) { - LIST_HEAD(deleted_refs); - ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); - ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs, - pm->is_orphan); - if (ret < 0) - goto out; - if (rmdir_ino) { - dm = get_waiting_dir_move(sctx, pm->ino); - ASSERT(dm); - dm->rmdir_ino = rmdir_ino; - } - goto out; - } fs_path_reset(name); to_path = name; name = NULL; @@ -3610,10 +3551,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { + struct name_cache_entry *nce; + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + /* + * Make sure we clear our orphanized inode's + * name from the name cache. This is because the + * inode ow_inode might be an ancestor of some + * other inode that will also be orphanized + * later and has an inode number greater than + * sctx->send_progress. We need to prevent + * future name lookups from using the old name + * and instead get the orphan name.
+ */ + nce = name_cache_search(sctx, ow_inode, ow_gen); + if (nce) { + name_cache_delete(sctx, nce); + kfree(nce); + } } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -5852,19 +5810,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) ret = PTR_ERR(clone_root); goto out; } - clone_sources_to_rollback = i + 1; spin_lock(&clone_root->root_item_lock); - clone_root->send_in_progress++; - if (!btrfs_root_readonly(clone_root)) { + if (!btrfs_root_readonly(clone_root) || + btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } + clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; + clone_sources_to_rollback = i + 1; } vfree(clone_sources_tmp); clone_sources_tmp = NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 05fef198ff94..9e66f5e724db 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -901,6 +901,15 @@ find_root: if (IS_ERR(new_root)) return ERR_CAST(new_root); + if (!(sb->s_flags & MS_RDONLY)) { + int ret; + down_read(&fs_info->cleanup_work_sem); + ret = btrfs_orphan_cleanup(new_root); + up_read(&fs_info->cleanup_work_sem); + if (ret) + return ERR_PTR(ret); + } + dir_id = btrfs_root_dirid(&new_root->root_item); setup_root: location.objectid = dir_id; @@ -916,7 +925,7 @@ setup_root: * a reference to the dentry. We will have already gotten a reference * to the inode in btrfs_fill_super so we're good to go. */ - if (!new && sb->s_root->d_inode == inode) { + if (!new && d_inode(sb->s_root) == inode) { iput(inode); return dget(sb->s_root); } @@ -1221,7 +1230,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, root = mount_subtree(mnt, subvol_name); - if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { struct super_block *s = root->d_sb; dput(root); root = ERR_PTR(-EINVAL); @@ -1714,7 +1723,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); avail_space *= BTRFS_STRIPE_LEN; /* @@ -1886,8 +1895,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; return 0; } @@ -1908,6 +1917,17 @@ static struct file_system_type btrfs_fs_type = { }; MODULE_ALIAS_FS("btrfs"); +static int btrfs_control_open(struct inode *inode, struct file *file) +{ + /* + * The control file's private_data is used to hold the + * transaction when it is started and is used to keep + * track of whether a transaction is already in progress. 
+ */ + file->private_data = NULL; + return 0; +} + /* * used by btrfsctl to scan devices when no FS is mounted */ @@ -2009,6 +2029,7 @@ static const struct super_operations btrfs_super_ops = { }; static const struct file_operations btrfs_ctl_fops = { + .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, .compat_ioctl = btrfs_control_ioctl, .owner = THIS_MODULE, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 94edb0a2a026..e8a4c86d274d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -459,7 +459,7 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; -static u64 supported_feature_masks[3] = { +static const u64 supported_feature_masks[3] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index f7dd298b3cf6..3a4bbed723fd 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -61,11 +61,23 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) /* convert from attribute */ -#define to_btrfs_feature_attr(a) \ - container_of(a, struct btrfs_feature_attr, kobj_attr) -#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) -#define attr_to_btrfs_feature_attr(a) \ - to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +static inline struct btrfs_feature_attr * +to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static inline struct btrfs_feature_attr * +attr_to_btrfs_feature_attr(struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 73f299ebdabb..c32a7ba76bca 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -232,7 +232,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 5); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; @@ -301,7 +301,7 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 256); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8be4278e25e8..5628e25250c0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -35,7 +35,7 @@ #define BTRFS_ROOT_TRANS_TAG 0 -static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { +static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | __TRANS_START), @@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) if 
(atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + if (transaction->delayed_refs.pending_csums) + printk(KERN_ERR "pending csums is %llu\n", + transaction->delayed_refs.pending_csums); while (!list_empty(&transaction->pending_chunks)) { struct extent_map *em; @@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree) */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); - } + + cond_resched_lock(&tree->lock); } spin_unlock(&tree->lock); } @@ -222,10 +222,12 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); + cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.pending_csums = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; @@ -250,6 +252,9 @@ loop: INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); + INIT_LIST_HEAD(&cur_trans->io_bgs); + mutex_init(&cur_trans->cache_write_mutex); + cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -721,7 +726,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { - err = btrfs_run_delayed_refs(trans, root, updates); + err = btrfs_run_delayed_refs(trans, root, updates * 2); if (err) /* Error code will also eval true */ return err; } @@ -1057,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; + struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; struct extent_buffer *eb; int ret; @@ -1110,7 +1116,7 @@ again: return ret; } - while (!list_empty(dirty_bgs)) { + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans, root); if (ret) return ret; @@ -1810,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } + if (!cur_trans->dirty_bg_run) { + int run_it = 0; + + /* this mutex is also taken before trying to set + * block groups readonly. We need to make sure + * that nobody has set a block group readonly + * after extents from that block group have been + * allocated for cache files. btrfs_set_block_group_ro + * will wait for the transaction to commit if it + * finds dirty_bg_run = 1. + * + * The dirty_bg_run flag is also used to make sure only + * one process starts all the block group IO. It wouldn't + * hurt to have more than one go through, but there's no + * real advantage to it either.
+ */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (!cur_trans->dirty_bg_run) { + run_it = 1; + cur_trans->dirty_bg_run = 1; + } + mutex_unlock(&root->fs_info->ro_block_group_mutex); + + if (run_it) + ret = btrfs_start_dirty_block_groups(trans, root); + } + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + spin_lock(&root->fs_info->trans_lock); list_splice(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { @@ -2003,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 937050a2b68e..0b24755596ba 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -64,9 +64,19 @@ struct btrfs_transaction { struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; + struct list_head io_bgs; + u64 num_dirty_bgs; + + /* + * we need to make sure block group deletion doesn't race with + * free space cache writeout. This mutex keeps them from stomping + * on each other + */ + struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; + int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -136,9 +146,11 @@ struct btrfs_pending_snapshot { static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + spin_unlock(&BTRFS_I(inode)->lock); } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c5b8ba37f88e..d04968374e9d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -492,11 +492,19 @@ insert: if (btrfs_inode_generation(eb, src_item) == 0) { struct extent_buffer *dst_eb = path->nodes[0]; + const u64 ino_size = btrfs_inode_size(eb, src_item); + /* + * For regular files an ino_size == 0 is used only when + * logging that an inode exists, as part of a directory + * fsync, and the inode wasn't fsynced before. In this + * case don't set the size of the inode in the fs/subvol + * tree, otherwise we would be throwing valid data away. 
+ */ if (S_ISREG(btrfs_inode_mode(eb, src_item)) && - S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && + ino_size != 0) { struct btrfs_map_token token; - u64 ino_size = btrfs_inode_size(eb, src_item); btrfs_init_map_token(&token); btrfs_set_token_inode_size(dst_eb, dst_item, @@ -1951,6 +1959,104 @@ out: return ret; } +static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + const u64 ino) +{ + struct btrfs_key search_key; + struct btrfs_path *log_path; + int i; + int nritems; + int ret; + + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_XATTR_ITEM_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; +process_leaf: + nritems = btrfs_header_nritems(path->nodes[0]); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + u32 total_size; + u32 cur; + + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size_nr(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u32 this_len = sizeof(*di) + name_len + data_len; + char *name; + + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(path->nodes[0], name, + (unsigned long)(di + 1), name_len); + + log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, + name, name_len, 0); + btrfs_release_path(log_path); + if (!log_di) { + /* Doesn't exist in log tree, so delete it. */ + btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, ino, + name, name_len, -1); + kfree(name); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + ASSERT(di); + ret = btrfs_delete_one_dir_name(trans, root, + path, di); + if (ret) + goto out; + btrfs_release_path(path); + search_key = key; + goto again; + } + kfree(name); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } + } + ret = btrfs_next_leaf(root, path); + if (ret > 0) + ret = 0; + else if (ret == 0) + goto process_leaf; +out: + btrfs_free_path(log_path); + btrfs_release_path(path); + return ret; +} + + /* * deletion replay happens before we copy any new directory items * out of the log or out of backreferences from inodes. 
It @@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + ret = replay_xattr_deletes(wc->trans, root, log, + path, key.objectid); + if (ret) + break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, @@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, log, next); + clean_tree_block(trans, log->fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, + struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; @@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, src = path->nodes[0]; nritems = btrfs_header_nritems(src); for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + btrfs_item_key_to_cpu(src, &min_key, i); if (min_key.objectid != ino || min_key.type != key_type) @@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, err = ret; goto done; } + + /* + * We must make sure that when we log a directory entry, + * the corresponding inode, after log replay, has a + * matching link count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * <crash> + * <mount fs and log replay> + * + * Would result in an fsync log that, when replayed, leaves + * our file inode with a link count of 1, but with two + * directory entries pointing to the same inode. + * After removing one of the names, it would not be + * possible to remove the other name, which always resulted + * in stale file handle errors, and it would not + * be possible to rmdir the parent directory, since + * its i_size could never decrement to the value + * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+ */ + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &tmp); + if (ctx && + (btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + tmp.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; } path->slots[0] = nritems; @@ -3175,7 +3321,8 @@ done: static noinline int log_directory_changes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { u64 min_key; u64 max_key; @@ -3187,7 +3334,7 @@ again: max_key = 0; while (1) { ret = log_dir_items(trans, root, inode, path, - dst_path, key_type, min_key, + dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -3963,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, if (ret < 0) { return ret; } else if (ret > 0) { - *size_ret = i_size_read(inode); + *size_ret = 0; } else { struct btrfs_inode_item *item; @@ -4070,10 +4217,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) { - max_key_type = BTRFS_INODE_EXTREF_KEY; - max_key.type = max_key_type; - } + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { if (inode_only == LOG_INODE_EXISTS) { @@ -4098,7 +4243,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { - max_key.type = BTRFS_INODE_EXTREF_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4106,20 +4251,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); + while(1) { + ret = btrfs_truncate_inode_items(trans, + log, inode, 0, 0); + if (ret != -EAGAIN) + break; + } } - } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); + if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; - } else { - max_key.type = BTRFS_INODE_EXTREF_KEY; - } + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4277,15 +4421,18 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path, + ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + spin_unlock(&BTRFS_I(inode)->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); @@ -4327,9 +4474,9 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, goto out; if (!S_ISDIR(inode->i_mode)) { - if (!parent || !parent->d_inode || sb != 
parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) goto out; - inode = parent->d_inode; + inode = d_inode(parent); } while (1) { @@ -4355,7 +4502,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; } - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; if (IS_ROOT(parent)) @@ -4364,7 +4511,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, parent = dget_parent(parent); dput(old_parent); old_parent = parent; - inode = parent->d_inode; + inode = d_inode(parent); } dput(old_parent); @@ -4372,6 +4519,181 @@ out: return ret; } +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. See log_dir_items() for + * details about the why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not lock their i_mutex, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not locking i_mutex of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and + * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names. This does not result in a problem because if a dir_item key is + * logged but its matching dir_index key is not logged, at log replay time we + * don't use it to replay the respective name (see replay_one_name()). On the + * other hand if only the dir_index key ends up being logged, the respective + * name is added to the fs/subvol tree with both the dir_item and dir_index + * keys created (see replay_one_name()). + * The directory's inode item with a wrong i_size is not a problem as well, + * since we don't use it at log replay time to set the i_size in the inode + * item of the fs/subvol tree (see overwrite_item()). 
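+	 *
+	 * The walk below is breadth-first: starting from the fsynced
+	 * directory it searches the log tree for dir items added in the
+	 * current transaction, logs the inode each one points to, and
+	 * queues every directory whose logging reported new dentries of
+	 * its own, so it receives the same treatment.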
+ */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + btrfs_free_path(path); + return -ENOMEM; + } + dir_elem->ino = btrfs_ino(start_inode); + list_add_tail(&dir_elem->list, &dir_list); + + while (!list_empty(&dir_list)) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + int nritems; + int i; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, + list); + if (ret) + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; + min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(log, &min_key, path, trans->transid); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + +process_leaf: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + struct btrfs_dir_list *new_dir_elem; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || + min_key.type != BTRFS_DIR_ITEM_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid && + type != BTRFS_FT_DIR) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(root->fs_info->sb, &di_key, + root, NULL); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto next_dir_inode; + } + + if (btrfs_inode_in_log(di_inode, trans->transid)) { + iput(di_inode); + continue; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + btrfs_release_path(path); + ret = btrfs_log_inode(trans, root, di_inode, + log_mode, 0, LLONG_MAX, ctx); + iput(di_inode); + if (ret) + goto next_dir_inode; + if (ctx->log_new_dentries) { + new_dir_elem = kmalloc(sizeof(*new_dir_elem), + GFP_NOFS); + if (!new_dir_elem) { + ret = -ENOMEM; + goto next_dir_inode; + } + new_dir_elem->ino = di_key.objectid; + list_add_tail(&new_dir_elem->list, &dir_list); + } + break; + } + if (i == nritems) { + ret = btrfs_next_leaf(log, path); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + goto process_leaf; + } + if (min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } +next_dir_inode: + list_del(&dir_elem->list); + kfree(dir_elem); + } + + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. 
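 * If logging the inode flagged new dentries in the log context,
 * btrfs_log_inode_parent() finishes by handing the directory to
 * log_new_dir_dentries() above.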
A minimal inode and backref @@ -4394,6 +4716,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, const struct dentry * const first_parent = parent; const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > last_committed); + bool log_dentries = false; + struct inode *orig_inode = inode; sb = inode->i_sb; @@ -4449,11 +4773,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + log_dentries = true; + while (1) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; - inode = parent->d_inode; + inode = d_inode(parent); if (root != BTRFS_I(inode)->root) break; @@ -4485,7 +4812,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, dput(old_parent); old_parent = parent; } - ret = 0; + if (log_dentries) + ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + else + ret = 0; end_trans: dput(old_parent); if (ret < 0) { @@ -4515,7 +4845,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, start, end, 0, ctx); dput(parent); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 154990c26dcb..6916a781ea02 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -29,6 +29,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; int io_err; + bool log_new_dentries; struct list_head list; }; @@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; + ctx->log_new_dentries = false; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8222f6f74147..8bcd2a007517 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -366,8 +366,8 @@ loop_lock: btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; - if (need_resched()) - cond_resched(); + + cond_resched(); /* * we made progress, there is more work to do and the bdi @@ -400,8 +400,7 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; - if (need_resched()) - cond_resched(); + cond_resched(); continue; } spin_lock(&device->io_lock); @@ -609,8 +608,7 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step) +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; struct btrfs_device *latest_dev = NULL; @@ -1136,11 +1134,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: + max_hole_start = search_start; max_hole_size = 0; - hole_size = 0; +again: if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; goto out; @@ -1233,21 +1231,23 @@ next: * allocated dev extents, and when shrinking the device, * search_end may be smaller than search_start. 
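 * A trailing hole is only a candidate for the maximum if no pending
 * extent lies inside it; when one does, the search restarts from the
 * adjusted offset while keeping the running maximum. The final
 * -ENOSPC decision is therefore made on max_hole_size rather than on
 * whichever hole happened to be examined last.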
*/ - if (search_end > search_start) + if (search_end > search_start) { hole_size = search_end - search_start; - if (hole_size > max_hole_size) { - max_hole_start = search_start; - max_hole_size = hole_size; - } + if (contains_pending_extent(trans, device, &search_start, + hole_size)) { + btrfs_release_path(path); + goto again; + } - if (contains_pending_extent(trans, device, &search_start, hole_size)) { - btrfs_release_path(path); - goto again; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } } /* See above. */ - if (hole_size < num_bytes) + if (max_hole_size < num_bytes) ret = -ENOSPC; else ret = 0; @@ -2487,8 +2487,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, + struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { int ret; @@ -2580,7 +2579,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct map_lookup *map; u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - u64 chunk_tree = root->fs_info->chunk_root->objectid; int i, ret = 0; /* Just in case */ @@ -2634,8 +2632,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } } - ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -2664,8 +2661,8 @@ out: } static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) + u64 chunk_objectid, + u64 chunk_offset) { struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; @@ -2707,7 +2704,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) struct btrfs_chunk *chunk; struct btrfs_key key; struct btrfs_key found_key; - u64 chunk_tree = chunk_root->root_key.objectid; u64 chunk_type; bool retried = false; int failed = 0; @@ -2744,7 +2740,7 @@ again: btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + ret = btrfs_relocate_chunk(chunk_root, found_key.objectid, found_key.offset); if (ret == -ENOSPC) @@ -3022,7 +3018,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, stripe_offset = btrfs_stripe_offset(leaf, stripe); stripe_length = btrfs_chunk_length(leaf, chunk); - do_div(stripe_length, factor); + stripe_length = div_u64(stripe_length, factor); if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) @@ -3255,7 +3251,6 @@ again: } ret = btrfs_relocate_chunk(chunk_root, - chunk_root->root_key.objectid, found_key.objectid, found_key.offset); if (ret && ret != -ENOSPC) @@ -3957,7 +3952,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; - u64 chunk_tree; u64 chunk_objectid; u64 chunk_offset; int ret; @@ -4027,13 +4021,11 @@ again: break; } - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); - ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); if (ret && ret != -ENOSPC) goto done; if (ret == -ENOSPC) @@ -4131,7 +4123,7 @@ static int btrfs_cmp_device_info(const 
void *a, const void *b) return 0; } -static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, @@ -4289,7 +4281,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); - devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return -ENOMEM; @@ -4400,8 +4392,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, */ if (stripe_size * data_stripes > max_chunk_size) { u64 mask = (1ULL << 24) - 1; - stripe_size = max_chunk_size; - do_div(stripe_size, data_stripes); + + stripe_size = div_u64(max_chunk_size, data_stripes); /* bump the answer up to a 16MB boundary */ stripe_size = (stripe_size + mask) & ~mask; @@ -4413,10 +4405,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; } - do_div(stripe_size, dev_stripes); + stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, raid_stripe_len); + stripe_size = div_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4954,7 +4946,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - int stripe_index; + u32 stripe_index; int i; int ret = 0; int num_stripes; @@ -4995,7 +4987,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, stripe_len); + stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); @@ -5011,7 +5003,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* allow a write of a full stripe, but make sure we don't * allow straddling of stripes */ - do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); raid56_full_stripe_start *= full_stripe_len; } @@ -5136,7 +5129,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_nr_orig = stripe_nr; stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - do_div(stripe_nr_end, map->stripe_len); + stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); @@ -5144,7 +5137,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { @@ -5170,9 +5164,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; + u32 factor = map->num_stripes / map->sub_stripes; - stripe_index = do_div(stripe_nr, factor); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; if 
(rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) @@ -5198,8 +5192,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = raid56_full_stripe_start; - do_div(stripe_nr, stripe_len * nr_data_stripes(map)); + stripe_nr = div_u64(raid56_full_stripe_start, + stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -5209,32 +5203,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_offset = 0; } else { - u64 tmp; - /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. * Mirror #3 is RAID6 Q block. */ - stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64_rem(stripe_nr, + nr_data_stripes(map), &stripe_index); if (mirror_num > 1) stripe_index = nr_data_stripes(map) + mirror_num - 2; /* We distribute the parity blocks across stripes */ - tmp = stripe_nr + stripe_index; - stripe_index = do_div(tmp, map->num_stripes); + div_u64_rem(stripe_nr + stripe_index, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && mirror_num <= 1) mirror_num = 1; } } else { /* - * after this do_div call, stripe_nr is the number of stripes - * on this device we have to walk to find the data, and - * stripe_index is the number of our device in the stripe array + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array */ - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); @@ -5261,7 +5255,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { u64 tmp; - int i, rot; + unsigned rot; bbio->raid_map = (u64 *)((void *)bbio->stripes + sizeof(struct btrfs_bio_stripe) * @@ -5269,8 +5263,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, sizeof(int) * tgtdev_indexes); /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); + div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ tmp = stripe_nr * nr_data_stripes(map); @@ -5285,8 +5278,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } if (rw & REQ_DISCARD) { - int factor = 0; - int sub_stripes = 0; + u32 factor = 0; + u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; @@ -5437,9 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - u64 length = map->stripe_len; - - if (physical_of_found + length <= + if (physical_of_found + map->stripe_len <= dev_replace->cursor_left) { struct btrfs_bio_stripe *tgtdev_stripe = bbio->stripes + num_stripes; @@ -5535,15 +5526,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, rmap_len = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID10) - do_div(length, map->num_stripes / map->sub_stripes); + length = div_u64(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - do_div(length, map->num_stripes); + length = div_u64(length, map->num_stripes); else if (map->type & 
BTRFS_BLOCK_GROUP_RAID56_MASK) { - do_div(length, nr_data_stripes(map)); + length = div_u64(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } - buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { @@ -5554,11 +5545,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, continue; stripe_nr = physical - map->stripes[i].physical; - do_div(stripe_nr, map->stripe_len); + stripe_nr = div_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; - do_div(stripe_nr, map->sub_stripes); + stripe_nr = div_u64(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; } /* else if RAID[56], multiply by nr_data_stripes(). @@ -5835,8 +5826,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 length = 0; u64 map_length; int ret; - int dev_nr = 0; - int total_devs = 1; + int dev_nr; + int total_devs; struct btrfs_bio *bbio = NULL; length = bio->bi_iter.bi_size; @@ -5877,11 +5868,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG(); } - while (dev_nr < total_devs) { + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); - dev_nr++; continue; } @@ -5894,7 +5884,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = breakup_stripe_bio(root, bbio, first_bio, dev, dev_nr, rw, async_submit); BUG_ON(ret); - dev_nr++; continue; } @@ -5909,7 +5898,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, async_submit); - dev_nr++; } btrfs_bio_counter_dec(root->fs_info); return 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 83069dec6898..ebc31331a837 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -421,8 +421,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step); +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 883b93623bc5..6f518c90e1c1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -261,7 +261,7 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key, found_key; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { /* * Check if the attribute is in a supported namespace. * - * This applied after the check for the synthetic attributes in the system + * This is applied after the check for the synthetic attributes in the system * namespace. 
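 * Returns 0 when the name carries a supported prefix followed by a
 * non-empty remainder, -EOPNOTSUPP for an unknown namespace, and
 * -EINVAL when the name consists of nothing but the prefix.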
*/ -static bool btrfs_is_valid_xattr(const char *name) +static int btrfs_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN) || - !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || - !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || - !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); + int len = strlen(name); + int prefixlen = 0; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) + prefixlen = XATTR_SECURITY_PREFIX_LEN; + else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + prefixlen = XATTR_SYSTEM_PREFIX_LEN; + else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + prefixlen = XATTR_TRUSTED_PREFIX_LEN; + else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + prefixlen = XATTR_USER_PREFIX_LEN; + else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + prefixlen = XATTR_BTRFS_PREFIX_LEN; + else + return -EOPNOTSUPP; + + /* + * The name cannot consist of just prefix + */ + if (len <= prefixlen) + return -EINVAL; + + return 0; } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { + int ret; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -388,15 +408,17 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_getxattr(dentry, name, buffer, size); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; - return __btrfs_getxattr(dentry->d_inode, name, buffer, size); + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; + return __btrfs_getxattr(d_inode(dentry), name, buffer, size); } int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -413,23 +435,25 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, size, flags); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); if (size == 0) value = ""; /* empty EA, do not remove */ - return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -446,14 +470,15 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_removexattr(dentry, name); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return 
btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, NULL, 0, XATTR_REPLACE); - return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index fb22fd8d8fb8..82990b8f872b 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -403,7 +403,7 @@ next: return ret; } -struct btrfs_compress_op btrfs_zlib_compress = { +const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index fbb08e97438d..6af790fc3df8 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -123,11 +123,11 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; - if (!root->d_inode || - !root->d_inode->i_op->lookup || - !root->d_inode->i_op->mkdir || - !root->d_inode->i_op->setxattr || - !root->d_inode->i_op->getxattr || + if (d_is_negative(root) || + !d_backing_inode(root)->i_op->lookup || + !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->setxattr || + !d_backing_inode(root)->i_op->getxattr || !root->d_sb->s_op->statfs || !root->d_sb->s_op->sync_fs) goto error_unsupported; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 232426214fdd..afa023dded5b 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -441,12 +441,12 @@ static int cachefiles_attr_changed(struct fscache_object *_object) fscache_set_store_limit(&object->fscache, ni_size); - oi_size = i_size_read(object->backer->d_inode); + oi_size = i_size_read(d_backing_inode(object->backer)); if (oi_size == ni_size) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&object->backer->d_inode->i_mutex); + mutex_lock(&d_inode(object->backer)->i_mutex); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&object->backer->d_inode->i_mutex); + mutex_unlock(&d_inode(object->backer)->i_mutex); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1e51714eb33e..ab857ab9f40d 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -286,13 +286,13 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); } else { - ret = vfs_unlink(dir->d_inode, rep, NULL); + ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) cachefiles_mark_object_buried(cache, rep); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -303,7 +303,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -355,7 +355,7 @@ try_again: return -EIO; } - if (grave->d_inode) { + if (d_is_positive(grave)) { unlock_rename(cache->graveyard, dir); dput(grave); grave = NULL; @@ 
-387,8 +387,8 @@ try_again: if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { - ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave, NULL, 0); + ret = vfs_rename(d_inode(dir), rep, + d_inode(cache->graveyard), grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); @@ -415,18 +415,18 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); ASSERT(object->dentry); - ASSERT(object->dentry->d_inode); + ASSERT(d_backing_inode(object->dentry)); ASSERT(object->dentry->d_parent); dir = dget_parent(object->dentry); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -438,7 +438,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = 0; } } @@ -473,7 +473,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, path.mnt = cache->mnt; ASSERT(parent->dentry); - ASSERT(parent->dentry->d_inode); + ASSERT(d_backing_inode(parent->dentry)); if (!(d_is_dir(parent->dentry))) { // TODO: convert file to dir @@ -497,7 +497,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -505,21 +505,21 @@ lookup_again: if (IS_ERR(next)) goto lookup_error; - _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); + _debug("next -> %p %s", next, d_backing_inode(next) ? 
"positive" : "negative"); if (!key) - object->new = !next->d_inode; + object->new = !d_backing_inode(next); /* if this element of the path doesn't exist, then the lookup phase * failed, and we can release any readers in the certain knowledge that * there's nothing for them to actually read */ - if (!next->d_inode) + if (d_is_negative(next)) fscache_object_lookup_negative(&object->fscache); /* we need to create the object if it's negative */ if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { /* index objects and intervening tree levels must be subdirs */ - if (!next->d_inode) { + if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) goto create_error; @@ -529,26 +529,26 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_mkdir(dir->d_inode, next, 0); + ret = vfs_mkdir(d_inode(dir), next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); if (ret < 0) goto create_error; - ASSERT(next->d_inode); + ASSERT(d_backing_inode(next)); _debug("mkdir -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); + next, d_backing_inode(next), d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next)) { pr_err("inode %lu is not a directory\n", - next->d_inode->i_ino); + d_backing_inode(next)->i_ino); ret = -ENOBUFS; goto error; } } else { /* non-index objects start out life as files */ - if (!next->d_inode) { + if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) goto create_error; @@ -558,21 +558,21 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_create(dir->d_inode, next, S_IFREG, true); + ret = vfs_create(d_inode(dir), next, S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); if (ret < 0) goto create_error; - ASSERT(next->d_inode); + ASSERT(d_backing_inode(next)); _debug("create -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); + next, d_backing_inode(next), d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next) && !d_is_reg(next) ) { pr_err("inode %lu is not a file or directory\n", - next->d_inode->i_ino); + d_backing_inode(next)->i_ino); ret = -ENOBUFS; goto error; } @@ -581,7 +581,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); dir = next; next = NULL; @@ -617,7 +617,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); dir = NULL; @@ -646,7 +646,7 @@ lookup_again: const struct address_space_operations *aops; ret = -EPERM; - aops = object->dentry->d_inode->i_mapping->a_ops; + aops = d_backing_inode(object->dentry)->i_mapping->a_ops; if (!aops->bmap) goto check_error; @@ -659,7 +659,7 @@ lookup_again: object->new = 0; fscache_obtained_object(&object->fscache); - _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); + _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); return 0; create_error: @@ -695,7 +695,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(next); error_out2: dput(dir); @@ -719,7 +719,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); 
start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -731,10 +731,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } _debug("subdir -> %p %s", - subdir, subdir->d_inode ? "positive" : "negative"); + subdir, d_backing_inode(subdir) ? "positive" : "negative"); /* we need to create the subdir if it doesn't exist yet */ - if (!subdir->d_inode) { + if (d_is_negative(subdir)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) goto mkdir_error; @@ -746,22 +746,22 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - ret = vfs_mkdir(dir->d_inode, subdir, 0700); + ret = vfs_mkdir(d_inode(dir), subdir, 0700); if (ret < 0) goto mkdir_error; - ASSERT(subdir->d_inode); + ASSERT(d_backing_inode(subdir)); _debug("mkdir -> %p{%p{ino=%lu}}", subdir, - subdir->d_inode, - subdir->d_inode->i_ino); + d_backing_inode(subdir), + d_backing_inode(subdir)->i_ino); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); /* we need to make sure the subdir is a directory */ - ASSERT(subdir->d_inode); + ASSERT(d_backing_inode(subdir)); if (!d_can_lookup(subdir)) { pr_err("%s is not a directory\n", dirname); @@ -770,18 +770,18 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op->setxattr || - !subdir->d_inode->i_op->getxattr || - !subdir->d_inode->i_op->lookup || - !subdir->d_inode->i_op->mkdir || - !subdir->d_inode->i_op->create || - (!subdir->d_inode->i_op->rename && - !subdir->d_inode->i_op->rename2) || - !subdir->d_inode->i_op->rmdir || - !subdir->d_inode->i_op->unlink) + if (!d_backing_inode(subdir)->i_op->setxattr || + !d_backing_inode(subdir)->i_op->getxattr || + !d_backing_inode(subdir)->i_op->lookup || + !d_backing_inode(subdir)->i_op->mkdir || + !d_backing_inode(subdir)->i_op->create || + (!d_backing_inode(subdir)->i_op->rename && + !d_backing_inode(subdir)->i_op->rename2) || + !d_backing_inode(subdir)->i_op->rmdir || + !d_backing_inode(subdir)->i_op->unlink) goto check_error; - _leave(" = [%lu]", subdir->d_inode->i_ino); + _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); return subdir; check_error: @@ -790,19 +790,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -827,7 +827,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -836,13 +836,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, goto lookup_error; //_debug("victim -> %p %s", - // victim, victim->d_inode ? "positive" : "negative"); + // victim, d_backing_inode(victim) ? 
"positive" : "negative"); /* if the object is no longer there then we probably retired the object * at the netfs's request whilst the cull was in progress */ - if (!victim->d_inode) { - mutex_unlock(&dir->d_inode->i_mutex); + if (d_is_negative(victim)) { + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -871,13 +871,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -913,7 +913,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return PTR_ERR(victim); _debug("victim -> %p %s", - victim, victim->d_inode ? "positive" : "negative"); + victim, d_backing_inode(victim) ? "positive" : "negative"); /* okay... the victim is not being used so we can cull it * - start by marking it as stale @@ -936,7 +936,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); error: dput(victim); if (ret == -ENOENT) { @@ -971,7 +971,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c6cd8d7a4eef..3cbb0e834694 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -74,12 +74,12 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, static int cachefiles_read_reissue(struct cachefiles_object *object, struct cachefiles_one_read *monitor) { - struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; struct page *backpage = monitor->back_page, *backpage2; int ret; _enter("{ino=%lx},{%lx,%lx}", - object->backer->d_inode->i_ino, + d_backing_inode(object->backer)->i_ino, backpage->index, backpage->flags); /* skip if the page was truncated away completely */ @@ -157,7 +157,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) object = container_of(op->op.object, struct cachefiles_object, fscache); - _enter("{ino=%lu}", object->backer->d_inode->i_ino); + _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino); max = 8; spin_lock_irq(&object->work_lock); @@ -247,7 +247,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); /* attempt to get hold of the backing page */ - bmapping = object->backer->d_inode->i_mapping; + bmapping = d_backing_inode(object->backer)->i_mapping; newpage = NULL; for (;;) { @@ -408,7 +408,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, if (!object->backer) goto enobufs; - inode = object->backer->d_inode; + inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); ASSERT(inode->i_mapping->a_ops->bmap); ASSERT(inode->i_mapping->a_ops->readpages); @@ -468,7 +468,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, struct 
list_head *list) { struct cachefiles_one_read *monitor = NULL; - struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; int ret = 0; @@ -705,7 +705,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, if (cachefiles_has_space(cache, 0, *nr_pages) < 0) space = 0; - inode = object->backer->d_inode; + inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); ASSERT(inode->i_mapping->a_ops->bmap); ASSERT(inode->i_mapping->a_ops->readpages); diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index 396c18ea2764..31bbc0528b11 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -55,14 +55,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, { int ret; - ret = security_inode_mkdir(root->d_inode, root, 0); + ret = security_inode_mkdir(d_backing_inode(root), root, 0); if (ret < 0) { pr_err("Security denies permission to make dirs: error %d", ret); return ret; } - ret = security_inode_create(root->d_inode, root, 0); + ret = security_inode_create(d_backing_inode(root), root, 0); if (ret < 0) pr_err("Security denies permission to create files: error %d", ret); @@ -95,7 +95,7 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, /* use the cache root dir's security context as the basis with * which create files */ - ret = set_create_files_as(new, root->d_inode); + ret = set_create_files_as(new, d_backing_inode(root)); if (ret < 0) { abort_creds(new); cachefiles_begin_secure(cache, _saved_cred); diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index a8a68745e11d..d31c1a72d8a5 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -33,7 +33,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) int ret; ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); if (!object->fscache.cookie) strcpy(type, "C3"); @@ -52,7 +52,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) if (ret != -EEXIST) { pr_err("Can't set xattr on %pd [%lu] (err %d)\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) goto bad_type_length; pr_err("Can't read xattr on %pd [%lu] (err %d)\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -84,14 +84,14 @@ error: bad_type_length: pr_err("Cache object %lu type xattr length incorrect\n", - dentry->d_inode->i_ino); + d_backing_inode(dentry)->i_ino); ret = -EIO; goto error; bad_type: xtype[2] = 0; pr_err("Cache object %pd [%lu] type %s not %s\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, xtype, type); ret = -EIO; goto error; @@ -165,7 +165,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) int ret; ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); ASSERT(object->fscache.cookie->def->check_aux); auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); @@ -204,7 +204,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, _enter("%p,#%d", object, auxdata->len); ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); if (!auxbuf) { @@ -225,7 +225,7 @@ int 
cachefiles_check_object_xattr(struct cachefiles_object *object, cachefiles_io_error_obj(object, "Can't read xattr on %lu (err %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -276,7 +276,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, cachefiles_io_error_obj(object, "Can't update xattr on %lu" " (error %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); goto error; } } @@ -291,7 +291,7 @@ error: bad_type_length: pr_err("Cache object %lu xattr length incorrect\n", - dentry->d_inode->i_ino); + d_backing_inode(dentry)->i_ino); ret = -EIO; goto error; @@ -316,7 +316,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, cachefiles_io_error(cache, "Can't remove xattr from %lu" " (error %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); } _leave(" = %d", ret); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 155ab9c0246b..e162bcd105ee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1146,6 +1146,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); + if (r < 0) + page_cache_release(page); + else + *pagep = page; } while (r == -EAGAIN); return r; @@ -1534,19 +1538,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); - err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, - "inline_version", &inline_version, - sizeof(inline_version), - CEPH_OSD_CMPXATTR_OP_GT, - CEPH_OSD_CMPXATTR_MODE_U64); - if (err) - goto out_put; - - err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, - "inline_version", &inline_version, - sizeof(inline_version), 0, 0); - if (err) - goto out_put; + { + __le64 xattr_buf = cpu_to_le64(inline_version); + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &xattr_buf, + sizeof(xattr_buf), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + } + + { + char xattr_buf[32]; + int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), + "%llu", inline_version); + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", + xattr_buf, xattr_len, 0, 0); + if (err) + goto out_put; + } ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8172775428a0..be5ea6af8366 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -896,6 +896,18 @@ int ceph_is_any_caps(struct inode *inode) return ret; } +static void drop_inode_snap_realm(struct ceph_inode_info *ci) +{ + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, + realm); +} + /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. 
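 *
 * Dropping the last cap normally detaches the inode from its snap
 * realm via drop_inode_snap_realm() above; this is skipped while
 * i_wr_ref is non-zero, since forced cap removal (e.g. after a
 * denied reconnect) can race with in-flight writes that still need
 * the realm, which is then dropped from ceph_put_cap_refs() instead.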
* @@ -946,15 +958,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { - struct ceph_snap_realm *realm = ci->i_snap_realm; - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm_counter++; - ci->i_snap_realm = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); - } + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); } @@ -1394,6 +1404,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) int was = ci->i_dirty_caps; int dirty = 0; + if (!ci->i_auth_cap) { + pr_warn("__mark_dirty_caps %p %llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_ino(inode), ceph_cap_string(mask)); + return 0; + } + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); @@ -1404,7 +1421,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ci->i_snap_realm->cached_context); dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); - WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); @@ -1545,7 +1561,19 @@ retry_locked: if (!mdsc->stopping && inode->i_nlink > 0) { if (want) { retain |= CEPH_CAP_ANY; /* be greedy */ + } else if (S_ISDIR(inode->i_mode) && + (issued & CEPH_CAP_FILE_SHARED) && + __ceph_dir_is_complete(ci)) { + /* + * If a directory is complete, we want to keep + * the exclusive cap. So that MDS does not end up + * revoking the shared cap on every create/unlink + * operation. + */ + want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= want; } else { + retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, @@ -2309,6 +2337,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) wake = 1; } } + /* see comment in __ceph_remove_cap() */ + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); @@ -3391,7 +3422,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int ceph_encode_dentry_release(void **p, struct dentry *dentry, int mds, int drop, int unless) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); int force = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1b2355109b9f..31f831471ed2 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -84,7 +84,7 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%pd (%s)", - ceph_ino(req->r_dentry->d_parent->d_inode), + ceph_ino(d_inode(req->r_dentry->d_parent)), req->r_dentry, path ? 
path : ""); spin_unlock(&req->r_dentry->d_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83e9976f7189..4248307fea90 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -49,9 +49,9 @@ int ceph_init_dentry(struct dentry *dentry) goto out_unlock; } - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) d_set_d_op(dentry, &ceph_snapdir_dentry_ops); else d_set_d_op(dentry, &ceph_snap_dentry_ops); @@ -77,7 +77,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { - inode = dentry->d_parent->d_inode; + inode = d_inode(dentry->d_parent); ihold(inode); } spin_unlock(&dentry->d_lock); @@ -122,7 +122,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, { struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_path.dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct list_head *p; struct dentry *dentry, *last; struct ceph_dentry_info *di; @@ -161,15 +161,15 @@ more: } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (di->lease_shared_gen == shared_gen && - !d_unhashed(dentry) && dentry->d_inode && - ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && - ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && + !d_unhashed(dentry) && d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && + ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) break; dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, dentry, di->offset, ctx->pos, d_unhashed(dentry) ? " unhashed" : "", - !dentry->d_inode ? " null" : ""); + !d_inode(dentry) ? " null" : ""); spin_unlock(&dentry->d_lock); p = p->prev; dentry = list_entry(p, struct dentry, d_child); @@ -189,11 +189,11 @@ more: } dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, - dentry, dentry, dentry->d_inode); + dentry, dentry, d_inode(dentry)); if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), - dentry->d_inode->i_mode >> 12)) { + ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), + d_inode(dentry)->i_mode >> 12)) { if (last) { /* remember our position */ fi->dentry = last; @@ -281,6 +281,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* can we use the dcache? 
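 * Only when the DCACHE mount option is enabled, async readdir is
 * not disabled, this is not the snapdir, and the directory is known
 * complete and ordered with the shared cap still issued.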
*/ spin_lock(&ci->i_ceph_lock); if ((ctx->pos == 2 || fi->dentry) && + ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && @@ -336,16 +337,23 @@ more: ceph_mdsc_put_request(req); return err; } - req->r_inode = inode; - ihold(inode); - req->r_dentry = dget(file->f_path.dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); req->r_direct_is_hash = true; - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (fi->last_name) { + req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + return -ENOMEM; + } + } req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_path.dentry); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -535,7 +543,7 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ /* .snap dir? */ if (err == -ENOENT && @@ -571,8 +579,8 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, err = 0; if (!req->r_reply_info.head->is_dentry) { dout("ENOENT and no trace, dentry %p inode %p\n", - dentry, dentry->d_inode); - if (dentry->d_inode) { + dentry, d_inode(dentry)); + if (d_really_is_positive(dentry)) { d_drop(dentry); err = -ENOENT; } else { @@ -619,7 +627,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(err); /* can we conclude ENOENT locally? 
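 * Yes, when the parent's contents are complete in the dcache (now
 * gated on the DCACHE mount option as well), the shared cap is
 * issued, and the name is not the snapdir or a reserved root entry.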
*/ - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); @@ -629,6 +637,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && + ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); @@ -725,7 +734,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); else d_drop(dentry); ceph_release_acls_info(&acls); @@ -755,10 +764,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, err = PTR_ERR(req); goto out; } - req->r_dentry = dget(dentry); - req->r_num_caps = 2; req->r_path2 = kstrdup(dest, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + ceph_mdsc_put_request(req); + goto out; + } req->r_locked_dir = dir; + req->r_dentry = dget(dentry); + req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); @@ -821,7 +835,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); else d_drop(dentry); ceph_release_acls_info(&acls); @@ -858,8 +872,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (err) { d_drop(dentry); } else if (!req->r_reply_info.head->is_dentry) { - ihold(old_dentry->d_inode); - d_instantiate(dentry, old_dentry->d_inode); + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); } ceph_mdsc_put_request(req); return err; @@ -892,7 +906,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; int err = -EROFS; int op; @@ -933,16 +947,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + int op = CEPH_MDS_OP_RENAME; int err; if (ceph_snap(old_dir) != ceph_snap(new_dir)) return -EXDEV; - if (ceph_snap(old_dir) != CEPH_NOSNAP || - ceph_snap(new_dir) != CEPH_NOSNAP) - return -EROFS; + if (ceph_snap(old_dir) != CEPH_NOSNAP) { + if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); ihold(old_dir); @@ -957,8 +975,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; - if (new_dentry->d_inode) - req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) + req->r_inode_drop = 
drop_caps_for_unlink(d_inode(new_dentry)); err = ceph_mdsc_do_request(mdsc, old_dir, req); if (!err && !req->r_reply_info.head->is_dentry) { /* @@ -1024,7 +1042,7 @@ static int dentry_lease_is_valid(struct dentry *dentry) if (di->lease_renew_after && time_after(jiffies, di->lease_renew_after)) { /* we should renew */ - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); session = ceph_get_mds_session(s); seq = di->lease_seq; di->lease_renew_after = 0; @@ -1074,22 +1092,22 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, - dentry, dentry->d_inode, ceph_dentry(dentry)->offset); + dentry, d_inode(dentry), ceph_dentry(dentry)->offset); dir = ceph_get_dentry_parent_inode(dentry); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, - dentry, dentry->d_inode); + dentry, d_inode(dentry)); valid = 1; - } else if (dentry->d_inode && - ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + } else if (d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - if (dentry->d_inode) - valid = ceph_is_any_caps(dentry->d_inode); + if (d_really_is_positive(dentry)) + valid = ceph_is_any_caps(d_inode(dentry)); else valid = 1; } @@ -1151,7 +1169,7 @@ static void ceph_d_prune(struct dentry *dentry) * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release */ - ceph_dir_clear_complete(dentry->d_parent->d_inode); + ceph_dir_clear_complete(d_inode(dentry->d_parent)); } /* @@ -1240,11 +1258,12 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { - ret = wait_for_completion_timeout( - &req->r_safe_completion, req->r_timeout); - if (ret > 0) + unsigned long time_left = wait_for_completion_timeout( + &req->r_safe_completion, + req->r_timeout); + if (time_left > 0) ret = 0; - else if (ret == 0) + else ret = -EIO; /* timed out */ } else { wait_for_completion(&req->r_safe_completion); @@ -1372,6 +1391,7 @@ const struct inode_operations ceph_snapdir_iops = { .getattr = ceph_getattr, .mkdir = ceph_mkdir, .rmdir = ceph_unlink, + .rename = ceph_rename, }; const struct dentry_operations ceph_dentry_ops = { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 8d7d782f4382..fe02ae7f056a 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -136,8 +136,8 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_CAST(req); if (child) { - req->r_inode = child->d_inode; - ihold(child->d_inode); + req->r_inode = d_inode(child); + ihold(d_inode(child)); } else { req->r_ino1 = (struct ceph_vino) { .ino = ino, @@ -164,7 +164,7 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_PTR(err); } dout("__get_parent ino %llx parent %p ino %llx.%llx\n", - child ? ceph_ino(child->d_inode) : ino, + child ? 
ceph_ino(d_inode(child)) : ino, dentry, ceph_vinop(inode)); return dentry; } @@ -172,11 +172,11 @@ static struct dentry *__get_parent(struct super_block *sb, static struct dentry *ceph_get_parent(struct dentry *child) { /* don't re-export snaps */ - if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(child)) != CEPH_NOSNAP) return ERR_PTR(-EINVAL); dout("get_parent %p ino %llx.%llx\n", - child, ceph_vinop(child->d_inode)); + child, ceph_vinop(d_inode(child))); return __get_parent(child->d_sb, child, 0); } @@ -209,32 +209,32 @@ static int ceph_get_name(struct dentry *parent, char *name, struct ceph_mds_request *req; int err; - mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + mdsc = ceph_inode_to_client(d_inode(child))->mdsc; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); - req->r_inode = child->d_inode; - ihold(child->d_inode); - req->r_ino2 = ceph_vino(parent->d_inode); - req->r_locked_dir = parent->d_inode; + req->r_inode = d_inode(child); + ihold(d_inode(child)); + req->r_ino2 = ceph_vino(d_inode(parent)); + req->r_locked_dir = d_inode(parent); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(child->d_inode), name); + child, ceph_vinop(d_inode(child)), name); } else { dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(child->d_inode), err); + child, ceph_vinop(d_inode(child)), err); } ceph_mdsc_put_request(req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b9b8eb225f66..3b6b522b4b31 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -291,14 +291,14 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } if (err) goto out_req; - if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) { + if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); *opened |= FILE_CREATED; } err = finish_open(file, dentry, ceph_open, opened); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 119c43c80638..e876e1944519 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -940,7 +940,7 @@ static void update_dentry_lease(struct dentry *dentry, dentry, duration, ttl); /* make lease_rdcache_gen match directory */ - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; if (duration == 0) @@ -980,7 +980,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, { struct dentry *realdn; - BUG_ON(dn->d_inode); + BUG_ON(d_inode(dn)); /* dn must be unhashed */ if (!d_unhashed(dn)) @@ -998,13 +998,13 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, "inode %p ino %llx.%llx\n", dn, d_count(dn), realdn, d_count(realdn), - realdn->d_inode, ceph_vinop(realdn->d_inode)); + d_inode(realdn), ceph_vinop(d_inode(realdn))); dput(dn); dn = 
realdn; } else { BUG_ON(!ceph_dentry(dn)); dout("dn %p attached to %p ino %llx.%llx\n", - dn, dn->d_inode, ceph_vinop(dn->d_inode)); + dn, d_inode(dn), ceph_vinop(d_inode(dn))); } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); @@ -1125,11 +1125,11 @@ retry_lookup: dput(parent); goto done; } - } else if (dn->d_inode && - (ceph_ino(dn->d_inode) != vino.ino || - ceph_snap(dn->d_inode) != vino.snap)) { + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { dout(" dn %p points to wrong inode %p\n", - dn, dn->d_inode); + dn, d_inode(dn)); d_delete(dn); dput(dn); goto retry_lookup; @@ -1183,7 +1183,7 @@ retry_lookup: BUG_ON(!dn); BUG_ON(!dir); - BUG_ON(dn->d_parent->d_inode != dir); + BUG_ON(d_inode(dn->d_parent) != dir); BUG_ON(ceph_ino(dir) != le64_to_cpu(rinfo->diri.in->ino)); BUG_ON(ceph_snap(dir) != @@ -1235,7 +1235,7 @@ retry_lookup: /* null dentry? */ if (!rinfo->head->is_target) { dout("fill_trace null dentry\n"); - if (dn->d_inode) { + if (d_really_is_positive(dn)) { ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); @@ -1252,7 +1252,7 @@ retry_lookup: } /* attach proper inode */ - if (!dn->d_inode) { + if (d_really_is_negative(dn)) { ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, &have_lease); @@ -1261,9 +1261,9 @@ retry_lookup: goto done; } req->r_dentry = dn; /* may have spliced */ - } else if (dn->d_inode && dn->d_inode != in) { + } else if (d_really_is_positive(dn) && d_inode(dn) != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, dn->d_inode, ceph_vinop(dn->d_inode), + dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); have_lease = false; } @@ -1363,7 +1363,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, return readdir_prepopulate_inodes_only(req, session); if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - snapdir = ceph_get_snapdir(parent->d_inode); + snapdir = ceph_get_snapdir(d_inode(parent)); parent = d_find_alias(snapdir); dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", rinfo->dir_nr, parent); @@ -1371,7 +1371,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, dout("readdir_prepopulate %d items under dn %p\n", rinfo->dir_nr, parent); if (rinfo->dir_dir) - ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); + ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } /* FIXME: release caps/leases if error occurs */ @@ -1405,11 +1405,11 @@ retry_lookup: err = ret; goto out; } - } else if (dn->d_inode && - (ceph_ino(dn->d_inode) != vino.ino || - ceph_snap(dn->d_inode) != vino.snap)) { + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { dout(" dn %p points to wrong inode %p\n", - dn, dn->d_inode); + dn, d_inode(dn)); d_delete(dn); dput(dn); goto retry_lookup; @@ -1423,8 +1423,8 @@ retry_lookup: } /* inode */ - if (dn->d_inode) { - in = dn->d_inode; + if (d_really_is_positive(dn)) { + in = d_inode(dn); } else { in = ceph_get_inode(parent->d_sb, vino); if (IS_ERR(in)) { @@ -1440,13 +1440,13 @@ retry_lookup: req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); - if (!dn->d_inode) + if (d_really_is_negative(dn)) iput(in); d_drop(dn); goto next_item; } - if (!dn->d_inode) { + if (d_really_is_negative(dn)) { struct dentry *realdn = splice_dentry(dn, in, NULL); if (IS_ERR(realdn)) { err = PTR_ERR(realdn); @@ -1693,7 +1693,7 @@ retry: */ static void *ceph_sym_follow_link(struct dentry 
*dentry, struct nameidata *nd) { - struct ceph_inode_info *ci = ceph_inode(dentry->d_inode); + struct ceph_inode_info *ci = ceph_inode(d_inode(dentry)); nd_set_link(nd, ci->i_symlink); return NULL; } @@ -1714,7 +1714,7 @@ static const struct inode_operations ceph_symlink_iops = { */ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; @@ -1990,7 +1990,7 @@ int ceph_permission(struct inode *inode, int mask) int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); int err; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 71c073f38e54..84f37f34f9aa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -679,7 +679,7 @@ static struct dentry *get_nonsnap_parent(struct dentry *dentry) * except to resplice to another snapdir, and either the old or new * result is a valid result. */ - while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) dentry = dentry->d_parent; return dentry; } @@ -716,20 +716,20 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent = req->r_dentry->d_parent; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ - inode = req->r_dentry->d_inode; + inode = d_inode(req->r_dentry); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ struct dentry *dn = get_nonsnap_parent(parent); - inode = dn->d_inode; + inode = d_inode(dn); dout("__choose_mds using nonsnap parent %p\n", inode); } else { /* dentry target */ - inode = req->r_dentry->d_inode; + inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ inode = dir; @@ -1021,6 +1021,33 @@ static void cleanup_cap_releases(struct ceph_mds_session *session) spin_unlock(&session->s_cap_lock); } +static void cleanup_session_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("cleanup_session_requests mds%d\n", session->s_mds); + mutex_lock(&mdsc->mutex); + while (!list_empty(&session->s_unsafe)) { + req = list_first_entry(&session->s_unsafe, + struct ceph_mds_request, r_unsafe_item); + list_del_init(&req->r_unsafe_item); + pr_info(" dropping unsafe request %llu\n", req->r_tid); + __unregister_request(mdsc, req); + } + /* zero r_attempts, so kick_requests() will re-send requests */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_session && + req->r_session->s_mds == session->s_mds) + req->r_attempts = 0; + } + mutex_unlock(&mdsc->mutex); +} + /* * Helper to safely iterate over all caps associated with a session, with * special care taken to handle a racing __ceph_remove_cap(). 
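The dominant change running through the ceph hunks above (and the cifs, coda, and configfs hunks below) is mechanical: direct dentry->d_inode dereferences become d_inode(dentry), and open-coded NULL tests become d_really_is_positive()/d_really_is_negative(). A minimal sketch of what the accessors amount to, mirroring (as far as I can tell) their definitions in include/linux/dcache.h; the _sketch suffixes are mine so this could sit alongside the real helpers:

#include <linux/dcache.h>
#include <linux/types.h>

static inline struct inode *d_inode_sketch(const struct dentry *dentry)
{
	return dentry->d_inode;
}

static inline bool d_really_is_positive_sketch(const struct dentry *dentry)
{
	return dentry->d_inode != NULL;	/* dentry names an existing inode */
}

static inline bool d_really_is_negative_sketch(const struct dentry *dentry)
{
	return dentry->d_inode == NULL;	/* cached "no such file" entry */
}

A second recurring change in the ceph hunks (ceph_readdir, ceph_symlink, and the setxattr/removexattr and open_root_dentry hunks further down) is that every kstrdup() feeding a request path is now checked, with the whole request torn down on failure rather than sent with a silently-NULL path. A compact sketch of the pattern; mds_req_sketch is a hypothetical stand-in for ceph_mds_request:

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Hypothetical stand-in for ceph_mds_request; only the path matters here. */
struct mds_req_sketch {
	char *r_path2;
};

/*
 * Duplicate the (optional) string before taking any inode/dentry
 * references, so the error path is a single teardown in the caller.
 */
static int req_set_path2_sketch(struct mds_req_sketch *req, const char *name)
{
	if (!name)
		return 0;	/* path2 is optional, e.g. for readdir */

	req->r_path2 = kstrdup(name, GFP_NOFS);
	if (!req->r_path2)
		return -ENOMEM;	/* caller puts the request and bails */

	return 0;
}

Finally, the new cleanup_session_requests() above combines two stock idioms: draining a list by repeatedly detaching its first entry (safe even though the loop body unlinks it), and walking an rbtree with the cursor advanced before the body touches the current node. A self-contained sketch; the struct and field names are illustrative, not ceph's:

#include <linux/list.h>
#include <linux/rbtree.h>

struct req_rec_sketch {
	struct list_head unsafe_item;
	struct rb_node node;
	int attempts;
};

static void cleanup_sketch(struct list_head *unsafe, struct rb_root *tree)
{
	struct req_rec_sketch *req;
	struct rb_node *p;

	while (!list_empty(unsafe)) {
		req = list_first_entry(unsafe, struct req_rec_sketch,
				       unsafe_item);
		list_del_init(&req->unsafe_item);
	}

	for (p = rb_first(tree); p; ) {
		req = rb_entry(p, struct req_rec_sketch, node);
		p = rb_next(p);			/* cache before mutating */
		req->attempts = 0;		/* let kick_requests() resend */
	}
}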
@@ -1098,7 +1125,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap, false); - if (!__ceph_is_any_real_caps(ci)) { + if (!ci->i_auth_cap) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -1120,13 +1147,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, mdsc->num_cap_flushing--; drop = 1; } - if (drop && ci->i_wrbuffer_ref) { - pr_info(" dropping dirty data for %p %lld\n", - inode, ceph_ino(inode)); - ci->i_wrbuffer_ref = 0; - ci->i_wrbuffer_ref_head = 0; - drop++; - } spin_unlock(&mdsc->cap_dirty_lock); } spin_unlock(&ci->i_ceph_lock); @@ -1712,7 +1732,7 @@ retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp);) { - struct inode *inode = temp->d_inode; + struct inode *inode = d_inode(temp); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) len++; /* slash only */ else if (stop_on_nosnap && inode && @@ -1736,7 +1756,7 @@ retry: struct inode *inode; spin_lock(&temp->d_lock); - inode = temp->d_inode; + inode = d_inode(temp); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", pos, temp); @@ -1770,7 +1790,7 @@ retry: goto retry; } - *base = ceph_ino(temp->d_inode); + *base = ceph_ino(d_inode(temp)); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), *base, len, path); @@ -1783,8 +1803,8 @@ static int build_dentry_path(struct dentry *dentry, { char *path; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { - *pino = ceph_ino(dentry->d_parent->d_inode); + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { + *pino = ceph_ino(d_inode(dentry->d_parent)); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; @@ -1853,7 +1873,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, */ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_msg *msg; struct ceph_mds_request_head *head; @@ -1925,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, releases = 0; if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_inode ? req->r_inode : req->r_dentry->d_inode, + req->r_inode ? 
req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, 0); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, @@ -1935,8 +1955,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_old_dentry->d_inode, + d_inode(req->r_old_dentry), mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + + if (drop_cap_releases) { + releases = 0; + p = msg->front.iov_base + req->r_request_release_offset; + } + head->num_releases = cpu_to_le16(releases); /* time stamp */ @@ -1989,7 +2015,7 @@ static void complete_request(struct ceph_mds_client *mdsc, */ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_mds_request_head *rhead; struct ceph_msg *msg; @@ -2048,7 +2074,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds); + msg = create_request_message(mdsc, req, mds, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); complete_request(mdsc, req); @@ -2132,7 +2158,7 @@ static int __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __prepare_send_request(mdsc, req, mds); + err = __prepare_send_request(mdsc, req, mds, false); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2590,6 +2616,7 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect denied\n", session->s_mds); + cleanup_session_requests(mdsc, session); remove_session_caps(session); wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); @@ -2658,7 +2685,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2679,7 +2706,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, continue; /* only old requests */ if (req->r_session && req->r_session->s_mds == session->s_mds) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, + session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2864,7 +2892,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_cap_lock); /* trim unused caps to reduce MDS's cache rejoin time */ - shrink_dcache_parent(mdsc->fsc->sb->s_root); + if (mdsc->fsc->sb->s_root) + shrink_dcache_parent(mdsc->fsc->sb->s_root); ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, @@ -3133,7 +3162,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = - le32_to_cpu(h->duration_ms) * HZ / 1000; + msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; dentry->d_time = di->lease_renew_from + duration; diff --git a/fs/ceph/strings.c 
b/fs/ceph/strings.c index 51cc23e48111..89e6bc321df3 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -75,6 +75,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a63997b8bcff..4e9905374078 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -44,7 +44,7 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); struct ceph_monmap *monmap = fsc->client->monc.monmap; struct ceph_statfs st; u64 fsid; @@ -345,6 +345,11 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->rsize = CEPH_RSIZE_DEFAULT; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) { + err = -ENOMEM; + goto out; + } + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; @@ -406,31 +411,20 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) { struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; - struct ceph_options *opt = fsc->client->options; - - if (opt->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &opt->fsid); - if (opt->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (opt->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - if (opt->flags & CEPH_OPT_NOMSGAUTH) - seq_puts(m, ",nocephx_require_signatures"); - if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) - seq_puts(m, ",notcp_nodelay"); - - if (opt->name) - seq_printf(m, ",name=%s", opt->name); - if (opt->key) - seq_puts(m, ",secret=<hidden>"); - - if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); - if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - opt->osd_keepalive_timeout); + size_t pos; + int ret; + + /* a comma between MNT/MS and client options */ + seq_putc(m, ','); + pos = m->count; + + ret = ceph_print_client_options(m, fsc->client); + if (ret) + return ret; + + /* retract our comma if no client options */ + if (m->count == pos) + m->count--; if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) seq_puts(m, ",dirstat"); @@ -438,14 +432,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); - if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) - seq_puts(m, ",dcache"); - else + if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) seq_puts(m, ",fsc"); - else - seq_puts(m, ",nofsc"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & MS_POSIXACL) @@ -477,6 +467,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); 
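The strings.c hunk above names the new CEPH_MDS_OP_RENAMESNAP op that ceph_rename() started selecting earlier in this diff (with .rename also wired into ceph_snapdir_iops): renames are still refused across snapshots, but a rename within one .snap directory is now mapped to the snapshot-rename op instead of failing with -EROFS. The branch structure, isolated as a sketch; all constants and the helper name are stand-ins for CEPH_NOSNAP/CEPH_SNAPDIR, CEPH_MDS_OP_RENAME*, and ceph_snap():

#include <linux/errno.h>
#include <linux/types.h>

#define NOSNAP_SK	 0
#define SNAPDIR_SK	 1
#define OP_RENAME_SK	 1
#define OP_RENAMESNAP_SK 2

static int pick_rename_op_sketch(int old_snap, int new_snap, bool same_dir)
{
	if (old_snap != new_snap)
		return -EXDEV;			/* never across snapshots */
	if (old_snap == NOSNAP_SK)
		return OP_RENAME_SK;		/* ordinary live-fs rename */
	if (same_dir && old_snap == SNAPDIR_SK)
		return OP_RENAMESNAP_SK;	/* rename a snapshot in .snap */
	return -EROFS;				/* snapped trees stay read-only */
}

The ceph_show_options() rewrite above also uses a seq_file idiom worth isolating: emit the separator eagerly, remember the buffer length, and retract the comma if the helper printed nothing after it. A sketch, with a stub standing in for ceph_print_client_options():

#include <linux/seq_file.h>

/* Stand-in for ceph_print_client_options(); may print nothing at all. */
static void print_client_opts_sketch(struct seq_file *m)
{
}

static int show_options_sketch(struct seq_file *m)
{
	size_t pos;

	seq_putc(m, ',');
	pos = m->count;

	print_client_opts_sketch(m);

	if (m->count == pos)
		m->count--;	/* no client options followed the comma */

	return 0;
}

Two smaller correctness fixes from the earlier fs/ceph hunks fit one sketch. ceph_dir_fsync() stopped testing a signed ret for three outcomes: wait_for_completion_timeout() returns an unsigned long, 0 on timeout and otherwise the jiffies remaining, never a negative errno. And handle_lease() swapped the open-coded "ms * HZ / 1000", which can overflow in the multiplication and always truncates, for msecs_to_jiffies(), which clamps large inputs and rounds up:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/types.h>

static int wait_safe_sketch(struct completion *done, u32 duration_ms)
{
	unsigned long timeout = msecs_to_jiffies(duration_ms);
	unsigned long time_left = wait_for_completion_timeout(done, timeout);

	return time_left ? 0 : -EIO;	/* zero jiffies left == timed out */
}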
if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; } @@ -730,6 +721,11 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (IS_ERR(req)) return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); + if (!req->r_path1) { + root = ERR_PTR(-ENOMEM); + goto out; + } + req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; @@ -976,7 +972,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, - res->d_inode, ceph_vinop(res->d_inode)); + d_inode(res), ceph_vinop(d_inode(res))); return res; out_splat: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 04c8124ed30e..fa20e1318939 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ -#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ + CEPH_MOUNT_OPT_DCACHE) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; @@ -881,7 +882,6 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; -extern const struct address_space_operations ceph_aops; extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5a492caf34cb..cd7ffad4041d 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -776,12 +776,12 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_getxattr(dentry, name, value, size); - return __ceph_getxattr(dentry->d_inode, name, value, size); + return __ceph_getxattr(d_inode(dentry), name, value, size); } ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; @@ -847,7 +847,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; @@ -877,16 +877,23 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = inode; - ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - req->r_num_caps = 1; + req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + err = -ENOMEM; + goto out; + } req->r_pagelist = pagelist; pagelist = NULL; + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); @@ -901,7 +908,7 @@ out: int __ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int 
flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); int issued; @@ -995,7 +1002,7 @@ out: int ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return -EROFS; if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) @@ -1011,7 +1018,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; int err; @@ -1019,12 +1026,14 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) + return -ENOMEM; + req->r_inode = inode; ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; - req->r_path2 = kstrdup(name, GFP_NOFS); - + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; @@ -1032,7 +1041,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) int __ceph_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); int issued; @@ -1098,7 +1107,7 @@ out: int ceph_removexattr(struct dentry *dentry, const char *name) { - if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return -EROFS; if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index b8602f199815..430e0348c99e 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -301,7 +301,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (full_path == NULL) goto cdda_exit; - cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); + cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { mnt = ERR_CAST(tlink); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index eaab4b2a0595..f5089bde3635 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -607,7 +607,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) p = s = full_path; do { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct dentry *child; if (!dir) { diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index fa13d5e79f64..84650a51c7c4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1898,7 +1898,7 @@ static void cifs_writev_requeue(struct cifs_writedata *wdata) { int i, rc = 0; - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); struct TCP_Server_Info *server; unsigned int rest_len; @@ -1981,7 +1981,7 @@ cifs_writev_complete(struct work_struct *work) { struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); int i = 0; if (wdata->result == 0) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index b72bc29cba23..338d56936f6a 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ 
-745,13 +745,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, goto lookup_out; } - if (direntry->d_inode != NULL) { + if (d_really_is_positive(direntry)) { cifs_dbg(FYI, "non-NULL inode in lookup\n"); } else { cifs_dbg(FYI, "NULL inode in lookup\n"); } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", - full_path, direntry->d_inode); + full_path, d_inode(direntry)); if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newInode, full_path, @@ -792,7 +792,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (direntry->d_inode) { + if (d_really_is_positive(direntry)) { if (cifs_revalidate_dentry(direntry)) return 0; else { @@ -803,7 +803,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) * attributes will have been updated by * cifs_revalidate_dentry(). */ - if (IS_AUTOMOUNT(direntry->d_inode) && + if (IS_AUTOMOUNT(d_inode(direntry)) && !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { spin_lock(&direntry->d_lock); direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ca2bc5406306..cafbf10521d5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -273,7 +273,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifsFileInfo *cfile; struct cifs_fid_locks *fdlocks; @@ -357,7 +357,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { - struct inode *inode = cifs_file->dentry->d_inode; + struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); @@ -386,7 +386,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cifs_dbg(FYI, "closing last open instance for inode %p\n", - cifs_file->dentry->d_inode); + d_inode(cifs_file->dentry)); /* * In strict cache mode we need invalidate mapping on the last * close because it may cause a error when we open this file @@ -572,7 +572,7 @@ static int cifs_relock_file(struct cifsFileInfo *cfile) { struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; @@ -620,7 +620,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) return rc; } - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifs_sb = CIFS_SB(inode->i_sb); tcon = tlink_tcon(cfile->tlink); server = tcon->ses->server; @@ -874,7 +874,7 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, { bool rc = false; struct cifs_fid_locks *cur; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); list_for_each_entry(cur, &cinode->llist, llist) { rc = cifs_find_fid_lock_conflict(cur, offset, length, type, @@ -899,7 +899,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, { int rc = 0; struct cifsLockInfo *conf_lock; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct 
TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; bool exist; @@ -927,7 +927,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, static void cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); down_write(&cinode->lock_sem); list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); @@ -944,7 +944,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, bool wait) { struct cifsLockInfo *conf_lock; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); bool exist; int rc = 0; @@ -1125,7 +1125,7 @@ struct lock_to_push { static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; struct file_lock_context *flctx = inode->i_flctx; @@ -1214,7 +1214,7 @@ static int cifs_push_locks(struct cifsFileInfo *cfile) { struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; @@ -1382,7 +1382,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int max_num, num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = 1 + flock->fl_end - flock->fl_start; struct list_head tmp_llist; @@ -1488,7 +1488,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); if (posix_lck) { int posix_lock_type; @@ -1643,7 +1643,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, struct TCP_Server_Info *server; unsigned int xid; struct dentry *dentry = open_file->dentry; - struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry)); struct cifs_io_parms io_parms; cifs_sb = CIFS_SB(dentry->d_sb); @@ -1676,7 +1676,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, break; } - len = min(server->ops->wp_retry_size(dentry->d_inode), + len = min(server->ops->wp_retry_size(d_inode(dentry)), (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; @@ -1696,9 +1696,9 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, return rc; } } else { - spin_lock(&dentry->d_inode->i_lock); + spin_lock(&d_inode(dentry)->i_lock); cifs_update_eof(cifsi, *offset, bytes_written); - spin_unlock(&dentry->d_inode->i_lock); + spin_unlock(&d_inode(dentry)->i_lock); *offset += bytes_written; } } @@ -1706,12 +1706,12 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, cifs_stats_bytes_written(tcon, total_written); if (total_written > 
0) { - spin_lock(&dentry->d_inode->i_lock); - if (*offset > dentry->d_inode->i_size) - i_size_write(dentry->d_inode, *offset); - spin_unlock(&dentry->d_inode->i_lock); + spin_lock(&d_inode(dentry)->i_lock); + if (*offset > d_inode(dentry)->i_size) + i_size_write(d_inode(dentry), *offset); + spin_unlock(&d_inode(dentry)->i_lock); } - mark_inode_dirty_sync(dentry->d_inode); + mark_inode_dirty_sync(d_inode(dentry)); free_xid(xid); return total_written; } @@ -2406,7 +2406,7 @@ cifs_uncached_writev_complete(struct work_struct *work) { struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); struct cifsInodeInfo *cifsi = CIFS_I(inode); spin_lock(&inode->i_lock); @@ -3794,7 +3794,7 @@ void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3e126d7bb2ea..55b58112d122 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1067,7 +1067,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, int rc; struct cifs_fid fid; struct cifs_open_parms oparms; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; @@ -1196,7 +1196,7 @@ cifs_drop_nlink(struct inode *inode) } /* - * If dentry->d_inode is null (usually meaning the cached dentry + * If d_inode(dentry) is null (usually meaning the cached dentry * is a negative dentry) then we would attempt a standard SMB delete, but * if that fails we can not attempt the fall back mechanisms on EACCESS * but will return the EACCESS to the caller. 
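Beyond the d_inode() conversion, the cifs_write() hunk above shows the standard way to extend a cached file size after a write past EOF: compare and update under i_lock so racing writers cannot move i_size backwards, then mark the inode dirty outside the lock. A sketch of the shape, not the cifs function itself (it uses i_size_read() where cifs reads the field directly under the same lock):

#include <linux/fs.h>

static void extend_isize_sketch(struct dentry *dentry, loff_t new_end)
{
	struct inode *inode = d_inode(dentry);

	spin_lock(&inode->i_lock);
	if (new_end > i_size_read(inode))
		i_size_write(inode, new_end);	/* only ever grows here */
	spin_unlock(&inode->i_lock);

	mark_inode_dirty_sync(inode);
}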
Note that the VFS does not call @@ -1207,7 +1207,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) int rc = 0; unsigned int xid; char *full_path = NULL; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_inode; struct super_block *sb = dir->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1551,13 +1551,13 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_put_tlink(tlink); if (!rc) { - spin_lock(&direntry->d_inode->i_lock); - i_size_write(direntry->d_inode, 0); - clear_nlink(direntry->d_inode); - spin_unlock(&direntry->d_inode->i_lock); + spin_lock(&d_inode(direntry)->i_lock); + i_size_write(d_inode(direntry), 0); + clear_nlink(d_inode(direntry)); + spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(direntry->d_inode); + cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -1568,7 +1568,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) */ cifsInode->time = 0; - direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = + d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); rmdir_exit: @@ -1727,7 +1727,7 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, unlink_target: /* Try unlinking the target dentry if it's not negative */ - if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { + if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { if (d_is_dir(target_dentry)) tmprc = cifs_rmdir(target_dir, target_dentry); else @@ -1867,7 +1867,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) { unsigned int xid; int rc = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; char *full_path = NULL; @@ -1919,7 +1919,7 @@ int cifs_revalidate_file(struct file *filp) int cifs_revalidate_dentry(struct dentry *dentry) { int rc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); rc = cifs_revalidate_dentry_attr(dentry); if (rc) @@ -1933,7 +1933,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; /* @@ -2110,7 +2110,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) int rc; unsigned int xid; char *full_path = NULL; - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; @@ -2251,7 +2251,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) unsigned int xid; kuid_t uid = INVALID_UID; kgid_t gid = INVALID_GID; - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); char *full_path = NULL; @@ -2409,7 +2409,7 @@ cifs_setattr_exit: int cifs_setattr(struct dentry *direntry, struct iattr *attrs) { - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 2ec6037f61c7..252e672d5604 100644 --- a/fs/cifs/link.c +++ 
b/fs/cifs/link.c @@ -586,12 +586,12 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, * if source file is cached (oplocked) revalidate will not go to server * until the file is closed or oplock broken so update nlinks locally */ - if (old_file->d_inode) { - cifsInode = CIFS_I(old_file->d_inode); + if (d_really_is_positive(old_file)) { + cifsInode = CIFS_I(d_inode(old_file)); if (rc == 0) { - spin_lock(&old_file->d_inode->i_lock); - inc_nlink(old_file->d_inode); - spin_unlock(&old_file->d_inode->i_lock); + spin_lock(&d_inode(old_file)->i_lock); + inc_nlink(d_inode(old_file)); + spin_unlock(&d_inode(old_file)->i_lock); /* * parent dir timestamps will update from srv within a @@ -629,7 +629,7 @@ cifs_hl_exit: void * cifs_follow_link(struct dentry *direntry, struct nameidata *nd) { - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); int rc = -ENOMEM; unsigned int xid; char *full_path = NULL; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 337946355b29..8442b8b8e0be 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -473,7 +473,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_dbg(FYI, "file id match, oplock break\n"); - pCifsInode = CIFS_I(netfile->dentry->d_inode); + pCifsInode = CIFS_I(d_inode(netfile->dentry)); set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &pCifsInode->flags); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index c295338e0a98..b4a47237486b 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -78,7 +78,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, { struct dentry *dentry, *alias; struct inode *inode; - struct super_block *sb = parent->d_inode->i_sb; + struct super_block *sb = d_inode(parent)->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); @@ -88,7 +88,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, return; if (dentry) { - inode = dentry->d_inode; + inode = d_inode(dentry); if (inode) { /* * If we're generating inode numbers, then we don't diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d2979036a4c7..7bfdd6066276 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -722,7 +722,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, static void cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); cfile->fid.netfid = fid->netfid; cifs_set_oplock_level(cinode, oplock); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 7198eac5dddd..2ab297dae5a7 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -95,7 +95,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int max_num, num = 0, max_buf; struct smb2_lock_element *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = 1 + flock->fl_end - flock->fl_start; struct list_head tmp_llist; @@ -231,7 +231,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) unsigned int xid; unsigned int max_num, max_buf; struct smb2_lock_element *buf; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_fid_locks 
*fdlocks; xid = get_xid(); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 22dfdf17d065..1c5907019045 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -453,7 +453,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, list_for_each(tmp, &tcon->openFileList) { cfile = list_entry(tmp, struct cifsFileInfo, tlist); - cinode = CIFS_I(cfile->dentry->d_inode); + cinode = CIFS_I(d_inode(cfile->dentry)); if (memcmp(cinode->lease_key, rsp->LeaseKey, SMB2_LEASE_KEY_SIZE)) @@ -590,7 +590,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) continue; cifs_dbg(FYI, "file id match, oplock break\n"); - cinode = CIFS_I(cfile->dentry->d_inode); + cinode = CIFS_I(d_inode(cfile->dentry)); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eab05e1aa587..54daee5ad4c1 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -524,7 +524,7 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) static void smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; cfile->fid.persistent_fid = fid->persistent_fid; @@ -793,7 +793,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, * If extending file more than one page make sparse. Many Linux fs * make files sparse by default when extending via ftruncate */ - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); if (!set_alloc && (size > inode->i_size + 8192)) { __u8 set_sparse = 1; @@ -1032,7 +1032,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* if file not oplocked can't be sure whether asking to extend size */ @@ -1083,7 +1083,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* Need to make file sparse, if not already, before freeing range. 
*/ @@ -1115,7 +1115,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* if file not oplocked can't be sure whether asking to extend size */ diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 72a4d10653d6..ff9e1f8b16a4 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -50,9 +50,9 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -111,9 +111,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -177,12 +177,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, memcpy(pacl, ea_value, value_size); if (pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - value_size, direntry->d_inode, + value_size, d_inode(direntry), full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(direntry->d_inode)->time = 0; + CIFS_I(d_inode(direntry))->time = 0; kfree(pacl); } #else @@ -246,9 +246,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -324,7 +324,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, goto get_ea_exit; /* rc already EOPNOTSUPP */ pacl = pTcon->ses->server->ops->get_acl(cifs_sb, - direntry->d_inode, full_path, &acllen); + d_inode(direntry), full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); cifs_dbg(VFS, "%s: error %zd getting sec desc\n", @@ -382,9 +382,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 46ee6f238985..5bb630a769e0 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -94,8 +94,8 @@ static void coda_flag_children(struct dentry *parent, int flag) spin_lock(&parent->d_lock); list_for_each_entry(de, &parent->d_subdirs, d_child) { /* don't know what to do with negative dentries */ - if (de->d_inode ) - coda_flag_inode(de->d_inode, flag); + if (d_inode(de) ) + coda_flag_inode(d_inode(de), flag); } spin_unlock(&parent->d_lock); return; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 60cb88c1dd2b..fda9f4311212 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -201,7 +201,7 @@ err_out: static int coda_link(struct dentry *source_de, struct inode *dir_inode, struct dentry *de) { - struct inode *inode = source_de->d_inode; + struct inode *inode = d_inode(source_de); const char * name = de->d_name.name; int len = de->d_name.len; int error; @@ -266,7 +266,7 @@ static int coda_unlink(struct inode *dir, struct dentry *de) return error; coda_dir_update_mtime(dir); - drop_nlink(de->d_inode); + drop_nlink(d_inode(de)); return 0; } @@ -279,8 +279,8 @@ 
static int coda_rmdir(struct inode *dir, struct dentry *de) error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ - if (de->d_inode) - clear_nlink(de->d_inode); + if (d_really_is_positive(de)) + clear_nlink(d_inode(de)); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); @@ -303,14 +303,14 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); if (!error) { - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) { coda_dir_drop_nlink(old_dir); coda_dir_inc_nlink(new_dir); } coda_dir_update_mtime(old_dir); coda_dir_update_mtime(new_dir); - coda_flag_inode(new_dentry->d_inode, C_VATTR); + coda_flag_inode(d_inode(new_dentry), C_VATTR); } else { coda_flag_inode(old_dir, C_VATTR); coda_flag_inode(new_dir, C_VATTR); @@ -449,13 +449,13 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = de->d_inode; + inode = d_inode(de); if (!inode || is_root_inode(inode)) goto out; if (is_bad_inode(inode)) goto bad; - cii = ITOC(de->d_inode); + cii = ITOC(d_inode(de)); if (!(cii->c_flags & (C_PURGE | C_FLUSH))) goto out; @@ -487,11 +487,11 @@ static int coda_dentry_delete(const struct dentry * dentry) { int flags; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 0; - flags = (ITOC(dentry->d_inode)->c_flags) & C_PURGE; - if (is_bad_inode(dentry->d_inode) || flags) { + flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE; + if (is_bad_inode(d_inode(dentry)) || flags) { return 1; } return 0; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 82ec68b59208..cac1390b87a3 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -257,15 +257,15 @@ static void coda_evict_inode(struct inode *inode) int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err = coda_revalidate_inode(dentry->d_inode); + int err = coda_revalidate_inode(d_inode(dentry)); if (!err) - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return err; } int coda_setattr(struct dentry *de, struct iattr *iattr) { - struct inode *inode = de->d_inode; + struct inode *inode = d_inode(de); struct coda_vattr vattr; int error; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 4326d172fc27..f36a4040afb8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -72,7 +72,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, if (error) return error; - target_inode = path.dentry->d_inode; + target_inode = d_inode(path.dentry); /* return if it is not a Coda inode */ if (target_inode->i_sb != inode->i_sb) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 5bb6e27298a4..9b1ffaa0572e 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -820,8 +820,8 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) case CODA_FLUSH: coda_cache_clear_all(sb); shrink_dcache_sb(sb); - if (sb->s_root->d_inode) - coda_flag_inode(sb->s_root->d_inode, C_FLUSH); + if (d_really_is_positive(sb->s_root)) + coda_flag_inode(d_inode(sb->s_root), C_FLUSH); break; case CODA_PURGEUSER: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index acb3d63bc9dc..c81ce7f200a6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -289,7 +289,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry) configfs_set_dir_dirent_depth(p->d_fsdata, 
dentry->d_fsdata); error = configfs_create(dentry, mode, init_dir); if (!error) { - inc_nlink(p->d_inode); + inc_nlink(d_inode(p)); item->ci_dentry = dentry; } else { struct configfs_dirent *sd = dentry->d_fsdata; @@ -375,8 +375,8 @@ static void remove_dir(struct dentry * d) list_del_init(&sd->s_sibling); spin_unlock(&configfs_dirent_lock); configfs_put(sd); - if (d->d_inode) - simple_rmdir(parent->d_inode,d); + if (d_really_is_positive(d)) + simple_rmdir(d_inode(parent),d); pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); @@ -513,7 +513,7 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex /* Abort if racing with mkdir() */ if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { if (wait_mutex) - *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + *wait_mutex = &d_inode(sd->s_dentry)->i_mutex; return -EAGAIN; } @@ -624,13 +624,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&child->d_inode->i_mutex); + mutex_lock(&d_inode(child)->i_mutex); configfs_detach_group(sd->s_element); - child->d_inode->i_flags |= S_DEAD; + d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&child->d_inode->i_mutex); + mutex_unlock(&d_inode(child)->i_mutex); d_delete(child); dput(child); @@ -672,7 +672,7 @@ static int create_default_group(struct config_group *parent_group, sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; } else { - BUG_ON(child->d_inode); + BUG_ON(d_inode(child)); d_drop(child); dput(child); } @@ -818,11 +818,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); configfs_remove_dir(item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); d_delete(dentry); } } @@ -858,16 +858,16 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { configfs_detach_item(item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); if (ret) d_delete(dentry); } @@ -1075,7 +1075,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); root_sd = root->d_fsdata; @@ -1111,7 +1111,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, out_unlock_dirent_lock: spin_unlock(&configfs_dirent_lock); out_unlock_fs: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); /* * If we succeeded, the fs is pinned via other methods. 
If not, @@ -1453,11 +1453,11 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { - if (!new_dentry->d_inode) { + if (d_really_is_negative(new_dentry)) { error = config_item_set_name(item, "%s", new_name); if (!error) { d_add(new_dentry, NULL); @@ -1469,7 +1469,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); up_write(&configfs_rename_sem); return error; @@ -1482,7 +1482,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1495,7 +1495,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return err; } @@ -1505,11 +1505,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); release_configfs_dirent(cursor); @@ -1567,7 +1567,7 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) spin_lock(&configfs_dirent_lock); dentry = next->s_dentry; if (dentry) - inode = dentry->d_inode; + inode = d_inode(dentry); if (inode) ino = inode->i_ino; spin_unlock(&configfs_dirent_lock); @@ -1590,7 +1590,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); switch (whence) { case 1: offset += file->f_pos; @@ -1598,7 +1598,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -1624,7 +1624,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return offset; } @@ -1654,7 +1654,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1664,7 +1664,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) err = configfs_attach_group(sd->s_element, &group->cg_item, dentry); if (err) { - BUG_ON(dentry->d_inode); + BUG_ON(d_inode(dentry)); d_drop(dentry); dput(dentry); } else { @@ -1674,7 +1674,7 @@ int 
configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); if (err) { unlink_group(group); @@ -1695,9 +1695,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&root->d_inode->i_mutex, + mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1706,13 +1706,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); configfs_detach_group(&group->cg_item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); d_delete(dentry); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 56d2cdc9ae0a..403269ffcdf3 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -326,10 +326,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 5423a6a6ecc8..8d89f5fd0331 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -56,7 +56,7 @@ static const struct inode_operations configfs_inode_operations ={ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct configfs_dirent * sd = dentry->d_fsdata; struct iattr * sd_iattr; unsigned int ia_valid = iattr->ia_valid; @@ -186,7 +186,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in if (!dentry) return -ENOENT; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return -EEXIST; sd = dentry->d_fsdata; @@ -194,7 +194,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in if (!inode) return -ENOMEM; - p_inode = dentry->d_parent->d_inode; + p_inode = d_inode(dentry->d_parent); p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; configfs_set_inode_lock_class(sd, inode); @@ -236,11 +236,11 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && dentry->d_inode) { + if (!d_unhashed(dentry) && d_really_is_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); } else spin_unlock(&dentry->d_lock); } @@ -251,11 +251,11 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; - if (dir->d_inode == NULL) + if (d_really_is_negative(dir)) /* no inode means this hasn't been made visible yet */ return; - 
mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock(&d_inode(dir)->i_mutex);
	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
		if (!sd->s_element)
			continue;
@@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
			break;
		}
	}
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
}
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -209,7 +209,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
	}
	/* Protects against truncate */
- atomic_inc(&inode->i_dio_count);
+ inode_dio_begin(inode);
	retval = dax_io(inode, iter, pos, end, get_block, &bh);
@@ -219,7 +219,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);
- inode_dio_done(inode);
+ inode_dio_end(inode);
 out:
	return retval;
}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 517e64938438..830a7e76f5c6 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -45,7 +45,7 @@ const struct file_operations debugfs_file_operations = {
static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- nd_set_link(nd, dentry->d_inode->i_private);
+ nd_set_link(nd, d_inode(dentry)->i_private);
	return NULL;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c9ee0dfe90b5..c1e7ffb0dab6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -46,7 +46,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
static inline int debugfs_positive(struct dentry *dentry)
{
- return dentry->d_inode && !d_unhashed(dentry);
+ return d_really_is_positive(dentry) && !d_unhashed(dentry);
}
struct debugfs_mount_opts {
@@ -124,7 +124,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
static int debugfs_apply_options(struct super_block *sb)
{
	struct debugfs_fs_info *fsi = sb->s_fs_info;
- struct inode *inode = sb->s_root->d_inode;
+ struct inode *inode = d_inode(sb->s_root);
	struct debugfs_mount_opts *opts = &fsi->mount_opts;
	inode->i_mode &= ~S_IALLUGO;
@@ -188,7 +188,7 @@ static struct vfsmount *debugfs_automount(struct path *path)
{
	struct vfsmount *(*f)(void *);
	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
- return f(path->dentry->d_inode->i_private);
+ return f(d_inode(path->dentry)->i_private);
}
static const struct dentry_operations debugfs_dops = {
@@ -270,20 +270,20 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
	if (!parent)
		parent = debugfs_mount->mnt_root;
- mutex_lock(&parent->d_inode->i_mutex);
+ mutex_lock(&d_inode(parent)->i_mutex);
	dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry) && dentry->d_inode) {
+ if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
		dput(dentry);
		dentry = ERR_PTR(-EEXIST);
	}
	if (IS_ERR(dentry))
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
	return dentry;
}
static struct dentry *failed_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
	dput(dentry);
	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
	return NULL;
@@ -291,7 +291,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
static struct dentry *end_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
	return dentry;
}
@@ -344,7 +344,7 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
	inode->i_fop = fops ?
fops : &debugfs_file_operations; inode->i_private = data; d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); + fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_file); @@ -384,7 +384,7 @@ struct dentry *debugfs_create_file_size(const char *name, umode_t mode, struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (de) - de->d_inode->i_size = file_size; + d_inode(de)->i_size = file_size; return de; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); @@ -426,8 +426,8 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); @@ -525,9 +525,9 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) if (debugfs_positive(dentry)) { dget(dentry); if (d_is_dir(dentry)) - ret = simple_rmdir(parent->d_inode, dentry); + ret = simple_rmdir(d_inode(parent), dentry); else - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); if (!ret) d_delete(dentry); dput(dentry); @@ -557,12 +557,12 @@ void debugfs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -588,12 +588,12 @@ void debugfs_remove_recursive(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) return; parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -608,7 +608,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); parent = child; goto down; } @@ -629,10 +629,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); if (child != dentry) /* go up */ @@ -640,7 +640,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); @@ -672,27 +672,27 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? 
*/ - if (!old_dir->d_inode || !new_dir->d_inode) + if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) goto exit; /* Source does not exist, cyclic rename, or mountpoint? */ - if (!old_dentry->d_inode || old_dentry == trap || + if (d_really_is_negative(old_dentry) || old_dentry == trap || d_mountpoint(old_dentry)) goto exit; dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); /* Lookup failed, cyclic rename or target exists? */ - if (IS_ERR(dentry) || dentry == trap || dentry->d_inode) + if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; old_name = fsnotify_oldname_init(old_dentry->d_name.name); - error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, + error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), dentry); if (error) { fsnotify_oldname_free(old_name); goto exit; } d_move(old_dentry, dentry); - fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, d_is_dir(old_dentry), NULL, old_dentry); fsnotify_oldname_free(old_name); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index cfe8466f7fef..add566303c68 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -253,7 +253,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -290,7 +290,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return rc; } @@ -298,7 +298,7 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) { struct inode *inode; if (fsi->ptmx_dentry) { - inode = fsi->ptmx_dentry->d_inode; + inode = d_inode(fsi->ptmx_dentry); inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } @@ -602,18 +602,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_alloc_name(root, s); if (dentry) { d_add(dentry, inode); - fsnotify_create(root->d_inode, dentry); + fsnotify_create(d_inode(root), dentry); } else { iput(inode); inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return inode; } @@ -658,7 +658,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_find_alias(inode); @@ -667,7 +667,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index c3b560b24a46..745d2342651a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -253,7 +253,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, transferred, dio->private); - inode_dio_done(dio->inode); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(dio->inode); + if (is_async) { if (dio->rw & WRITE) { int err; @@ -1195,7 +1197,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* * Will be decremented 
at I/O completion time. */ - atomic_inc(&inode->i_dio_count); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); retval = 0; sdio.blkbits = blkbits; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 719e1ce1c609..97315f2f6816 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1326,7 +1326,7 @@ static int ecryptfs_read_headers_virt(char *page_virt, if (rc) goto out; if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) - ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); + ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry)); offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), &bytes_read); @@ -1425,7 +1425,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) { int rc; char *page_virt; - struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; + struct inode *ecryptfs_inode = d_inode(ecryptfs_dentry); struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 4000f6b3a750..8db0b464483f 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -54,11 +54,11 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { struct inode *lower_inode = - ecryptfs_inode_to_lower(dentry->d_inode); + ecryptfs_inode_to_lower(d_inode(dentry)); - fsstack_copy_attr_all(dentry->d_inode, lower_inode); + fsstack_copy_attr_all(d_inode(dentry), lower_inode); } return rc; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a65786e26b05..72afcc629d7b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -130,7 +130,7 @@ struct kmem_cache *ecryptfs_file_info_cache; static int read_or_initialize_metadata(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_crypt_stat *crypt_stat; int rc; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b08b5187f662..fc850b55db67 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT); + mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); } @@ -131,7 +131,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode, static int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, struct super_block *sb) { - struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb); + struct inode *inode = ecryptfs_get_inode(d_inode(lower_dentry), sb); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -189,21 +189,21 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); + rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); inode = ERR_PTR(rc); goto out_lock; } - inode 
= __ecryptfs_get_inode(lower_dentry->d_inode, + inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL); + vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL); goto out_lock; } - fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry)); out_lock: unlock_dir(lower_dir_dentry); return inode; @@ -332,7 +332,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry, struct inode *dir_inode) { - struct inode *inode, *lower_inode = lower_dentry->d_inode; + struct inode *inode, *lower_inode = d_inode(lower_dentry); struct ecryptfs_dentry_info *dentry_info; struct vfsmount *lower_mnt; int rc = 0; @@ -347,14 +347,14 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, } lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent)); BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); dentry_info->lower_path.mnt = lower_mnt; dentry_info->lower_path.dentry = lower_dentry; - if (!lower_dentry->d_inode) { + if (d_really_is_negative(lower_dentry)) { /* We want to add because we couldn't find in lower */ d_add(dentry, NULL); return 0; @@ -400,11 +400,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -412,7 +412,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, ecryptfs_dentry); goto out; } - if (lower_dentry->d_inode) + if (d_really_is_positive(lower_dentry)) goto interpose; mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; @@ -429,11 +429,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -458,24 +458,24 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, u64 file_size_save; int rc; - file_size_save = i_size_read(old_dentry->d_inode); + file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); dget(lower_new_dentry); lower_dir_dentry = lock_parent(lower_new_dentry); - rc = 
vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, + rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), lower_new_dentry, NULL); - if (rc || !lower_new_dentry->d_inode) + if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - set_nlink(old_dentry->d_inode, - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); - i_size_write(new_dentry->d_inode, file_size_save); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(d_inode(old_dentry), + ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); + i_size_write(d_inode(new_dentry), file_size_save); out_lock: unlock_dir(lower_dir_dentry); dput(lower_new_dentry); @@ -485,7 +485,7 @@ out_lock: static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) { - return ecryptfs_do_unlink(dir, dentry, dentry->d_inode); + return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, @@ -510,20 +510,20 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, + rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry, encoded_symname); kfree(encoded_symname); - if (rc || !lower_dentry->d_inode) + if (rc || d_really_is_negative(lower_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); out_lock: unlock_dir(lower_dir_dentry); dput(lower_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -536,18 +536,18 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); - if (rc || !lower_dentry->d_inode) + rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode); + if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); out: unlock_dir(lower_dir_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -562,12 +562,12 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) dget(dentry); lower_dir_dentry = lock_parent(lower_dentry); dget(lower_dentry); - rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); dput(lower_dentry); - if (!rc && dentry->d_inode) - clear_nlink(dentry->d_inode); - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); + if (!rc && 
d_really_is_positive(dentry)) + clear_nlink(d_inode(dentry)); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); @@ -584,17 +584,17 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); - if (rc || !lower_dentry->d_inode) + rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev); + if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); out: unlock_dir(lower_dir_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -617,7 +617,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, dget(lower_new_dentry); lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); - target_inode = new_dentry->d_inode; + target_inode = d_inode(new_dentry); trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); /* source should not be ancestor of target */ if (trap == lower_old_dentry) { @@ -629,17 +629,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = -ENOTEMPTY; goto out_lock; } - rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, - lower_new_dir_dentry->d_inode, lower_new_dentry, + rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, + d_inode(lower_new_dir_dentry), lower_new_dentry, NULL, 0); if (rc) goto out_lock; if (target_inode) fsstack_copy_attr_all(target_inode, ecryptfs_inode_to_lower(target_inode)); - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry)); if (new_dir != old_dir) - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); + fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dir_dentry); @@ -662,7 +662,7 @@ static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) return ERR_PTR(-ENOMEM); old_fs = get_fs(); set_fs(get_ds()); - rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, + rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry, (char __user *)lower_buf, PATH_MAX); set_fs(old_fs); @@ -681,8 +681,8 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) char *buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) goto out; - fsstack_copy_attr_atime(dentry->d_inode, - ecryptfs_dentry_to_lower(dentry)->d_inode); + fsstack_copy_attr_atime(d_inode(dentry), + d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; out: nd_set_link(nd, buf); @@ -738,7 +738,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, struct iattr *lower_ia) { int rc = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ecryptfs_crypt_stat *crypt_stat; loff_t i_size = i_size_read(inode); loff_t lower_size_before_truncate; @@ -751,7 +751,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, rc = 
ecryptfs_get_lower_file(dentry, inode); if (rc) return rc; - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; /* Switch on growing or shrinking file */ if (ia->ia_size > i_size) { char zero[] = { 0x00 }; @@ -858,7 +858,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct iattr lower_ia = { .ia_valid = 0 }; int rc; - rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); + rc = ecryptfs_inode_newsize_ok(d_inode(dentry), new_length); if (rc) return rc; @@ -866,9 +866,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); } return rc; } @@ -900,10 +900,10 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) struct inode *lower_inode; struct ecryptfs_crypt_stat *crypt_stat; - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) ecryptfs_init_crypt_stat(crypt_stat); - inode = dentry->d_inode; + inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); mutex_lock(&crypt_stat->cs_mutex); @@ -967,9 +967,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -983,7 +983,7 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -1007,9 +1007,9 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat); if (!rc) { - fsstack_copy_attr_all(dentry->d_inode, - ecryptfs_inode_to_lower(dentry->d_inode)); - generic_fillattr(dentry->d_inode, stat); + fsstack_copy_attr_all(d_inode(dentry), + ecryptfs_inode_to_lower(d_inode(dentry))); + generic_fillattr(d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1023,14 +1023,14 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->setxattr) { + if (!d_inode(lower_dentry)->i_op->setxattr) { rc = -EOPNOTSUPP; goto out; } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc && dentry->d_inode) - fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); + if (!rc && d_really_is_positive(dentry)) + fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); out: return rc; } @@ -1041,14 +1041,14 @@ 
ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, { int rc = 0; - if (!lower_dentry->d_inode->i_op->getxattr) { + if (!d_inode(lower_dentry)->i_op->getxattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value, + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } @@ -1068,13 +1068,13 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->listxattr) { + if (!d_inode(lower_dentry)->i_op->listxattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } @@ -1085,13 +1085,13 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->removexattr) { + if (!d_inode(lower_dentry)->i_op->removexattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index f1ea610362c6..866bb18efefe 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -144,7 +144,7 @@ int ecryptfs_privileged_open(struct file **lower_file, /* Corresponding dput() and mntput() are done when the * lower file is fput() when all eCryptfs files for the inode are * released. */ - flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR; + flags |= IS_RDONLY(d_inode(lower_dentry)) ? 
O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) goto out; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index c095d3264259..4f4d0474bee9 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -546,11 +546,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - if (check_ruid && !uid_eq(path.dentry->d_inode->i_uid, current_uid())) { + if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", - i_uid_read(path.dentry->d_inode), + i_uid_read(d_inode(path.dentry)), from_kuid(&init_user_ns, current_uid())); goto out_free; } @@ -584,7 +584,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - inode = ecryptfs_get_inode(path.dentry->d_inode, s); + inode = ecryptfs_get_inode(d_inode(path.dentry), s); rc = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 4626976794e7..cf208522998e 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -420,7 +420,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) void *xattr_virt; struct dentry *lower_dentry = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry; - struct inode *lower_inode = lower_dentry->d_inode; + struct inode *lower_inode = d_inode(lower_dentry); int rc; if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 07ab49745e31..3381b9da9ee6 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -145,12 +145,12 @@ out: static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) { - struct efivar_entry *var = dentry->d_inode->i_private; + struct efivar_entry *var = d_inode(dentry)->i_private; if (efivar_entry_delete(var)) return -EINVAL; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); dput(dentry); return 0; }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index ddbce42548c9..59fedbcf8798 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -144,7 +144,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; - inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0); if (!inode) goto fail_name; diff --git a/fs/efs/namei.c b/fs/efs/namei.c index bbee8f063dfa..40ba9cc41bf7 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -111,9 +111,9 @@ struct dentry *efs_get_parent(struct dentry *child) struct dentry *parent = ERR_PTR(-ENOENT); efs_ino_t ino; - ino = efs_find_entry(child->d_inode, "..", 2); + ino = efs_find_entry(d_inode(child), "..", 2); if (ino) - parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); + parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino)); return parent; } diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index d7defd557601..4deb0b05b011 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -379,7 +379,7 @@ ino_t exofs_parent_ino(struct dentry *child) struct exofs_dir_entry *de; ino_t ino; - de = exofs_dotdot(child->d_inode, &page); + de = exofs_dotdot(d_inode(child), &page); if (!de) return 0; @@ -429,7 +429,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, int exofs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = 
dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = exofs_chunk_size(dir); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 35073aaec6e0..786e4cc8c889 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1028,7 +1028,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) */ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; /* if we are about to modify an object, and it hasn't been diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 28907460e8fa..5ae25e431191 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -141,7 +141,7 @@ out_fail: static int exofs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); @@ -191,7 +191,7 @@ out_dir: static int exofs_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct exofs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -213,7 +213,7 @@ out: static int exofs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = -ENOTEMPTY; if (exofs_empty_dir(inode)) { @@ -230,8 +230,8 @@ static int exofs_rmdir(struct inode *dir, struct dentry *dentry) static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct exofs_dir_entry *dir_de = NULL; struct page *old_page; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index fcc2e565f540..b795c567b5e1 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -958,7 +958,7 @@ static struct dentry *exofs_get_parent(struct dentry *child) if (!ino) return ERR_PTR(-ESTALE); - return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino)); } static struct inode *exofs_nfs_get_inode(struct super_block *sb, diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 832e2624b80b..6f6f3a4c1365 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -37,7 +37,7 @@ static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct exofs_i_info *oi = exofs_i(dentry->d_inode); + struct exofs_i_info *oi = exofs_i(d_inode(dentry)); nd_set_link(nd, (char *)oi->i_data); return NULL; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6e1d4ab09d72..796b491e6978 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -486,7 +486,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, */ int ext2_add_link (struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = ext2_chunk_size(dir); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 6c14bb8322fa..5c04a0ddea80 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -278,7 +278,7 @@ static int find_group_orlov(struct 
super_block *sb, struct inode *parent) avefreeb = free_blocks / ngroups; ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || + if ((parent == d_inode(sb->s_root)) || (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { struct ext2_group_desc *best_desc = NULL; int best_ndir = inodes_per_group; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 5d9213963fae..f460ae36d5b7 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1544,7 +1544,7 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index ce422931f411..3e074a9ccbe6 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -79,10 +79,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns struct dentry *ext2_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot); + unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino)); } /* @@ -208,7 +208,7 @@ out_fail: static int ext2_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err; dquot_initialize(dir); @@ -275,7 +275,7 @@ out_dir: static int ext2_unlink(struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct ext2_dir_entry_2 * de; struct page * page; int err = -ENOENT; @@ -299,7 +299,7 @@ out: static int ext2_rmdir (struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (ext2_empty_dir(inode)) { @@ -316,8 +316,8 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry ) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct ext2_dir_entry_2 * dir_de = NULL; struct page * old_page; diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 565cf817bbf1..20608f17c2e5 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -23,7 +23,7 @@ static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ext2_inode_info *ei = EXT2_I(dentry->d_inode); + struct ext2_inode_info *ei = EXT2_I(d_inode(dentry)); nd_set_link(nd, (char *)ei->i_data); return NULL; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 91426141c33a..0b6bfd3a398b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -243,7 +243,7 @@ cleanup: static int ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; struct ext2_xattr_entry *entry; char *end; @@ -319,7 +319,7 @@ cleanup: /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * 
d_inode(dentry)->i_mutex: don't care */ ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index c0ebc4db8849..702fc6840246 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -28,7 +28,7 @@ ext2_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -38,7 +38,7 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 7e192574c001..42b6e9874bcc 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -32,7 +32,7 @@ ext2_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -42,7 +42,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index f470e44c4b8d..ecdc4605192c 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -36,7 +36,7 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER, name, buffer, size); } @@ -49,7 +49,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name, if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index a1b810230cc5..3ad242e5840e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -210,7 +210,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) avefreeb = freeb / ngroups; ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || + if ((parent == d_inode(sb->s_root)) || (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { int best_ndir = inodes_per_group; int best_group = -1; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 13c0868c7160..2ee2dc4351d1 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3240,7 +3240,7 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) */ int ext3_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error, rc = 0; const unsigned int ia_valid = attr->ia_valid; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index f197736dccfa..4264b9bd0002 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1049,19 +1049,19 @@ struct dentry *ext3_get_parent(struct dentry *child) struct ext3_dir_entry_2 * de; 
struct buffer_head *bh; - bh = ext3_find_entry(child->d_inode, &dotdot, &de); + bh = ext3_find_entry(d_inode(child), &dotdot, &de); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); - if (!ext3_valid_inum(child->d_inode->i_sb, ino)) { - ext3_error(child->d_inode->i_sb, "ext3_get_parent", + if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) { + ext3_error(d_inode(child)->i_sb, "ext3_get_parent", "bad inode number: %lu", ino); return ERR_PTR(-EIO); } - return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino)); } #define S_SHIFT 12 @@ -1243,7 +1243,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext3_dir_entry_2 *de, struct buffer_head * bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned long offset = 0; @@ -1330,7 +1330,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct buffer_head *bh2; @@ -1435,7 +1435,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct buffer_head * bh; struct ext3_dir_entry_2 *de; struct super_block * sb; @@ -1489,7 +1489,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_entry *entries, *at; struct dx_hash_info hinfo; struct buffer_head * bh; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct ext3_dir_entry_2 *de; int err; @@ -2111,7 +2111,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2125,7 +2125,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2173,7 +2173,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2187,7 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) if (!bh) goto end_unlink; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2328,7 +2328,7 @@ static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err, retries = 0; if 
(inode->i_nlink >= EXT3_LINK_MAX) @@ -2391,8 +2391,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new_dentry->d_inode) - dquot_initialize(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) + dquot_initialize(d_inode(new_dentry)); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); @@ -2409,12 +2409,12 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, * and merrily kill the link to whatever was created under the * same name. Goodbye sticky bit ;-< */ - old_inode = old_dentry->d_inode; + old_inode = d_inode(old_dentry); retval = -ENOENT; if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) goto end_rename; - new_inode = new_dentry->d_inode; + new_inode = d_inode(new_dentry); new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f037b4b27300..a9312f0a54e5 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1170,7 +1170,7 @@ static int parse_options (char *options, struct super_block *sb, return 0; } - journal_inode = path.dentry->d_inode; + journal_inode = d_inode(path.dentry); if (!S_ISBLK(journal_inode->i_mode)) { ext3_msg(sb, KERN_ERR, "error: journal path %s " "is not a block device", journal_path); @@ -2947,7 +2947,7 @@ static int ext3_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext3_journal_start(sb->s_root->d_inode, 2); + handle = ext3_journal_start(d_inode(sb->s_root), 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -2994,7 +2994,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(path->dentry->d_inode)) { + if (ext3_should_journal_data(d_inode(path->dentry))) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
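Nearly every hunk in this run is the same mechanical substitution: open-coded dentry->d_inode accesses are replaced with the accessor helpers from include/linux/dcache.h. As a rough sketch of why the conversion is textually safe (the helper bodies below are assumptions based on the 4.0-era header, not part of this diff), the helpers are currently trivial wrappers:

    /* Sketch of the include/linux/dcache.h helpers; exact bodies assumed. */
    static inline struct inode *d_inode(const struct dentry *dentry)
    {
            return dentry->d_inode;
    }

    /* Lockless variant for RCU-walk paths (see the fuse_dentry_revalidate()
     * hunk below, which replaces ACCESS_ONCE(entry->d_inode) with this). */
    static inline struct inode *d_inode_rcu(const struct dentry *dentry)
    {
            return ACCESS_ONCE(dentry->d_inode);
    }

    /* "Really" positive/negative: ask whether this dentry itself has an
     * inode, as opposed to whatever a future union/overlay layer shows. */
    static inline bool d_really_is_negative(const struct dentry *dentry)
    {
            return dentry->d_inode == NULL;
    }

    static inline bool d_really_is_positive(const struct dentry *dentry)
    {
            return dentry->d_inode != NULL;
    }

Because the wrappers just return the field, the conversion is behaviour-neutral today; its value is that every inode access now funnels through one place, so later VFS layering work only has to touch the helpers rather than hundreds of per-filesystem call sites. Plain NULL tests such as if (dentry->d_inode) become d_really_is_positive()/d_really_is_negative() where a filesystem genuinely asks whether the dentry has its own inode.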
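A second, independent rename rides along in the ext4 hunks below (fs/ext4/indirect.c and fs/ext4/inode.c): the open-coded atomic_inc(&inode->i_dio_count) and the old inode_dio_done() become the paired inode_dio_begin()/inode_dio_end(). A minimal sketch of the pair, assuming the include/linux/fs.h definitions of the same era:

    /* Assumed include/linux/fs.h definitions; shown for context only. */
    static inline void inode_dio_begin(struct inode *inode)
    {
            atomic_inc(&inode->i_dio_count);
    }

    static inline void inode_dio_end(struct inode *inode)
    {
            if (atomic_dec_and_test(&inode->i_dio_count))
                    wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
    }

Giving the increment and decrement a named begin/end API, instead of letting callers poke i_dio_count directly, leaves room to skip the atomic counting later for filesystems that serialize direct I/O on their own.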
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index 6b01c3eab1f3..ea96df3c58db 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -23,7 +23,7 @@ static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); + struct ext3_inode_info *ei = EXT3_I(d_inode(dentry)); nd_set_link(nd, (char*)ei->i_data); return NULL; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 24215dc09a18..7cf36501ccf4 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -137,7 +137,7 @@ ext3_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -355,7 +355,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, static int ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; @@ -391,7 +391,7 @@ cleanup: static int ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ext3_xattr_ibody_header *header; struct ext3_inode *raw_inode; struct ext3_iloc iloc; @@ -432,7 +432,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + down_read(&EXT3_I(d_inode(dentry))->xattr_sem); i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; @@ -445,7 +445,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (b_error < 0) i_error = 0; } - up_read(&EXT3_I(dentry->d_inode)->xattr_sem); + up_read(&EXT3_I(d_inode(dentry))->xattr_sem); return i_error + b_error; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 722c2bf9645d..c9506d5e3b13 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -29,7 +29,7 @@ ext3_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -39,7 +39,7 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index d75727cc67fa..206cc66dc285 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -32,7 +32,7 @@ ext3_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -42,7 +42,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 5612af3567e0..021508ad1616 100644 --- 
a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -34,7 +34,7 @@ ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER, name, buffer, size); } @@ -46,7 +46,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e9d632e9aa4b..8850254136ae 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -55,7 +55,7 @@ static int ext4_sync_parent(struct inode *inode) dentry = d_find_any_alias(inode); if (!dentry) break; - next = igrab(dentry->d_parent->d_inode); + next = igrab(d_inode(dentry->d_parent)); dput(dentry); if (!next) break; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2cf18a2d5c72..1eaa6cb96cd0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -443,7 +443,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && - ((parent == sb->s_root->d_inode) || + ((parent == d_inode(sb->s_root)) || (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3580629e42d3..958824019509 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -682,11 +682,11 @@ retry: * via ext4_inode_block_unlocked_dio(). Check inode's state * while holding extra i_dio_count ref. */ - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb(); if (unlikely(ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK))) { - inode_dio_done(inode); + inode_dio_end(inode); goto locked; } if (IS_DAX(inode)) @@ -697,7 +697,7 @@ retry: inode->i_sb->s_bdev, iter, offset, ext4_get_block, NULL, NULL, 0); - inode_dio_done(inode); + inode_dio_end(inode); } else { locked: if (IS_DAX(inode)) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index feb2cafbeace..095c7a258d97 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1000,7 +1000,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; int err; @@ -1254,7 +1254,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 366476e71e10..cbd0654a2675 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3077,7 +3077,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * overwrite DIO as i_dio_count needs to be incremented under i_mutex. 
*/ if (iov_iter_rw(iter) == WRITE) - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); /* If we do an overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); @@ -3182,7 +3182,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, retake_lock: if (iov_iter_rw(iter) == WRITE) - inode_dio_done(inode); + inode_dio_end(inode); /* take i_mutex locking again if we do an overwrite dio */ if (overwrite) { up_read(&EXT4_I(inode)->i_data_sem); @@ -4637,7 +4637,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; @@ -4785,7 +4785,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode; unsigned long long delalloc_blocks; - inode = dentry->d_inode; + inode = d_inode(dentry); generic_fillattr(inode, stat); /* diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 3cb267aee802..b52374e42102 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); - tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ef22cd951c0c..7223b0b4bc38 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1664,7 +1664,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); if (IS_ERR(bh)) return (struct dentry *) bh; if (!bh) @@ -1672,13 +1672,13 @@ struct dentry *ext4_get_parent(struct dentry *child) ino = le32_to_cpu(de->inode); brelse(bh); - if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - EXT4_ERROR_INODE(child->d_inode, + if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) { + EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EIO); } - return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino)); } /* @@ -1988,7 +1988,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; @@ -2048,7 +2048,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); #ifdef CONFIG_EXT4_FS_ENCRYPTION struct ext4_fname_crypto_ctx *ctx = NULL; int res; @@ -2202,7 +2202,7 @@ out_frames: static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh = NULL; struct ext4_dir_entry_2
*de; struct ext4_dir_entry_tail *t; @@ -2287,7 +2287,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_entry *entries, *at; struct dx_hash_info hinfo; struct buffer_head *bh; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; @@ -3063,7 +3063,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -3072,7 +3072,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (!bh) goto end_rmdir; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -3132,7 +3132,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -3141,7 +3141,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (!bh) goto end_unlink; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -3339,7 +3339,7 @@ static int ext4_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err, retries = 0; if (inode->i_nlink >= EXT4_LINK_MAX) @@ -3613,12 +3613,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, - .inode = old_dentry->d_inode, + .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, - .inode = new_dentry->d_inode, + .inode = d_inode(new_dentry), }; int force_reread; int retval; @@ -3809,12 +3809,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, - .inode = old_dentry->d_inode, + .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, - .inode = new_dentry->d_inode, + .inode = d_inode(new_dentry), }; u8 new_file_type; int retval; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 821f22dbe825..f06d0589ddba 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1556,7 +1556,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - journal_inode = path.dentry->d_inode; + journal_inode = d_inode(path.dentry); if (!S_ISBLK(journal_inode->i_mode)) { ext4_msg(sb, KERN_ERR, "error: journal path %s " "is not a block device", journal_path); @@ -5217,7 +5217,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); + handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -5265,7 +5265,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * all updates to the file when we bypass pagecache... 
*/ if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path->dentry->d_inode)) { + ext4_should_journal_data(d_inode(path->dentry))) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 136ca0e911fd..19f78f20975e 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -28,7 +28,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) struct page *cpage = NULL; char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); @@ -43,8 +43,8 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) return ctx; if (ext4_inode_is_fast_symlink(inode)) { - caddr = (char *) EXT4_I(dentry->d_inode)->i_data; - max_size = sizeof(EXT4_I(dentry->d_inode)->i_data); + caddr = (char *) EXT4_I(inode)->i_data; + max_size = sizeof(EXT4_I(inode)->i_data); } else { cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) { @@ -113,7 +113,7 @@ static void ext4_put_link(struct dentry *dentry, struct nameidata *nd, static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd) { - struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); + struct ext4_inode_info *ei = EXT4_I(d_inode(dentry)); nd_set_link(nd, (char *) ei->i_data); return NULL; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 759842ff8af0..16e28c08d1e8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -178,7 +178,7 @@ ext4_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -423,7 +423,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); @@ -460,7 +460,7 @@ cleanup: static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -501,7 +501,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; - down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; @@ -514,7 +514,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto errout; ret += ret2; errout: - up_read(&EXT4_I(dentry->d_inode)->xattr_sem); + up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index d2a200624af5..95d90e0560f0 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -33,7 +33,7 @@ ext4_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, 
name, buffer, size); } @@ -43,7 +43,7 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 95f1f4ab59a4..891ee2ddfbd6 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -36,7 +36,7 @@ ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -46,7 +46,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 0edb7611ffbe..6ed932b3c043 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -37,7 +37,7 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, name, buffer, size); } @@ -49,7 +49,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c06a25e5cec3..d8921cf2ba9a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1482,7 +1482,7 @@ bool f2fs_empty_dir(struct inode *); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { - return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name, + return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a6f3f6186588..2b52e48d7482 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -574,7 +574,7 @@ void f2fs_truncate(struct inode *inode) int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blocks <<= 3; return 0; @@ -613,7 +613,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); int err; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 407dde3d7a92..7e3794edae42 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -151,7 +151,7 @@ out: static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; @@ -182,10 +182,10 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(child->d_inode, 
&dotdot); + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); } static int __recover_dot_dentries(struct inode *dir, nid_t pino) @@ -263,7 +263,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -403,7 +403,7 @@ out_fail: static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; @@ -451,8 +451,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *old_dir_page; struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL; @@ -578,8 +578,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *old_dir_page, *new_dir_page; struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index b0fd2f2d0716..9757f65a05bc 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -83,7 +83,7 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, buffer, size, NULL); + return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL); } static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, @@ -108,7 +108,7 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, + return f2fs_setxattr(d_inode(dentry), type, name, value, size, NULL, flags); } @@ -130,7 +130,7 @@ static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; @@ -143,7 +143,7 @@ static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; @@ -444,7 +444,7 @@ cleanup: ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode 
*inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr; int error = 0; diff --git a/fs/fat/file.c b/fs/fat/file.c index cf50d93565a2..442d50a0e33e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -305,7 +305,7 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; @@ -377,7 +377,7 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) int fat_setattr(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index cc6a8541b668..b7e2b33aa793 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -308,7 +308,7 @@ out: static int msdos_rmdir(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct fat_slot_info sinfo; int err; @@ -402,7 +402,7 @@ out: /***** Unlink a file */ static int msdos_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct fat_slot_info sinfo; int err; @@ -440,8 +440,8 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, int err, old_attrs, is_dir, update_dotdot, corrupt = 0; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); err = fat_scan(old_dir, old_name, &old_sinfo); if (err) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 7e0974eebd8e..7092584f424a 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -33,7 +33,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (dentry->d_time != dentry->d_parent->d_inode->i_version) + if (dentry->d_time != d_inode(dentry->d_parent)->i_version) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -45,7 +45,7 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; /* This is not a negative dentry. Always valid. */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; return vfat_revalidate_shortname(dentry); } @@ -65,7 +65,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) * positive dentry isn't a good idea. So it's unsupported like * rename("filename", "FILENAME") for now. 
*/ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; /* @@ -801,7 +801,7 @@ out: static int vfat_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; @@ -832,7 +832,7 @@ out: static int vfat_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; @@ -915,8 +915,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); mutex_lock(&MSDOS_SB(sb)->s_lock); err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); if (err) diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 93e14933dcb6..eb192656fba2 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -266,7 +266,7 @@ struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) * Find the parent for a directory that is not currently connected to * the filesystem root. * - * On entry, the caller holds child_dir->d_inode->i_mutex. + * On entry, the caller holds d_inode(child_dir)->i_mutex. */ static struct dentry *fat_get_parent(struct dentry *child_dir) { @@ -276,7 +276,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir) struct inode *parent_inode = NULL; struct msdos_sb_info *sbi = MSDOS_SB(sb); - if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) { + if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) { int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index c36aeaf92e41..8b9229e2ca5c 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -76,7 +76,7 @@ const struct address_space_operations vxfs_immed_aops = { static void * vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) { - struct vxfs_inode_info *vip = VXFS_INO(dp->d_inode); + struct vxfs_inode_info *vip = VXFS_INO(d_inode(dp)); nd_set_link(np, vip->vii_immed.vi_immed); return NULL; } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 205e0d5d5307..f863ac6647ac 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -244,7 +244,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) return 0; parent = fuse_control_sb->s_root; - inc_nlink(parent->d_inode); + inc_nlink(d_inode(parent)); sprintf(name, "%u", fc->dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, &simple_dir_inode_operations, @@ -283,11 +283,11 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; - dentry->d_inode->i_private = NULL; + d_inode(dentry)->i_private = NULL; d_drop(dentry); dput(dentry); } - drop_nlink(fuse_control_sb->s_root->d_inode); + drop_nlink(d_inode(fuse_control_sb->s_root)); } static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1545b711ddcf..0572bca49f15 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -192,7 +192,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) struct fuse_inode *fi; int ret; - inode = 
ACCESS_ONCE(entry->d_inode); + inode = d_inode_rcu(entry); if (inode && is_bad_inode(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || @@ -220,7 +220,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(fc, &args, get_node_id(parent->d_inode), + fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); ret = fuse_simple_request(fc, &args); dput(parent); @@ -254,7 +254,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { parent = dget_parent(entry); - fuse_advise_use_readdirplus(parent->d_inode); + fuse_advise_use_readdirplus(d_inode(parent)); dput(parent); } } @@ -487,7 +487,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, entry = res; } - if (!(flags & O_CREAT) || entry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(entry)) goto no_open; /* Only creates */ @@ -653,7 +653,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fc->lock); @@ -689,7 +689,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { - clear_nlink(entry->d_inode); + clear_nlink(d_inode(entry)); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) @@ -721,12 +721,12 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ - fuse_invalidate_attr(oldent->d_inode); - fuse_update_ctime(oldent->d_inode); + fuse_invalidate_attr(d_inode(oldent)); + fuse_update_ctime(d_inode(oldent)); if (flags & RENAME_EXCHANGE) { - fuse_invalidate_attr(newent->d_inode); - fuse_update_ctime(newent->d_inode); + fuse_invalidate_attr(d_inode(newent)); + fuse_update_ctime(d_inode(newent)); } fuse_invalidate_attr(olddir); @@ -734,10 +734,10 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { - fuse_invalidate_attr(newent->d_inode); + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { + fuse_invalidate_attr(d_inode(newent)); fuse_invalidate_entry_cache(newent); - fuse_update_ctime(newent->d_inode); + fuse_update_ctime(d_inode(newent)); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -746,7 +746,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, directory), then there can be inconsistency between the dcache and the real filesystem. Tough luck. 
*/ fuse_invalidate_entry(oldent); - if (newent->d_inode) + if (d_really_is_positive(newent)) fuse_invalidate_entry(newent); } @@ -788,7 +788,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, { int err; struct fuse_link_in inarg; - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); @@ -961,9 +961,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_attr(parent); fuse_invalidate_entry(entry); - if (child_nodeid != 0 && entry->d_inode) { - mutex_lock(&entry->d_inode->i_mutex); - if (get_node_id(entry->d_inode) != child_nodeid) { + if (child_nodeid != 0 && d_really_is_positive(entry)) { + mutex_lock(&d_inode(entry)->i_mutex); + if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; } @@ -977,13 +977,13 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, err = -ENOTEMPTY; goto badentry; } - entry->d_inode->i_flags |= S_DEAD; + d_inode(entry)->i_flags |= S_DEAD; } dont_mount(entry); - clear_nlink(entry->d_inode); + clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&entry->d_inode->i_mutex); + mutex_unlock(&d_inode(entry)->i_mutex); if (!err) d_delete(entry); } else { @@ -1169,7 +1169,7 @@ static int fuse_direntplus_link(struct file *file, struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); struct dentry *dentry; struct dentry *alias; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; @@ -1205,7 +1205,7 @@ static int fuse_direntplus_link(struct file *file, name.hash = full_name_hash(name.name, name.len); dentry = d_lookup(parent, &name); if (dentry) { - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) { d_drop(dentry); } else if (get_node_id(inode) != o->nodeid || @@ -1367,7 +1367,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) static char *read_link(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); char *link; @@ -1712,7 +1712,7 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; @@ -1726,7 +1726,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, struct kstat *stat) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); if (!fuse_allow_current_process(fc)) @@ -1738,7 +1738,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, static int fuse_setxattr(struct dentry *entry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; @@ -1774,7 +1774,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, static ssize_t fuse_getxattr(struct dentry *entry, const char *name, void *value, size_t size) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; @@ -1815,7 +1815,7 @@ static ssize_t 
fuse_getxattr(struct dentry *entry, const char *name, static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; @@ -1857,7 +1857,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) static int fuse_removexattr(struct dentry *entry, const char *name) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); int err; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e8799c11424b..082ac1c97f39 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -421,7 +421,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) memset(&outarg, 0, sizeof(outarg)); args.in.numargs = 0; args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(dentry->d_inode); + args.in.h.nodeid = get_node_id(d_inode(dentry)); args.out.numargs = 1; args.out.args[0].size = sizeof(outarg); args.out.args[0].value = &outarg; @@ -740,7 +740,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, static struct dentry *fuse_get_parent(struct dentry *child) { - struct inode *child_inode = child->d_inode; + struct inode *child_inode = d_inode(child); struct fuse_conn *fc = get_fuse_conn(child_inode); struct inode *inode; struct dentry *parent; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 589f4ea9381c..30822b148f3e 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -48,9 +48,9 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; parent = dget_parent(dentry); - sdp = GFS2_SB(parent->d_inode); - dip = GFS2_I(parent->d_inode); - inode = dentry->d_inode; + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); + inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) @@ -68,7 +68,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) goto fail; } - error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); + error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); switch (error) { case 0: if (!inode) @@ -113,10 +113,10 @@ static int gfs2_dentry_delete(const struct dentry *dentry) { struct gfs2_inode *ginode; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 0; - ginode = GFS2_I(dentry->d_inode); + ginode = GFS2_I(d_inode(dentry)); if (!ginode->i_iopen_gh.gh_gl) return 0; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index c41d255b6a7b..5d15e9498b48 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -49,7 +49,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_SMALL_FH_SIZE; - if (!parent || inode == sb->s_root->d_inode) + if (!parent || inode == d_inode(sb->s_root)) return *len; ip = GFS2_I(parent); @@ -88,8 +88,8 @@ static int get_name_filldir(struct dir_context *ctx, const char *name, static int gfs2_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *dir = parent->d_inode; - struct inode *inode = child->d_inode; + struct inode *dir = d_inode(parent); + struct inode *inode = d_inode(child); struct gfs2_inode *dip, *ip; struct get_name_filldir gnfd = { .ctx.actor = get_name_filldir, @@ -128,7 +128,7 @@ static int gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - return 
d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); + return d_obtain_alias(gfs2_lookupi(d_inode(child), &gfs2_qdotdot, 1)); } static struct dentry *gfs2_get_dentry(struct super_block *sb, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 08bc84d7e768..1b3ca7a2e3fc 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -295,7 +295,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || (name->len == 2 && memcmp(name->name, "..", 2) == 0 && - dir == sb->s_root->d_inode)) { + dir == d_inode(sb->s_root))) { igrab(dir); return dir; } @@ -687,7 +687,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_set_inode_flags(inode); - if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || + if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || (dip->i_diskflags & GFS2_DIF_TOPDIR)) aflags |= GFS2_AF_ORLOV; @@ -888,7 +888,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, { struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; @@ -1055,7 +1055,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, static int gfs2_unlink_inode(struct gfs2_inode *dip, const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); int error; @@ -1091,7 +1091,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) { struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; @@ -1241,7 +1241,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, return PTR_ERR(d); if (d != NULL) dentry = d; - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { if (!(*opened & FILE_OPENED)) return finish_no_open(file, d); dput(d); @@ -1282,7 +1282,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) error = -EINVAL; break; } - if (dir == sb->s_root->d_inode) { + if (dir == d_inode(sb->s_root)) { error = 0; break; } @@ -1321,7 +1321,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, { struct gfs2_inode *odip = GFS2_I(odir); struct gfs2_inode *ndip = GFS2_I(ndir); - struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(odentry)); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; @@ -1332,8 +1332,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, unsigned int x; int error; - if (ndentry->d_inode) { - nip = GFS2_I(ndentry->d_inode); + if (d_really_is_positive(ndentry)) { + nip = GFS2_I(d_inode(ndentry)); if (ip == nip) return 0; } @@ -1457,7 +1457,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(d_inode(odentry), MAY_WRITE); if (error) goto out_gunlock; } @@ -1550,7 +1550,7 @@ out: static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = 
GFS2_I(d_inode(dentry)); struct gfs2_holder i_gh; struct buffer_head *dibh; unsigned int size; @@ -1742,7 +1742,7 @@ out: static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; int error; @@ -1798,7 +1798,7 @@ out: static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; @@ -1821,7 +1821,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, static int gfs2_setxattr(struct dentry *dentry, const char *name, const void *data, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; @@ -1841,7 +1841,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, void *data, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; @@ -1862,7 +1862,7 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, static int gfs2_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efc8e254787c..35b49f44c72f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -647,7 +647,7 @@ out_unlock: static int init_journal(struct gfs2_sbd *sdp, int undo) { - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); struct gfs2_holder ji_gh; struct gfs2_inode *ip; int jindex = 1; @@ -782,7 +782,7 @@ static struct lock_class_key gfs2_quota_imutex_key; static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); if (undo) goto fail_qinode; @@ -848,7 +848,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) char buf[30]; int error = 0; struct gfs2_inode *ip; - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); if (sdp->sd_args.ar_spectator) return 0; @@ -1357,7 +1357,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - path.dentry->d_inode->i_sb->s_bdev); + d_inode(path.dentry)->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1666382b198d..859c6edbf81a 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1171,7 +1171,7 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = d_inode(dentry)->i_sb; struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_statfs_change_host sc; int error; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index fd260ce8869a..4c096fa9e2a1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c 
@@ -420,7 +420,7 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; @@ -586,7 +586,7 @@ out: static int gfs2_xattr_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_location el; int error; @@ -1230,7 +1230,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, static int gfs2_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - return __gfs2_xattr_set(dentry->d_inode, name, value, + return __gfs2_xattr_set(d_inode(dentry), name, value, size, flags, type); } diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index e057ec542a6a..8d931b157bbe 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -16,7 +16,7 @@ int hfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; @@ -59,7 +59,7 @@ out: ssize_t hfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; @@ -105,7 +105,7 @@ out: ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) return -EOPNOTSUPP; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 36d1a6ae7655..70788e03820a 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -253,7 +253,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) */ static int hfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int res; if (S_ISDIR(inode->i_mode) && inode->i_size != 2) @@ -285,18 +285,18 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, int res; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); if (res) return res; } - res = hfs_cat_move(old_dentry->d_inode->i_ino, + res = hfs_cat_move(d_inode(old_dentry)->i_ino, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) hfs_cat_build_key(old_dir->i_sb, - (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key, + (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key, new_dir->i_ino, &new_dentry->d_name); return res; } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 75fd5d873c19..b99ebddb10cb 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -600,7 +600,7 @@ static int hfs_file_release(struct inode *inode, struct file *file) int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 91b91fd3a901..2875961fdc10 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -21,7 +21,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned 
int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); if(!inode) return 1; diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 3074609befc3..d0f39dcbb58e 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -81,7 +81,7 @@ again: HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> create_date || entry.file.create_date == - HFSPLUS_I(sb->s_root->d_inode)-> + HFSPLUS_I(d_inode(sb->s_root))-> create_date) && HFSPLUS_SB(sb)->hidden_dir) { struct qstr str; @@ -296,8 +296,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, struct dentry *dst_dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); - struct inode *inode = src_dentry->d_inode; - struct inode *src_dir = src_dentry->d_parent->d_inode; + struct inode *inode = d_inode(src_dentry); + struct inode *src_dir = d_inode(src_dentry->d_parent); struct qstr str; char name[32]; u32 cnid, id; @@ -353,7 +353,7 @@ out: static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct qstr str; char name[32]; u32 cnid; @@ -410,7 +410,7 @@ out: static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int res; if (inode->i_size != 2) @@ -529,7 +529,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, int res; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) res = hfsplus_rmdir(new_dir, new_dentry); else diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b0afedbef12b..6dd107d7421e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -243,7 +243,7 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 8e98f5db6ad6..0624ce4e0702 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -26,7 +26,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); struct hfsplus_vh *vh = sbi->s_vhdr; struct hfsplus_vh *bvh = sbi->s_backup_vhdr; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 89f262d8fcd8..416b1dbafe51 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -440,7 +440,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_setxattr(dentry->d_inode, xattr_name, value, size, + res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, flags); kfree(xattr_name); return res; @@ -600,7 +600,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_getxattr(dentry->d_inode, xattr_name, value, size); + res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size); kfree(xattr_name); return res; @@ -620,7 +620,7 @@ static ssize_t 
hfsplus_listxattr_finder_info(struct dentry *dentry, char *buffer, size_t size) { ssize_t res = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 entry_type; u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; @@ -688,7 +688,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t err; ssize_t res = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 key_len = 0; struct hfsplus_attr_key attr_key; @@ -868,7 +868,7 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_getxattr(dentry->d_inode, name, buffer, size); + return __hfsplus_getxattr(d_inode(dentry), name, buffer, size); } static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, @@ -890,7 +890,7 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_setxattr(dentry->d_inode, name, buffer, size, flags); + return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); } static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index b83a0343378b..ef263174acd2 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -807,7 +807,7 @@ static int hostfs_permission(struct inode *ino, int desired) static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hostfs_iattr attrs; char *name; int err; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 7ce4b74234a1..933c73780813 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,7 @@ void hpfs_write_inode_nolock(struct inode *i) int hpfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error = -EINVAL; hpfs_lock(inode->i_sb); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index bdbc2c3080a4..a0872f239f04 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -359,7 +359,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); dnode_secno dno; int r; int rep = 0; @@ -433,7 +433,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); dnode_secno dno; int n_items = 0; int err; @@ -522,8 +522,8 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned old_len = old_dentry->d_name.len; const unsigned char *new_name = new_dentry->d_name.name; unsigned new_len = new_dentry->d_name.len; - struct inode *i = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *i = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct quad_buffer_head qbh, qbh1; struct hpfs_dirent *dep, *nde; struct hpfs_dirent de; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 
043ac9d77262..fa2bd5366ecf 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -153,9 +153,9 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, return ERR_PTR(-ENOENT); parent = HPPFS_I(ino)->proc_dentry; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); proc_dentry = lookup_one_len(name->name, parent, name->len); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (IS_ERR(proc_dentry)) return proc_dentry; @@ -637,25 +637,25 @@ static const struct super_operations hppfs_sbops = { static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; + return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer, buflen); } static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); + return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd); } static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - if (proc_dentry->d_inode->i_op->put_link) - proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); + if (d_inode(proc_dentry)->i_op->put_link) + d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie); } static const struct inode_operations hppfs_dir_iops = { @@ -670,7 +670,7 @@ static const struct inode_operations hppfs_link_iops = { static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) { - struct inode *proc_ino = dentry->d_inode; + struct inode *proc_ino = d_inode(dentry); struct inode *inode = new_inode(sb); if (!inode) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2640d88b0e63..87724c1d7be6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -393,7 +393,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -587,7 +587,7 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); - struct hstate *h = hstate_inode(dentry->d_inode); + struct hstate *h = hstate_inode(d_inode(dentry)); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); diff --git a/fs/inode.c b/fs/inode.c index f00b16f45507..ea37cd17b53f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1587,7 +1587,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; - struct inode *inode = path->dentry->d_inode; + struct inode *inode = d_inode(path->dentry); struct timespec now; if (inode->i_flags & S_NOATIME) @@ -1639,7 +1639,7 @@ EXPORT_SYMBOL(touch_atime); */ int should_remove_suid(struct dentry *dentry) { - umode_t mode = 
dentry->d_inode->i_mode; + umode_t mode = d_inode(dentry)->i_mode; int kill = 0; /* suid always must be killed */ @@ -1675,7 +1675,7 @@ static int __remove_suid(struct dentry *dentry, int kill) int file_remove_suid(struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int killsuid; int killpriv; int error = 0; @@ -1946,20 +1946,6 @@ void inode_dio_wait(struct inode *inode) EXPORT_SYMBOL(inode_dio_wait); /* - * inode_dio_done - signal finish of a direct I/O requests - * @inode: inode the direct I/O happens on - * - * This is called once we've finished processing a direct I/O request, - * and is used to wake up callers waiting for direct I/O to be quiesced. - */ -void inode_dio_done(struct inode *inode) -{ - if (atomic_dec_and_test(&inode->i_dio_count)) - wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); -} -EXPORT_SYMBOL(inode_dio_done); - -/* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_mutex, or else be sure that diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 12088d8de3fa..0c5f721b4e91 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -44,7 +44,7 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) { unsigned long parent_block = 0; unsigned long parent_offset = 0; - struct inode *child_inode = child->d_inode; + struct inode *child_inode = d_inode(child); struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); struct iso_directory_record *de = NULL; struct buffer_head * bh = NULL; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f21b6fb5e4c4..1ba5c97943b8 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -224,14 +224,14 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) { struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); - struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(d_inode(dentry)); int ret; uint32_t now = get_seconds(); ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink); + set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -241,8 +241,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_inode->i_sb); - struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); int ret; uint8_t type; @@ -256,7 +256,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de return -EPERM; /* XXX: This is ugly */ - type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; if (!type) type = DT_REG; now = get_seconds(); @@ -264,11 +264,11 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!ret) { mutex_lock(&f->sem); - set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink); + set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); - d_instantiate(dentry, 
old_dentry->d_inode); + d_instantiate(dentry, d_inode(old_dentry)); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - ihold(old_dentry->d_inode); + ihold(d_inode(old_dentry)); } return ret; } @@ -585,7 +585,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) { struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); - struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); struct jffs2_full_dirent *fd; int ret; uint32_t now = get_seconds(); @@ -599,7 +599,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) dentry->d_name.len, f, now); if (!ret) { dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - clear_nlink(dentry->d_inode); + clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } return ret; @@ -770,8 +770,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, * the VFS can't check whether the victim is empty. The filesystem * needs to do that for itself. */ - if (new_dentry->d_inode) { - victim_f = JFFS2_INODE_INFO(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) { + victim_f = JFFS2_INODE_INFO(d_inode(new_dentry)); if (d_is_dir(new_dentry)) { struct jffs2_full_dirent *fd; @@ -794,12 +794,12 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, /* Make a hard link */ /* XXX: This is ugly */ - type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; if (!type) type = DT_REG; now = get_seconds(); ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i), - old_dentry->d_inode->i_ino, type, + d_inode(old_dentry)->i_ino, type, new_dentry->d_name.name, new_dentry->d_name.len, now); if (ret) @@ -808,9 +808,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (victim_f) { /* There was a victim. Kill it off nicely */ if (d_is_dir(new_dentry)) - clear_nlink(new_dentry->d_inode); + clear_nlink(d_inode(new_dentry)); else - drop_nlink(new_dentry->d_inode); + drop_nlink(d_inode(new_dentry)); /* Don't oops if the victim was a dirent pointing to an inode which didn't exist. */ if (victim_f->inocache) { @@ -836,9 +836,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (ret) { /* Oh shit. We really ought to make a single node which can do both atomically */ - struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); mutex_lock(&f->sem); - inc_nlink(old_dentry->d_inode); + inc_nlink(d_inode(old_dentry)); if (f->inocache && !d_is_dir(old_dentry)) f->inocache->pino_nlink++; mutex_unlock(&f->sem); @@ -846,8 +846,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). 
You now have a hard link\n", __func__, ret); /* Might as well let the VFS know */ - d_instantiate(new_dentry, old_dentry->d_inode); - ihold(old_dentry->d_inode); + d_instantiate(new_dentry, d_inode(old_dentry)); + ihold(d_inode(old_dentry)); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 601afd1afddf..fe5ea080b4ec 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,7 +190,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; rc = inode_change_ok(inode, iattr); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index aca97f35b292..d4b43fb7adb1 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -54,7 +54,7 @@ static int jffs2_security_getxattr(struct dentry *dentry, const char *name, if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size); } @@ -64,7 +64,7 @@ static int jffs2_security_setxattr(struct dentry *dentry, const char *name, if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 3d76f28a2ba9..d86c5e3176a1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -140,14 +140,14 @@ static struct dentry *jffs2_get_parent(struct dentry *child) BUG_ON(!d_is_dir(child)); - f = JFFS2_INODE_INFO(child->d_inode); + f = JFFS2_INODE_INFO(d_inode(child)); pino = f->inocache->pino_nlink; JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", f->inocache->ino, pino); - return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); + return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino)); } static const struct export_operations jffs2_export_ops = { diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index c7c77b0dfccd..1fefa25d0fa5 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -31,7 +31,7 @@ const struct inode_operations jffs2_symlink_inode_operations = static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); char *p = (char *)f->target; /* diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 762c7a3cf43d..f092fee5be50 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -960,7 +960,7 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_inode_cache *ic = f->inocache; @@ -1266,7 +1266,6 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ if (rc) { JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", __func__, rc, totlen); - rc = rc ? 
rc : -EBADFD; goto out; } rc = save_xattr_ref(c, ref); diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 1c868194c504..ceaf9c693225 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -21,7 +21,7 @@ static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size); } @@ -30,7 +30,7 @@ static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 916b5c966039..a71391eba514 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -21,7 +21,7 @@ static int jffs2_user_getxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size); } @@ -30,7 +30,7 @@ static int jffs2_user_setxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index ae46788b9723..e98d39d75cf4 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -100,7 +100,7 @@ static int jfs_release(struct inode *inode, struct file *file) int jfs_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; rc = inode_change_ok(inode, iattr); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 38fdc533f4ec..66db7bc0ed10 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -346,7 +346,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; struct inode *iplist[2]; @@ -472,7 +472,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; /* object name */ struct inode *iplist[2]; @@ -791,7 +791,7 @@ static int jfs_link(struct dentry *old_dentry, { int rc; tid_t tid; - struct inode *ip = old_dentry->d_inode; + struct inode *ip = d_inode(old_dentry); ino_t ino; struct component_name dname; struct btstack btstack; @@ -879,7 +879,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, struct component_name dname; int ssize; /* source pathname size */ struct btstack btstack; - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); unchar *i_fastsymlink; s64 xlen = 0; int bmask = 0, xsize; @@ -1086,8 +1086,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(old_dir); dquot_initialize(new_dir); - old_ip = old_dentry->d_inode; - new_ip = new_dentry->d_inode; + old_ip = d_inode(old_dentry); + new_ip = d_inode(new_dentry); if ((rc = get_UCSname(&old_dname, old_dentry))) goto out1; @@ -1500,9 +1500,9 @@ struct dentry *jfs_get_parent(struct 
dentry *dentry) unsigned long parent_ino; parent_ino = - le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot); + le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot); - return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino)); + return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino)); } const struct inode_operations jfs_dir_inode_operations = { @@ -1578,7 +1578,7 @@ static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) * positive dentry isn't good idea. So it's unsupported like * rename("filename", "FILENAME") for now. */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; /* diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 205b946d8e0d..80f42bcc4ef1 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -24,7 +24,7 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *s = JFS_IP(dentry->d_inode)->i_inline; + char *s = JFS_IP(d_inode(dentry))->i_inline; nd_set_link(nd, s); return NULL; } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 46325d5c34fc..48b15a6e5558 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -849,7 +849,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t value_len, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jfs_inode_info *ji = JFS_IP(inode); int rc; tid_t tid; @@ -872,7 +872,7 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, dentry->d_inode, name, value, value_len, + rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len, flags); if (!rc) rc = txCommit(tid, 1, &inode, 0); @@ -959,7 +959,7 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, return -EOPNOTSUPP; } - err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); + err = __jfs_getxattr(d_inode(dentry), name, data, buf_size); return err; } @@ -976,7 +976,7 @@ static inline int can_list(struct jfs_ea *ea) ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); char *buffer; ssize_t size = 0; int xattr_size; @@ -1029,7 +1029,7 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) int jfs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jfs_inode_info *ji = JFS_IP(inode); int rc; tid_t tid; @@ -1047,7 +1047,7 @@ int jfs_removexattr(struct dentry *dentry, const char *name) tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); if (!rc) rc = txCommit(tid, 1, &inode, 0); txEnd(tid); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6acc9648f986..f131fc23ffc4 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -444,7 +444,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; /* Always perform fresh lookup for negatives */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_bad_unlocked; kn = dentry->d_fsdata; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 9000874a945b..2da8493a380b 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ 
-111,7 +111,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct kernfs_node *kn = dentry->d_fsdata; int error; @@ -172,11 +172,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, + error = security_inode_setsecurity(d_inode(dentry), suffix, value, size, flags); if (error) return error; - error = security_inode_getsecctx(dentry->d_inode, + error = security_inode_getsecctx(d_inode(dentry), &secdata, &secdata_len); if (error) return error; @@ -271,7 +271,7 @@ int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct kernfs_node *kn = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); diff --git a/fs/libfs.c b/fs/libfs.c index 0ab65122ee45..cb1fb4b9b637 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -22,13 +22,13 @@ static inline int simple_positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); return 0; @@ -94,7 +94,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); switch (whence) { case 1: offset += file->f_pos; @@ -102,7 +102,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -129,7 +129,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -169,7 +169,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) spin_unlock(&next->d_lock); spin_unlock(&dentry->d_lock); if (!dir_emit(ctx, next->d_name.name, next->d_name.len, - next->d_inode->i_ino, dt_type(next->d_inode))) + d_inode(next)->i_ino, dt_type(d_inode(next)))) return 0; spin_lock(&dentry->d_lock); spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); @@ -270,7 +270,7 @@ EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); @@ -304,7 +304,7 @@ EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; drop_nlink(inode); @@ -318,7 +318,7 @@ int simple_rmdir(struct inode *dir, struct dentry 
*dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; @@ -328,16 +328,16 @@ EXPORT_SYMBOL(simple_rmdir); int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { - drop_nlink(new_dentry->d_inode); + drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { @@ -368,7 +368,7 @@ EXPORT_SYMBOL(simple_rename); */ int simple_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, iattr); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 665ef5a05183..a563ddbc19e6 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -31,7 +31,7 @@ static struct hlist_head nlm_files[FILE_NRHASH]; static DEFINE_MUTEX(nlm_file_mutex); -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) { u32 *fhp = (u32*)f->data; diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 6bdc347008f5..4cf38f118549 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -213,7 +213,7 @@ static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) static int logfs_unlink(struct inode *dir, struct dentry *dentry) { struct logfs_super *super = logfs_super(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct logfs_transaction *ta; struct page *page; pgoff_t index; @@ -271,7 +271,7 @@ static inline int logfs_empty_dir(struct inode *dir) static int logfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!logfs_empty_dir(inode)) return -ENOTEMPTY; @@ -537,7 +537,7 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ihold(inode); @@ -607,7 +607,7 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, /* 2. 
write target dd */ mutex_lock(&super->s_dirop_mutex); logfs_add_transaction(new_dir, ta); - err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry)); if (!err) err = write_inode(new_dir); @@ -658,8 +658,8 @@ static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct logfs_super *super = logfs_super(old_dir->i_sb); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); int isdir = S_ISDIR(old_inode->i_mode); struct logfs_disk_dentry dd; struct logfs_transaction *ta; @@ -719,7 +719,7 @@ out: static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - if (new_dentry->d_inode) + if (d_really_is_positive(new_dentry)) return logfs_rename_target(old_dir, old_dentry, new_dir, new_dentry); return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/logfs/file.c b/fs/logfs/file.c index b2c13f739ffa..1a6f0167b16a 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -241,7 +241,7 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int logfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = 0; err = inode_change_ok(inode, attr); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index dfaf6fa9b7b5..118e4e7bc935 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -156,7 +156,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct inode * dir = dentry->d_parent->d_inode; + struct inode * dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; @@ -203,7 +203,7 @@ found: int minix_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; diff --git a/fs/minix/file.c b/fs/minix/file.c index 6d63e27ec961..94f0eb9a6e2c 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -23,7 +23,7 @@ const struct file_operations minix_file_operations = { static int minix_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 3f57af196a7d..1182d1e26a9c 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -626,8 +626,8 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct super_block *sb = dentry->d_sb; - generic_fillattr(dentry->d_inode, stat); - if (INODE_VERSION(dentry->d_inode) == MINIX_V1) + generic_fillattr(d_inode(dentry), stat); + if (INODE_VERSION(d_inode(dentry)) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index cd950e2331b6..a795a11e50c7 100644 --- 
a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -104,7 +104,7 @@ out_fail: static int minix_link(struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -151,7 +151,7 @@ out_dir: static int minix_unlink(struct inode * dir, struct dentry *dentry) { int err = -ENOENT; - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct page * page; struct minix_dir_entry * de; @@ -171,7 +171,7 @@ end_unlink: static int minix_rmdir(struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (minix_empty_dir(inode)) { @@ -187,8 +187,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir, struct dentry *new_dentry) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct minix_dir_entry * dir_de = NULL; struct page * old_page; diff --git a/fs/namei.c b/fs/namei.c index ffab2e06e147..4a8d998b7274 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1590,7 +1590,8 @@ static inline int walk_component(struct nameidata *nd, struct path *path, if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { + if (unlikely(nd->path.mnt != path->mnt || + unlazy_walk(nd, path->dentry))) { err = -ECHILD; goto out_err; } @@ -3045,7 +3046,8 @@ finish_lookup: if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { + if (unlikely(nd->path.mnt != path->mnt || + unlazy_walk(nd, path->dentry))) { error = -ECHILD; goto out; } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index e7ca827d7694..80021c709af9 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -127,7 +127,7 @@ static inline int ncp_case_sensitive(const struct inode *i) static int ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) { - struct inode *inode = ACCESS_ONCE(dentry->d_inode); + struct inode *inode = d_inode_rcu(dentry); if (!inode) return 0; @@ -162,7 +162,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, if (len != name->len) return 1; - pinode = ACCESS_ONCE(parent->d_inode); + pinode = d_inode_rcu(parent); if (!pinode) return 1; @@ -180,7 +180,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, static int ncp_delete_dentry(const struct dentry * dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) @@ -224,7 +224,7 @@ ncp_force_unlink(struct inode *dir, struct dentry* dentry) memset(&info, 0, sizeof(info)); /* remove the Read-Only flag on the NW server */ - inode = dentry->d_inode; + inode = d_inode(dentry); old_nwattr = NCP_FINFO(inode)->nwattr; info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT); @@ -254,7 +254,7 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na { struct nw_modify_dos_info info; int res=0x90,res2; - struct inode *old_inode = old_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); __le32 old_nwattr = 
NCP_FINFO(old_inode)->nwattr; __le32 new_nwattr = 0; /* shut compiler warning */ int old_nwattr_changed = 0; @@ -268,8 +268,8 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); if (!res2) old_nwattr_changed = 1; - if (new_dentry && new_dentry->d_inode) { - new_nwattr = NCP_FINFO(new_dentry->d_inode)->nwattr; + if (new_dentry && d_really_is_positive(new_dentry)) { + new_nwattr = NCP_FINFO(d_inode(new_dentry))->nwattr; info.attributes = new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); if (!res2) @@ -324,9 +324,9 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) return -ECHILD; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto finished; server = NCP_SERVER(dir); @@ -367,7 +367,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) * what we remember, it's not valid any more. */ if (!res) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mutex_lock(&inode->i_mutex); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { @@ -388,7 +388,7 @@ finished: static time_t ncp_obtain_mtime(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ncp_server *server = NCP_SERVER(inode); struct nw_info_struct i; @@ -404,7 +404,7 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) static inline void ncp_invalidate_dircache_entries(struct dentry *parent) { - struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct ncp_server *server = NCP_SERVER(d_inode(parent)); struct dentry *dentry; spin_lock(&parent->d_lock); @@ -418,7 +418,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) static int ncp_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct page *page = NULL; struct ncp_server *server = NCP_SERVER(inode); union ncp_dir_cache *cache = NULL; @@ -491,13 +491,13 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) goto invalid_cache; } spin_unlock(&dentry->d_lock); - if (!dent->d_inode) { + if (d_really_is_negative(dent)) { dput(dent); goto invalid_cache; } over = !dir_emit(ctx, dent->d_name.name, dent->d_name.len, - dent->d_inode->i_ino, DT_UNKNOWN); + d_inode(dent)->i_ino, DT_UNKNOWN); dput(dent); if (over) goto finished; @@ -571,7 +571,7 @@ static void ncp_d_prune(struct dentry *dentry) { if (!dentry->d_fsdata) /* not referenced from page cache */ return; - NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(d_inode(dentry->d_parent))->flags &= ~NCPI_DIR_CACHE; } static int @@ -580,7 +580,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, int inval_childs) { struct dentry *newdent, *dentry = file->f_path.dentry; - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct ncp_cache_control ctl = *ctrl; struct qstr qname; int valid = 0; @@ -621,7 +621,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, dentry_update_name_case(newdent, &qname); } - if (!newdent->d_inode) { + if (d_really_is_negative(newdent)) { struct inode *inode; entry->opened = 0; @@ -637,7 +637,7 @@ ncp_fill_cache(struct file 
*file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); } } else { - struct inode *inode = newdent->d_inode; + struct inode *inode = d_inode(newdent); mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); @@ -659,10 +659,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, ctl.cache = kmap(ctl.page); } if (ctl.cache) { - if (newdent->d_inode) { + if (d_really_is_positive(newdent)) { newdent->d_fsdata = newdent; ctl.cache->dentry[ctl.idx] = newdent; - ino = newdent->d_inode->i_ino; + ino = d_inode(newdent)->i_ino; ncp_new_dentry(newdent); } valid = 1; @@ -807,7 +807,7 @@ int ncp_conn_logged_in(struct super_block *sb) } dent = sb->s_root; if (dent) { - struct inode* ino = dent->d_inode; + struct inode* ino = d_inode(dent); if (ino) { ncp_update_known_namespace(server, volNumber, NULL); NCP_FINFO(ino)->volNumber = volNumber; @@ -815,7 +815,7 @@ int ncp_conn_logged_in(struct super_block *sb) NCP_FINFO(ino)->DosDirNum = DosDirNum; result = 0; } else { - ncp_dbg(1, "sb->s_root->d_inode == NULL!\n"); + ncp_dbg(1, "d_inode(sb->s_root) == NULL!\n"); } } else { ncp_dbg(1, "sb->s_root == NULL!\n"); @@ -1055,7 +1055,7 @@ out: static int ncp_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ncp_server *server; int error; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 01a9e16e9782..9605a2f63549 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -812,7 +812,7 @@ static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) if (!d) { goto dflt; } - i = d->d_inode; + i = d_inode(d); if (!i) { goto dflt; } @@ -865,7 +865,7 @@ dflt:; int ncp_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int result = 0; __le32 info_mask; struct nw_modify_dos_info info; @@ -878,7 +878,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) goto out; result = -EPERM; - if (IS_DEADDIR(dentry->d_inode)) + if (IS_DEADDIR(d_inode(dentry))) goto out; /* ageing the dentry to force validation */ diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index cf7e043a9447..79b113048eac 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -376,7 +376,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg struct dentry* dentry = inode->i_sb->s_root; if (dentry) { - struct inode* s_inode = dentry->d_inode; + struct inode* s_inode = d_inode(dentry); if (s_inode) { sr.volNumber = NCP_FINFO(s_inode)->volNumber; @@ -384,7 +384,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg sr.namespace = server->name_space[sr.volNumber]; result = 0; } else - ncp_dbg(1, "s_root->d_inode==NULL\n"); + ncp_dbg(1, "d_inode(s_root)==NULL\n"); } else ncp_dbg(1, "s_root==NULL\n"); } else { @@ -431,7 +431,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg if (result == 0) { dentry = inode->i_sb->s_root; if (dentry) { - struct inode* s_inode = dentry->d_inode; + struct inode* s_inode = d_inode(dentry); if (s_inode) { NCP_FINFO(s_inode)->volNumber = vnum; @@ -439,7 +439,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg NCP_FINFO(s_inode)->DosDirNum = dosde; server->root_setuped = 1; } else { - ncp_dbg(1, "s_root->d_inode==NULL\n"); + ncp_dbg(1, "d_inode(s_root)==NULL\n"); result = -EIO; } } else { diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 
2b502a0d7941..88dbbc9fcf4d 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -727,7 +727,7 @@ int ncp_del_file_or_subdir2(struct ncp_server *server, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); __u8 volnum; __le32 dirent; diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 1a63bfdb4a65..421b6f91e8ec 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -156,7 +156,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { goto failfree; } - inode=dentry->d_inode; + inode=d_inode(dentry); if (ncp_make_open(inode, O_WRONLY)) goto failfree; diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 1e987acf20c9..8664417955a2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o CFLAGS_nfs4trace.o += -I$(src) nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ - delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ + delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ dns_resolve.o nfs4trace.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1cac3c175d18..d2554fe140a3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, + .sync = pnfs_generic_sync, }; static int __init nfs4blocklayout_init(void) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 5aed4f98df41..e535599a0719 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -33,7 +33,7 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d) container_of(d, struct pnfs_block_dev, node); bl_free_device(dev); - kfree(dev); + kfree_rcu(dev, node.rcu); } static int diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 351be9205bf8..8d129bb7355a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - /* schedule_timeout to game the hung task watchdog */ - schedule_timeout(60 * HZ); + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } + flush_signals(current); } return 0; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 19874151e95c..892aefff3630 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -31,7 +31,6 @@ #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a6ad68865880..029d688a969f 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct if (freeme == NULL) goto out; } - list_add_rcu(&delegation->super_list, &server->delegations); + 
list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) delegation = nfs_inode_detach_delegation(inode); if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); + nfs_do_return_delegation(inode, delegation, 1); } /** diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c19e16f0b2d0..b2c8b31b2be7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -416,15 +416,14 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { struct nfs_inode *nfsi; - if (dentry->d_inode == NULL) - goto different; + if (d_really_is_negative(dentry)) + return 0; - nfsi = NFS_I(dentry->d_inode); + nfsi = NFS_I(d_inode(dentry)); if (entry->fattr->fileid == nfsi->fileid) return 1; if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) return 1; -different: return 0; } @@ -473,7 +472,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct qstr filename = QSTR_INIT(entry->name, entry->len); struct dentry *dentry; struct dentry *alias; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; int status; @@ -497,9 +496,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) goto out; if (nfs_same_file(dentry, entry)) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - status = nfs_refresh_inode(dentry->d_inode, entry->fattr); + status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) - nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label); + nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label); goto out; } else { d_invalidate(dentry); @@ -544,6 +543,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (scratch == NULL) return -ENOMEM; + if (buflen == 0) + goto out_nopages; + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -565,6 +567,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); +out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { @@ -870,7 +873,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir) static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; @@ -1118,15 +1121,15 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) { parent = ACCESS_ONCE(dentry->d_parent); - dir = ACCESS_ONCE(parent->d_inode); + dir = d_inode_rcu(parent); if (!dir) return -ECHILD; } else { parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) { if (nfs_neg_need_reval(dir, dentry, flags)) { @@ -1242,7 +1245,7 @@ out_error: } /* - * A weaker form of d_revalidate for revalidating just the dentry->d_inode + * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. 
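The fs/nfs/dir.c hunks here follow the conversion applied throughout this series: raw ->d_inode dereferences become d_inode(dentry), and positive/negative tests become d_really_is_positive()/d_really_is_negative(), so every inode access is funnelled through one accessor. A minimal sketch of the pattern, assuming only the helpers from the patched tree; example_setattr and its body are invented for illustration:

	#include <linux/dcache.h>
	#include <linux/errno.h>
	#include <linux/fs.h>

	static int example_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode;

		if (d_really_is_negative(dentry))	/* was: if (!dentry->d_inode) */
			return -ENOENT;

		inode = d_inode(dentry);		/* was: inode = dentry->d_inode */
		return inode_change_ok(inode, attr);
	}

Routing every access through d_inode() means layered/union filesystems can later change what the accessor returns without another tree-wide sweep.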
This is called when a * pathwalk ends on a dentry that was not found via a normal lookup in the * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). @@ -1253,7 +1256,7 @@ out_error: static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); /* * I believe we can only get a negative dentry here in the case of a @@ -1287,7 +1290,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) dentry, dentry->d_flags); /* Unhash any dentry with a stale inode */ - if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) + if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry))) return 1; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { @@ -1491,7 +1494,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, int err; /* Expect a negative dentry */ - BUG_ON(dentry->d_inode); + BUG_ON(d_inode(dentry)); dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -1587,7 +1590,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) goto no_open; - inode = dentry->d_inode; + inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. @@ -1598,12 +1601,12 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) { parent = ACCESS_ONCE(dentry->d_parent); - dir = ACCESS_ONCE(parent->d_inode); + dir = d_inode_rcu(parent); if (!dir) return -ECHILD; } else { parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); } if (!nfs_neg_need_reval(dir, dentry, flags)) ret = 1; @@ -1643,14 +1646,14 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; int error = -EACCES; d_drop(dentry); /* We may have been initialized further down */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out; if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); @@ -1768,7 +1771,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { - if (dentry->d_inode != NULL && !d_unhashed(dentry)) + if (d_really_is_positive(dentry) && !d_unhashed(dentry)) d_delete(dentry); } @@ -1780,13 +1783,13 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { nfs_wait_on_sillyrename(dentry); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ switch (error) { case 0: - clear_nlink(dentry->d_inode); + clear_nlink(d_inode(dentry)); break; case -ENOENT: nfs_dentry_handle_enoent(dentry); @@ -1808,8 +1811,8 @@ EXPORT_SYMBOL_GPL(nfs_rmdir); */ static int nfs_safe_remove(struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = dentry->d_inode; + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = d_inode(dentry); int error = -EBUSY; dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry); @@ -1853,7 +1856,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) if (d_count(dentry) > 1) { spin_unlock(&dentry->d_lock); /* Start asynchronous 
writeout of the inode */ - write_inode_now(dentry->d_inode, 0); + write_inode_now(d_inode(dentry), 0); error = nfs_sillyrename(dir, dentry); goto out; } @@ -1931,7 +1934,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0, GFP_KERNEL)) { SetPageUptodate(page); unlock_page(page); @@ -1950,7 +1953,7 @@ EXPORT_SYMBOL_GPL(nfs_symlink); int nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", @@ -1997,8 +2000,8 @@ EXPORT_SYMBOL_GPL(nfs_link); int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 682f65fe09b5..38678d9a5cc4 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -129,22 +129,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; - WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count); - - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (dreq->mirror_count == 1) { + dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; + dreq->count += hdr->good_bytes; + } else { + /* mirrored writes */ + count = dreq->mirrors[hdr->pgio_mirror_idx].count; + if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { + count = hdr->io_start + hdr->good_bytes - dreq->io_start; + dreq->mirrors[hdr->pgio_mirror_idx].count = count; + } + /* update the dreq->count by finding the minimum agreed count from all + * mirrors */ + count = dreq->mirrors[0].count; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + for (i = 1; i < dreq->mirror_count; i++) + count = min(count, dreq->mirrors[i].count); - dreq->count = count; + dreq->count = count; + } } /* @@ -258,18 +261,11 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) if (!IS_SWAPFILE(inode)) return 0; -#ifndef CONFIG_NFS_SWAP - dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, iter->nr_segs); - - return -EINVAL; -#else VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) return nfs_file_direct_read(iocb, iter, pos); return nfs_file_direct_write(iocb, iter); -#endif /* CONFIG_NFS_SWAP */ } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -386,7 +382,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) if (write) nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); + inode_dio_end(inode); if (dreq->iocb) { long res = (long) 
dreq->error; @@ -403,8 +399,8 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) static void nfs_direct_readpage_release(struct nfs_page *req) { dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - req->wb_context->dentry->d_inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), + d_inode(req->wb_context->dentry)->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -486,7 +482,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); while (iov_iter_count(iter)) { struct page **pagevec; @@ -538,7 +534,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { - inode_dio_done(inode); + inode_dio_end(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } @@ -872,7 +868,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); NFS_I(inode)->write_io += iov_iter_count(iter); while (iov_iter_count(iter)) { @@ -928,7 +924,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { - inode_dio_done(inode); + inode_dio_end(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } @@ -1030,6 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); + generic_write_sync(file, pos, result); } } nfs_direct_req_release(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c40e4363e746..8b8d83a526ce 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -280,6 +280,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -782,7 +783,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * Flush all pending writes before doing anything * with locks.. 
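The direct-I/O hunks above convert the open-coded atomic_inc(&inode->i_dio_count) / inode_dio_done() pairs to the named helpers inode_dio_begin() / inode_dio_end(), and nfs_file_fsync() now calls nfs_inode_dio_wait() so that in-flight direct I/O drains before the range is flushed. The sketch below is a minimal standalone userspace model of that begin/end/wait protocol, not kernel code; dio_gate and its functions are invented names, and a mutex plus condition variable stand in for the kernel's atomic counter and wait queue.

/* dio_gate.c - standalone model of an inode_dio_begin()/end()/wait()
 * style gate; build with: cc -pthread dio_gate.c */
#include <pthread.h>
#include <stdio.h>

struct dio_gate {
	pthread_mutex_t lock;
	pthread_cond_t drained;
	unsigned int in_flight;		/* models inode->i_dio_count */
};

#define DIO_GATE_INIT { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

static void dio_begin(struct dio_gate *g)	/* cf. inode_dio_begin() */
{
	pthread_mutex_lock(&g->lock);
	g->in_flight++;
	pthread_mutex_unlock(&g->lock);
}

static void dio_end(struct dio_gate *g)	/* cf. inode_dio_end() */
{
	pthread_mutex_lock(&g->lock);
	if (--g->in_flight == 0)
		pthread_cond_broadcast(&g->drained);
	pthread_mutex_unlock(&g->lock);
}

static void dio_wait(struct dio_gate *g)	/* cf. nfs_inode_dio_wait() */
{
	pthread_mutex_lock(&g->lock);
	while (g->in_flight != 0)
		pthread_cond_wait(&g->drained, &g->lock);
	pthread_mutex_unlock(&g->lock);
}

int main(void)
{
	struct dio_gate gate = DIO_GATE_INIT;

	dio_begin(&gate);	/* a direct write is submitted... */
	dio_end(&gate);		/* ...and completes */
	dio_wait(&gate);	/* an fsync-style caller now sees no I/O in flight */
	puts("drained");
	return 0;
}

The kernel variant keeps the fast path lock-free with an atomic counter and only touches the wait queue when the count reaches zero; the model above trades that optimization for brevity.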
*/ - nfs_sync_mapping(filp->f_mapping); + vfs_fsync(filp, 0); l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 91e88a7ecef0..a46bf6de9ce4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -258,7 +258,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) hdr->res.verf->committed != NFS_DATA_SYNC) return; - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -373,7 +374,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1086,7 +1087,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server, } static void -filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +filelayout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); } @@ -1137,7 +1138,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, - .free_deviceid_node = filelayout_free_deveiceid_node, + .free_deviceid_node = filelayout_free_deviceid_node, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4f372e224603..4946ef40ba87 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) nfs4_pnfs_ds_put(ds); } kfree(dsaddr->stripe_indices); - kfree(dsaddr); + kfree_rcu(dsaddr, id_node.rcu); } /* Decode opaque device data and return the result */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 315cc68945b9..7d05089e52d6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -11,10 +11,10 @@ #include <linux/module.h> #include <linux/sunrpc/metrics.h> -#include <linux/nfs_idmap.h> #include "flexfilelayout.h" #include "../nfs4session.h" +#include "../nfs4idmap.h" #include "../internal.h" #include "../delegation.h" #include "../nfs4trace.h" @@ -891,7 +891,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -1074,7 +1075,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1414,7 +1415,7 @@ ff_layout_get_ds_info(struct inode *inode) } static void -ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d) +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, id_node)); @@ -1498,7 +1499,7 @@ static struct 
pnfs_layoutdriver_type flexfilelayout_type = { .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, - .free_deviceid_node = ff_layout_free_deveiceid_node, + .free_deviceid_node = ff_layout_free_deviceid_node, .mark_request_commit = pnfs_layout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, @@ -1508,6 +1509,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e2c01f204a95..77a2d026aa12 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); - kfree(mirror_ds); + kfree_rcu(mirror_ds, id_node.rcu); } /* Decode opaque device data and construct new_ds using it */ diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 9ac3846cb59e..a608ffd28acc 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -56,11 +56,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. */ - spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&d_inode(sb->s_root)->i_lock); spin_lock(&sb->s_root->d_lock); hlist_del_init(&sb->s_root->d_u.d_alias); spin_unlock(&sb->s_root->d_lock); - spin_unlock(&sb->s_root->d_inode->i_lock); + spin_unlock(&d_inode(sb->s_root)->i_lock); } return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d42dff6d5e98..f734562c6d24 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -133,6 +133,13 @@ void nfs_evict_inode(struct inode *inode) nfs_clear_inode(inode); } +int nfs_sync_inode(struct inode *inode) +{ + nfs_inode_dio_wait(inode); + return nfs_wb_all(inode); +} +EXPORT_SYMBOL_GPL(nfs_sync_inode); + /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk */ @@ -192,7 +199,6 @@ void nfs_zap_caches(struct inode *inode) nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } -EXPORT_SYMBOL_GPL(nfs_zap_caches); void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { @@ -495,7 +501,7 @@ EXPORT_SYMBOL_GPL(nfs_fhget); int nfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; int error = -ENOMEM; @@ -525,10 +531,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - nfs_wb_all(inode); - } + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -621,7 +625,7 @@ static void nfs_request_parent_use_readdirplus(struct dentry *dentry) struct dentry *parent; parent = dget_parent(dentry); - nfs_force_use_readdirplus(parent->d_inode); + nfs_force_use_readdirplus(d_inode(parent)); dput(parent); } @@ -637,15 +641,16 @@ static bool nfs_need_revalidate_inode(struct inode *inode) int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 
struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err = 0; trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - err = filemap_write_and_wait(inode->i_mapping); + mutex_lock(&inode->i_mutex); + err = nfs_sync_inode(inode); + mutex_unlock(&inode->i_mutex); if (err) goto out; } @@ -708,7 +713,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *res, *new = NULL; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); @@ -736,7 +741,7 @@ EXPORT_SYMBOL_GPL(nfs_get_lock_context); void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { struct nfs_open_context *ctx = l_ctx->open_context; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; @@ -763,7 +768,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) return; if (!is_sync) return; - inode = ctx->dentry->d_inode; + inode = d_inode(ctx->dentry); if (!list_empty(&NFS_I(inode)->open_files)) return; server = NFS_SERVER(inode); @@ -810,7 +815,7 @@ EXPORT_SYMBOL_GPL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { @@ -842,7 +847,7 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context); */ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); @@ -885,7 +890,7 @@ static void nfs_file_clear_open_context(struct file *filp) struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); filp->private_data = NULL; spin_lock(&inode->i_lock); @@ -1588,6 +1593,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); + +static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + bool ret1 = true, ret2 = true; + + if (fattr->valid & NFS_ATTR_FATTR_FILEID) + ret1 = (nfsi->fileid == fattr->fileid); + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + ret2 = (nfsi->fileid == fattr->mounted_on_fileid); + return ret1 || ret2; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. 
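nfs_fileid_valid(), added just above, loosens the sanity check in nfs_update_inode(): a reply now passes if either the fileid or the mounted_on_fileid it carries matches the cached fileid, and an attribute the server did not return cannot cause a mismatch. Below is a small userspace model of that predicate; fattr_sketch and the ATTR_* flags are stand-ins for struct nfs_fattr and the NFS_ATTR_FATTR_* bits.

/* fileid_check.c - userspace sketch of the nfs_fileid_valid() test */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ATTR_FILEID            0x1
#define ATTR_MOUNTED_ON_FILEID 0x2

struct fattr_sketch {
	unsigned int valid;		/* which fields the server returned */
	uint64_t fileid;
	uint64_t mounted_on_fileid;
};

/* Accept the reply if any fileid the server did return matches the
 * cached one; a field the server omitted cannot cause a mismatch. */
static bool fileid_valid(uint64_t cached, const struct fattr_sketch *f)
{
	bool ok_fileid = true, ok_mounted = true;

	if (f->valid & ATTR_FILEID)
		ok_fileid = (cached == f->fileid);
	if (f->valid & ATTR_MOUNTED_ON_FILEID)
		ok_mounted = (cached == f->mounted_on_fileid);
	return ok_fileid || ok_mounted;
}

int main(void)
{
	struct fattr_sketch f = {
		.valid = ATTR_FILEID | ATTR_MOUNTED_ON_FILEID,
		.fileid = 7,			/* differs across the mountpoint */
		.mounted_on_fileid = 42,	/* still what we cached */
	};

	printf("%d\n", fileid_valid(42, &f));	/* prints 1: accepted */
	return 0;
}

The OR presumably matters at submount and referral points, where the server can report a different fileid for the object while the mounted-on fileid still matches; with the old test such replies tripped the "fileid changed" error below.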
Here we update the inode to reflect the state @@ -1614,7 +1632,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { + if (!nfs_fileid_valid(nfsi, fattr)) { printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, @@ -1819,7 +1837,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index b5a0afc3ee10..c8162c660c44 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -139,7 +139,7 @@ EXPORT_SYMBOL_GPL(nfs_path); struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); + struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; @@ -180,16 +180,16 @@ out_nofree: static int nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - if (NFS_FH(dentry->d_inode)->size != 0) + if (NFS_FH(d_inode(dentry))->size != 0) return nfs_getattr(mnt, dentry, stat); - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } static int nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) { - if (NFS_FH(dentry->d_inode)->size != 0) + if (NFS_FH(d_inode(dentry))->size != 0) return nfs_setattr(dentry, attr); return -EACCES; } @@ -279,7 +279,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, struct dentry *parent = dget_parent(dentry); /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL); + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), &dentry->d_name, fh, fattr, NULL); dput(parent); if (err != 0) return ERR_PTR(err); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 658e586ca438..1ebe2fc7cda2 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -279,7 +279,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, ssize_t nfs3_listxattr(struct dentry *dentry, char *data, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); ssize_t result = 0; int error; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1f11d2533ee4..cb28cceefebe 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -120,7 +120,7 @@ static int nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs3_sattrargs arg = { .fh = NFS_FH(inode), .sattr = sattr, @@ -386,13 +386,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); - nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); + nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", 
status); if (status != 0) goto out_release_acls; } - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); @@ -570,7 +570,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) if (status != 0) goto out_release_acls; - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); @@ -623,7 +623,7 @@ static int nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); __be32 *verf = NFS_I(dir)->cookieverf; struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), @@ -715,7 +715,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index cb170722769c..3a9e75235f30 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -36,13 +36,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, + .falloc_bitmask = server->cache_consistency_bitmask, + }; + struct nfs42_falloc_res res = { + .falloc_server = server, }; - struct nfs42_falloc_res res; - struct nfs_server *server = NFS_SERVER(inode); int status; msg->rpc_argp = &args; @@ -52,8 +55,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status) return status; - return nfs4_call_sync(server->client, server, msg, - &args.seq_args, &res.seq_res, 0); + res.falloc_fattr = nfs_alloc_fattr(); + if (!res.falloc_fattr) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); + if (status == 0) + status = nfs_post_op_update_inode(inode, res.falloc_fattr); + + kfree(res.falloc_fattr); + return status; } static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, @@ -84,9 +96,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } @@ -101,9 +117,16 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; + nfs_wb_all(inode); + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 038a7e1521fa..1a25b27248f2 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -25,16 +25,20 @@ #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + 
\ - encode_allocate_maxsz) + encode_allocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_allocate_maxsz) + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_deallocate_maxsz) + encode_deallocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_deallocate_maxsz) + decode_deallocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) @@ -92,6 +96,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -110,6 +115,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_deallocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -183,6 +189,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } @@ -207,6 +216,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_deallocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86d6214ea022..e42be52a8c18 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -4,7 +4,6 @@ */ #include <linux/module.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/auth.h> @@ -15,6 +14,7 @@ #include "callback.h" #include "delegation.h" #include "nfs4session.h" +#include "nfs4idmap.h" #include "pnfs.h" #include "netns.h" @@ -1130,7 +1130,7 @@ error: */ static int nfs_probe_destination(struct nfs_server *server) { - struct inode *inode = server->super->s_root->d_inode; + struct inode *inode = d_inode(server->super->s_root); struct nfs_fattr *fattr; int error; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 0181cde1d102..f58c17b3b480 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,8 @@ #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" + #ifdef CONFIG_NFS_V4_2 #include "nfs42.h" #endif @@ -46,7 +48,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) openflags &= ~(O_CREAT|O_EXCL); parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); err = PTR_ERR(ctx); @@ -57,7 +59,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_wb_all(inode); + nfs_sync_inode(inode); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); @@ -74,7 +76,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) goto out_drop; } } - if (inode != dentry->d_inode) + if (inode != d_inode(dentry)) goto out_drop; nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -100,6 +102,9 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t 
end, int datasync) int ret; struct inode *inode = file_inode(file); + trace_nfs_fsync_enter(inode); + + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -107,7 +112,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) - ret = pnfs_layoutcommit_inode(inode, true); + ret = pnfs_sync_inode(inode, !!datasync); mutex_unlock(&inode->i_mutex); /* * If nfs_file_fsync_commit detected a server reboot, then @@ -118,6 +123,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) end = LLONG_MAX; } while (ret == -EAGAIN); + trace_nfs_fsync_exit(inode, ret); return ret; } @@ -152,15 +158,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) - ret = nfs42_proc_deallocate(filep, offset, len); - else - ret = nfs42_proc_allocate(filep, offset, len); - mutex_unlock(&inode->i_mutex); - - nfs_zap_caches(inode); - return ret; + return nfs42_proc_deallocate(filep, offset, len); + return nfs42_proc_allocate(filep, offset, len); } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/nfs4idmap.c index 857e2a99acc8..2e1737c40a29 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -36,7 +36,6 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> -#include <linux/nfs_idmap.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> @@ -49,6 +48,7 @@ #include "internal.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 diff --git a/include/linux/nfs_idmap.h b/fs/nfs/nfs4idmap.h index 333844e38f66..de44d7330ab3 100644 --- a/include/linux/nfs_idmap.h +++ b/fs/nfs/nfs4idmap.h @@ -1,5 +1,5 @@ /* - * include/linux/nfs_idmap.h + * fs/nfs/nfs4idmap.h * * UID and GID to name mapping for clients. 
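The hunks here move the idmapper header from include/linux/nfs_idmap.h into fs/nfs/nfs4idmap.h, making it private to the NFSv4 code (the now-superfluous !CONFIG_NFS_V4 stubs are dropped in the next hunk). Purely as an illustration of the uid-to-name direction of such a mapper, here is a self-contained userspace sketch with a small memoizing cache in front of the passwd database; idmap_entry and uid_to_name are invented names, and the real client resolves names through the keyring upcall or the rpc.idmapd pipe rather than getpwuid_r().

/* idmap_cache.c - standalone illustration, not the kernel idmapper */
#include <pwd.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

#define CACHE_SLOTS 16

struct idmap_entry {
	uid_t uid;
	char name[64];
	int used;
};

static struct idmap_entry cache[CACHE_SLOTS];

/* Map a uid to a user name, remembering the answer in a tiny
 * direct-mapped cache so repeated lookups skip the passwd database. */
static const char *uid_to_name(uid_t uid)
{
	struct idmap_entry *e = &cache[uid % CACHE_SLOTS];
	struct passwd pwd, *result;
	char buf[1024];

	if (e->used && e->uid == uid)
		return e->name;
	if (getpwuid_r(uid, &pwd, buf, sizeof(buf), &result) != 0 ||
	    result == NULL)
		return NULL;
	snprintf(e->name, sizeof(e->name), "%s", pwd.pw_name);
	e->uid = uid;
	e->used = 1;
	return e->name;
}

int main(void)
{
	const char *name = uid_to_name(0);

	printf("uid 0 -> %s\n", name ? name : "(unknown)");
	return 0;
}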
* @@ -46,19 +46,8 @@ struct nfs_server; struct nfs_fattr; struct nfs4_string; -#if IS_ENABLED(CONFIG_NFS_V4) int nfs_idmap_init(void); void nfs_idmap_quit(void); -#else -static inline int nfs_idmap_init(void) -{ - return 0; -} - -static inline void nfs_idmap_quit(void) -{} -#endif - int nfs_idmap_new(struct nfs_client *); void nfs_idmap_delete(struct nfs_client *); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3d83cb1fdc70..f592672373cb 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -375,7 +375,7 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * dprintk("%s: getting locations for %pd2\n", __func__, dentry); - err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, fs_locations, page); + err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); if (err != 0 || fs_locations->nlocations <= 0 || @@ -396,7 +396,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, { rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct qstr *name = &dentry->d_name; struct rpc_clnt *client; struct vfsmount *mnt; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 627f37c44456..45b35b9b1e36 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,7 +51,6 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/xattr.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -63,6 +62,7 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" @@ -185,7 +185,8 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY, + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID, #ifdef CONFIG_NFS_V4_SECURITY_LABEL FATTR4_WORD2_SECURITY_LABEL #endif @@ -293,7 +294,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode)); + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry))); } *p++ = xdr_one; /* next */ @@ -305,7 +306,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode)); + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -1004,7 +1005,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, gfp_t gfp_mask) { struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct nfs_server *server = NFS_SERVER(dir); struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); struct nfs4_opendata *p; @@ -1057,7 +1058,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - p->o_arg.fh = NFS_FH(dentry->d_inode); + p->o_arg.fh = 
NFS_FH(d_inode(dentry)); } if (attrs != NULL && attrs->ia_valid != 0) { __u32 verf[2]; @@ -1794,7 +1795,7 @@ static const struct rpc_call_ops nfs4_open_confirm_ops = { */ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) { - struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + struct nfs_server *server = NFS_SERVER(d_inode(data->dir)); struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], @@ -1951,7 +1952,7 @@ static const struct rpc_call_ops nfs4_open_ops = { static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); struct nfs_openargs *o_arg = &data->o_arg; struct nfs_openres *o_res = &data->o_res; @@ -1998,7 +1999,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) static int _nfs4_recover_proc_open(struct nfs4_opendata *data) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_openres *o_res = &data->o_res; int status; @@ -2067,7 +2068,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, */ static int _nfs4_proc_open(struct nfs4_opendata *data) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); struct nfs_openargs *o_arg = &data->o_arg; struct nfs_openres *o_res = &data->o_res; @@ -2314,7 +2315,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); dentry = opendata->dentry; - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { /* FIXME: Is this d_drop() ever needed? */ d_drop(dentry); dentry = d_add_unique(dentry, igrab(state->inode)); @@ -2325,7 +2326,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ctx->dentry = dget(dentry); } nfs_set_verifier(dentry, - nfs_save_change_attribute(opendata->dir->d_inode)); + nfs_save_change_attribute(d_inode(opendata->dir))); } ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); @@ -2333,7 +2334,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, goto out; ctx->state = state; - if (dentry->d_inode == state->inode) { + if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) nfs4_schedule_stateid_recovery(server, state); @@ -2374,10 +2375,10 @@ static int _nfs4_do_open(struct inode *dir, status = nfs4_recover_expired_lease(server); if (status != 0) goto err_put_state_owner; - if (dentry->d_inode != NULL) - nfs4_return_incompatible_delegation(dentry->d_inode, fmode); + if (d_really_is_positive(dentry)) + nfs4_return_incompatible_delegation(d_inode(dentry), fmode); status = -ENOMEM; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) claim = NFS4_OPEN_CLAIM_FH; opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, label, claim, GFP_KERNEL); @@ -2400,8 +2401,8 @@ static int _nfs4_do_open(struct inode *dir, } opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } - if (dentry->d_inode != NULL) - opendata->state = nfs4_get_open_state(dentry->d_inode, sp); + if (d_really_is_positive(dentry)) + opendata->state = nfs4_get_open_state(d_inode(dentry), sp); status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); if (status != 0) @@ -3095,16 +3096,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo 
			 *info, bool auth_probe)
 {
-	int status;
+	int status = 0;
 
-	switch (auth_probe) {
-	case false:
+	if (!auth_probe)
 		status = nfs4_lookup_root(server, fhandle, info);
-		if (status != -NFS4ERR_WRONGSEC)
-			break;
-	default:
+
+	if (auth_probe || status == -NFS4ERR_WRONGSEC)
 		status = nfs4_do_find_root_sec(server, fhandle, info);
-	}
 
 	if (status == 0)
 		status = nfs4_server_capabilities(server, fhandle);
@@ -3254,7 +3252,7 @@ static int
 nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		  struct iattr *sattr)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_inode(dentry);
 	struct rpc_cred *cred = NULL;
 	struct nfs4_state *state = NULL;
 	struct nfs4_label *label = NULL;
@@ -3871,13 +3869,13 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 		u64 cookie, struct page **pages, unsigned int count, int plus)
 {
-	struct inode *dir = dentry->d_inode;
+	struct inode *dir = d_inode(dentry);
 	struct nfs4_readdir_arg args = {
 		.fh = NFS_FH(dir),
 		.pages = pages,
 		.pgbase = 0,
 		.count = count,
-		.bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+		.bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask,
 		.plus = plus,
 	};
 	struct nfs4_readdir_res res;
@@ -3914,8 +3912,8 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 	do {
 		err = _nfs4_proc_readdir(dentry, cred, cookie,
 				pages, count, plus);
-		trace_nfs4_readdir(dentry->d_inode, err);
-		err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err,
+		trace_nfs4_readdir(d_inode(dentry), err);
+		err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err,
 				&exception);
 	} while (exception.retry);
 	return err;
@@ -4830,7 +4828,7 @@ nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
 	struct nfs4_label ilabel, *olabel = NULL;
 	struct nfs_fattr fattr;
 	struct rpc_cred *cred;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_inode(dentry);
 	int status;
 
 	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -5670,7 +5668,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 	data->rpc_status = task->tk_status;
 	switch (task->tk_status) {
 	case 0:
-		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
+		renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
 				data->timestamp);
 		if (data->arg.new_lock) {
 			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
@@ -6112,7 +6110,7 @@ static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
 	if (strcmp(key, "") != 0)
 		return -EINVAL;
 
-	return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
+	return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
 }
 
 static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
@@ -6121,7 +6119,7 @@ static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
 	if (strcmp(key, "") != 0)
 		return -EINVAL;
 
-	return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
+	return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
 }
 
 static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
@@ -6130,7 +6128,7 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
 {
 	size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
 
-	if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
+	if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
 		return 0;
 
 	if (list && len <= list_len)
@@ -6158,7 +6156,7 @@ static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
 				   void *buf, size_t buflen, int type)
 {
 	if (security_ismaclabel(key))
-		return
nfs4_get_security_label(dentry->d_inode, buf, buflen); + return nfs4_get_security_label(d_inode(dentry), buf, buflen); return -EOPNOTSUPP; } @@ -6168,10 +6166,10 @@ static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, { size_t len = 0; - if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) { - len = security_inode_listsecurity(dentry->d_inode, NULL, 0); + if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(d_inode(dentry), NULL, 0); if (list && len <= list_len) - security_inode_listsecurity(dentry->d_inode, list, len); + security_inode_listsecurity(d_inode(dentry), list, len); } return len; } @@ -7944,6 +7942,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, + .notify_types = NOTIFY_DEVICEID4_CHANGE | + NOTIFY_DEVICEID4_DELETE, }; struct nfs4_getdeviceinfo_res res = { .pdev = pdev, @@ -7958,6 +7958,11 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, dprintk("--> %s\n", __func__); status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (res.notification & ~args.notify_types) + dprintk("%s: unsupported notification\n", __func__); + if (res.notification != args.notify_types) + pdev->nocache = 1; + dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f95e3b58bbc3..2782cfca2265 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -42,7 +42,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> @@ -57,6 +56,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -1902,7 +1902,7 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) goto out; } - inode = server->super->s_root->d_inode; + inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(inode, locations, page, cred); if (result) { dprintk("<-- %s: failed to retrieve fs_locations: %d\n", @@ -2021,7 +2021,7 @@ restart: rcu_read_unlock(); - inode = server->super->s_root->d_inode; + inode = d_inode(server->super->s_root); status = nfs4_proc_fsid_present(inode, cred); if (status != -NFS4ERR_MOVED) goto restart; /* wasn't this one */ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 75090feeafad..6fb7cb6b3f4b 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -3,12 +3,12 @@ */ #include <linux/init.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/nfs4_mount.h> #include <linux/nfs_fs.h> #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" @@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! 
*/ nfs_inode_return_delegation_noreclaim(inode); + /* Note that above delegreturn would trigger pnfs return-on-close */ + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index b6ebe7e445f6..0fbd3ab1be22 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -6,10 +6,10 @@ * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com> */ #include <linux/sysctl.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "callback.h" static const int nfs_set_port_min = 0; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 1c32adbe728d..470af1a78bec 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -418,7 +418,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fileid = 0; __entry->fhandle = 0; } - __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode); + __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); __assign_str(name, ctx->dentry->d_name.name); ), @@ -1110,7 +1110,7 @@ TRACE_EVENT(nfs4_layoutget, ), TP_fast_assign( - const struct inode *inode = ctx->dentry->d_inode; + const struct inode *inode = d_inode(ctx->dentry); __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5c399ec41079..0aea97841d30 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,10 +52,10 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include "nfs4_fs.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -1920,7 +1920,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr, p = reserve_space(xdr, 4 + 4); *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); + *p++ = cpu_to_be32(args->notify_types); } static void @@ -5753,8 +5753,9 @@ out_overflow: #if defined(CONFIG_NFS_V4_1) static int decode_getdeviceinfo(struct xdr_stream *xdr, - struct pnfs_device *pdev) + struct nfs4_getdeviceinfo_res *res) { + struct pnfs_device *pdev = res->pdev; __be32 *p; uint32_t len, type; int status; @@ -5802,12 +5803,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (unlikely(!p)) goto out_overflow; - if (be32_to_cpup(p++) & - ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { - dprintk("%s: unsupported notification\n", - __func__); - } - + res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { if (be32_to_cpup(p++)) { dprintk("%s: unsupported notification\n", @@ -7061,7 +7057,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res); out: return status; } @@ -7365,6 +7361,11 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#define STUB(proc) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_name = #proc, \ +} + struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7417,6 +7418,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + STUB(GETDEVICELIST), PROC(BIND_CONN_TO_SESSION, 
enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index 4eb0aead69b6..c74f7af23d77 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -7,3 +7,6 @@ #define CREATE_TRACE_POINTS #include "nfstrace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 24e1d7403c0b..5aaed363556a 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) dprintk("%s: free od=%p\n", __func__, de->od.od); osduld_put_device(de->od.od); - kfree(de); + kfree_rcu(d, rcu); } struct objio_segment { @@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .pg_read_ops = &objio_pg_read_ops, .pg_write_ops = &objio_pg_write_ops, + .sync = pnfs_generic_sync, + .free_deviceid_node = objio_free_deviceid_node, .encode_layoutcommit = objlayout_encode_layoutcommit, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d57190a0d533..282b39369510 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -938,7 +938,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; - flctx = req->wb_context->dentry->d_inode->i_flctx; + flctx = d_inode(req->wb_context->dentry)->i_flctx; if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4f802b02fbb9..230606243be6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1090,6 +1090,7 @@ bool pnfs_roc(struct inode *ino) pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); return true; out_noroc: @@ -1104,8 +1105,10 @@ out_noroc: } } spin_unlock(&ino->i_lock); - if (layoutreturn) + if (layoutreturn) { + pnfs_layoutcommit_inode(ino, true); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + } return false; } @@ -1841,7 +1844,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); if (!hdr->pnfs_error) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); hdr->mds_ops->rpc_call_done(&hdr->task, hdr); } else pnfs_ld_handle_write_error(hdr); @@ -1902,7 +1906,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) @@ -2032,7 +2035,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) @@ -2099,64 +2101,34 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) +pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, + loff_t end_pos) { - struct inode *inode = hdr->inode; struct nfs_inode *nfsi = NFS_I(inode); - loff_t end_pos = hdr->mds_offset + hdr->res.count; bool mark_as_dirty = false; spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - 
mark_as_dirty = true; - dprintk("%s: Set layoutcommit for inode %lu ", - __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { - /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(hdr->lseg); - } - if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); - dprintk("%s: lseg %p end_pos %llu\n", - __func__, hdr->lseg, nfsi->layout->plh_lwb); - - /* if pnfs_layoutcommit_inode() runs between inode locks, the next one - * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ - if (mark_as_dirty) - mark_inode_dirty_sync(inode); -} -EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); - -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) -{ - struct inode *inode = data->inode; - struct nfs_inode *nfsi = NFS_I(inode); - bool mark_as_dirty = false; - - spin_lock(&inode->i_lock); - if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { + } else if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(data->lseg); + pnfs_get_lseg(lseg); } - if (data->lwb > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = data->lwb; spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", - __func__, data->lseg, nfsi->layout->plh_lwb); + __func__, lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ if (mark_as_dirty) mark_inode_dirty_sync(inode); } -EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { @@ -2216,7 +2188,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; - nfsi->layout->plh_lwb = 0; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); spin_unlock(&inode->i_lock); @@ -2233,11 +2204,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = ld->prepare_layoutcommit(&data->args); if (status) { spin_lock(&inode->i_lock); - if (end_pos < nfsi->layout->plh_lwb) + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; spin_unlock(&inode->i_lock); put_rpccred(data->cred); - set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); goto clear_layoutcommitting; } } @@ -2258,6 +2229,13 @@ clear_layoutcommitting: } EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); +int +pnfs_generic_sync(struct inode *inode, bool datasync) +{ + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_generic_sync); + struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 635f0865671c..1e6308f82fc3 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type { int how, struct nfs_commit_info *cinfo); + int (*sync)(struct inode *inode, bool datasync); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -203,6 +205,7 @@ struct pnfs_device { struct page **pages; unsigned int pgbase; unsigned int 
pglen; /* reply buffer length */ + unsigned char nocache : 1;/* May not be cached */ }; #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 @@ -263,10 +266,11 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_pgio_header *); -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data); +void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int pnfs_generic_sync(struct inode *inode, bool datasync); +int pnfs_nfs_generic_sync(struct inode *inode, bool datasync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); @@ -291,6 +295,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ + NFS_DEVICEID_NOCACHE, /* device may not be cached */ }; /* pnfs_dev.c */ @@ -302,6 +307,7 @@ struct nfs4_deviceid_node { unsigned long flags; unsigned long timestamp_unavailable; struct nfs4_deviceid deviceid; + struct rcu_head rcu; atomic_t ref; }; @@ -426,7 +432,7 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (lseg == NULL || ld->mark_request_commit == NULL) @@ -438,7 +444,7 @@ pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (ld == NULL || ld->clear_request_commit == NULL) @@ -486,6 +492,14 @@ pnfs_ld_read_whole_page(struct inode *inode) return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return 0; + return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync); +} + static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { @@ -568,6 +582,12 @@ pnfs_ld_read_whole_page(struct inode *inode) return false; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + return 0; +} + static inline bool pnfs_roc(struct inode *ino) { diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index aa2ec0015183..2961fcd7a2df 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server, */ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, gfp_flags); + if (d && pdev->nocache) + set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: for (i = 0; i < max_pages; i++) @@ -175,8 +177,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server, rcu_read_lock(); d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, hash); - if (d != NULL) - atomic_inc(&d->ref); + if (d != NULL && 
!atomic_inc_not_zero(&d->ref)) + d = NULL; rcu_read_unlock(); return d; } @@ -235,12 +237,11 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, return; } hlist_del_init_rcu(&d->node); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); /* balance the initial ref set in pnfs_insert_deviceid */ - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -271,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { + if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) { + if (atomic_add_unless(&d->ref, -1, 2)) + return false; + nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid); + } if (!atomic_dec_and_test(&d->ref)) return false; d->ld->free_deviceid_node(d); @@ -314,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); hlist_add_head(&d->tmpnode, &tmp); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); } rcu_read_unlock(); spin_unlock(&nfs4_deviceid_lock); @@ -321,12 +328,10 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (hlist_empty(&tmp)) return; - synchronize_rcu(); while (!hlist_empty(&tmp)) { d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); hlist_del(&d->tmpnode); - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 54e36b38fb5f..f37e25b6311c 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void) return(get_v3_ds_connect != NULL); } -void __exit nfs4_pnfs_v3_ds_connect_unload(void) +void nfs4_pnfs_v3_ds_connect_unload(void) { if (get_v3_ds_connect) { symbol_put(nfs3_set_ds_client); @@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, nfs_request_add_commit_list(req, list, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); + +int +pnfs_nfs_generic_sync(struct inode *inode, bool datasync) +{ + if (datasync) + return 0; + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync); + diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c63189acd052..b417bbcd9704 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -118,7 +118,7 @@ static int nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs_sattrargs arg = { .fh = NFS_FH(inode), .sattr = sattr @@ -487,7 +487,7 @@ static int nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b8f5c63f77b2..ae0ff7a11b40 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -117,7 +117,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, static void nfs_readpage_release(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), req->wb_bytes, @@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page 
*page) dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_inc_stats(inode, NFSIOS_READPAGES); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to flush any pending writes to the file.. diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 322b2de02988..f175b833b6ba 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -43,7 +43,6 @@ #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> @@ -433,7 +432,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct nfs_server *server = NFS_SB(dentry->d_sb); unsigned char blockbits; unsigned long blockres; - struct nfs_fh *fh = NFS_FH(dentry->d_inode); + struct nfs_fh *fh = NFS_FH(d_inode(dentry)); struct nfs_fsstat res; int error = -ENOMEM; @@ -447,7 +446,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) pd_dentry = dget_parent(dentry); if (pd_dentry != NULL) { - nfs_zap_caches(pd_dentry->d_inode); + nfs_zap_caches(d_inode(pd_dentry)); dput(pd_dentry); } } @@ -2193,7 +2192,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->version != nfss->nfs_client->rpc_ops->version || data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->selected_flavor != nfss->client->cl_auth->au_flavor || + !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2241,7 +2240,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; data->selected_flavor = nfss->client->cl_auth->au_flavor; - data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; @@ -2526,7 +2524,7 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) + if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); } diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 05c9e02f4153..2d56200655fe 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -45,7 +45,7 @@ error: static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct page *page; void *err; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index de54129336c6..fa538b2ba251 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -143,7 +143,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_free_dname(data); ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (ret == 0 && alias->d_inode != NULL && + if (ret == 0 && d_really_is_positive(alias) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; @@ -190,7 +190,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) parent = dget_parent(dentry); if (parent == NULL) goto out_free; - dir = parent->d_inode; + dir = d_inode(parent); /* 
Non-exclusive lock protects against concurrent lookup() calls */ spin_lock(&dir->i_lock); if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -210,21 +210,21 @@ out_free: void nfs_wait_on_sillyrename(struct dentry *dentry) { - struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); } void nfs_block_sillyrename(struct dentry *dentry) { - struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1); } void nfs_unblock_sillyrename(struct dentry *dentry) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs_inode *nfsi = NFS_I(dir); struct nfs_unlinkdata *data; @@ -367,8 +367,8 @@ static void nfs_async_rename_release(void *calldata) struct nfs_renamedata *data = calldata; struct super_block *sb = data->old_dir->i_sb; - if (data->old_dentry->d_inode) - nfs_mark_for_revalidate(data->old_dentry->d_inode); + if (d_really_is_positive(data->old_dentry)) + nfs_mark_for_revalidate(d_inode(data->old_dentry)); dput(data->old_dentry); dput(data->new_dentry); @@ -529,10 +529,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; - fileid = NFS_FILEID(dentry->d_inode); + fileid = NFS_FILEID(d_inode(dentry)); /* Return delegation in anticipation of the rename */ - NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode); + NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); sdentry = NULL; do { @@ -554,7 +554,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) */ if (IS_ERR(sdentry)) goto out; - } while (sdentry->d_inode != NULL); /* need negative lookup */ + } while (d_inode(sdentry) != NULL); /* need negative lookup */ /* queue unlink first. Can't do this from rpc_release as it * has to allocate memory diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 759931088094..d12a4be613a5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_inc_stats(inode, NFSIOS_WRITEPAGES); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); @@ -702,7 +702,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; @@ -861,7 +861,7 @@ static void nfs_clear_request_commit(struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); @@ -1591,7 +1591,7 @@ void nfs_init_commit(struct nfs_commit_data *data, struct nfs_commit_info *cinfo) { struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->dentry->d_inode; + struct inode *inode = d_inode(first->wb_context->dentry); /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. 
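The sillyrename loop just above keeps proposing candidate names until the lookup finally comes back negative, i.e. until it finds a name not yet present in the directory; only then can the in-use file be renamed out of the way instead of unlinked. The same search as a self-contained userspace sketch, with access(2) standing in for the dcache lookup and a purely illustrative take on the client's .nfsXXXX naming scheme:

    #include <stdio.h>
    #include <unistd.h>

    /* Fill buf with a name that does not exist in dir; 0 on success. */
    static int pick_unused_name(const char *dir, char *buf, size_t len)
    {
            for (unsigned int counter = 0; counter < 10000; counter++) {
                    snprintf(buf, len, "%s/.nfs%08x", dir, counter);
                    if (access(buf, F_OK) != 0)
                            return 0;       /* "negative lookup": name is free */
            }
            return -1;                      /* too many collisions, give up */
    }

In the kernel the candidate comes from lookup_one_len() and the loop exits on d_inode(sdentry) == NULL, exactly as the "need negative lookup" comment says.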
*/ @@ -1690,7 +1690,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { @@ -1840,17 +1840,16 @@ EXPORT_SYMBOL_GPL(nfs_write_inode); */ int nfs_wb_all(struct inode *inode) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - }; int ret; trace_nfs_writeback_inode_enter(inode); - ret = sync_inode(inode, &wbc); + ret = filemap_write_and_wait(inode->i_mapping); + if (!ret) { + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (!ret) + pnfs_sync_inode(inode, true); + } trace_nfs_writeback_inode_exit(inode, ret); return ret; diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index fc2d108f5272..a0b77fc1bd39 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -108,7 +108,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS help This option enables support for manually injecting faults into the NFS server. This is intended to be used for diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c3e3b6e55ae2..f79521a59747 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -599,7 +599,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags, + err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, exp.ex_uuid); if (err) goto out4; @@ -691,8 +691,7 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b) struct svc_export *orig = container_of(a, struct svc_export, h); struct svc_export *new = container_of(b, struct svc_export, h); return orig->ex_client == new->ex_client && - orig->ex_path.dentry == new->ex_path.dentry && - orig->ex_path.mnt == new->ex_path.mnt; + path_equal(&orig->ex_path, &new->ex_path); } static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) @@ -891,7 +890,7 @@ exp_rootfh(struct net *net, struct auth_domain *clp, char *name, printk("nfsd: exp_rootfh path not found %s", name); return err; } - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, @@ -1159,6 +1158,7 @@ static struct flags { { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, + { NFSEXP_PNFS, {"pnfs", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index ac54ea60b3f6..d54701f6dc78 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -42,7 +42,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); @@ -103,7 +103,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, if (nfserr) goto out; - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { error = -EOPNOTSUPP; goto out_errno; @@ -266,9 +266,9 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, * nfsd_dispatch actually 
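The nfs_wb_all() rewrite above stops going through the generic sync_inode()/writeback_control machinery and instead performs the three NFS-specific stages explicitly: write back and wait on dirty pages, COMMIT data the server may only have buffered, then let pNFS push layout state. Each stage runs only if the previous one succeeded, and the first failure is returned. The control-flow shape, with stand-in stubs for filemap_write_and_wait(), nfs_commit_inode() and pnfs_sync_inode():

    static int write_and_wait_pages(void)    { return 0; }   /* stub */
    static int commit_unstable_writes(void)  { return 0; }   /* stub */
    static int sync_layout_state(void)       { return 0; }   /* stub */

    static int wb_all(void)
    {
            int ret = write_and_wait_pages();
            if (!ret) {
                    ret = commit_unstable_writes();
                    if (!ret)
                            sync_layout_state();
            }
            return ret;
    }

Note that the last stage's status is deliberately not folded into ret, mirroring how the patch calls pnfs_sync_inode() without capturing its return value.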
ensures the following cannot happen. * However, it seems fragile to depend on that. */ - if (dentry == NULL || dentry->d_inode == NULL) + if (dentry == NULL || d_really_is_negative(dentry)) return 0; - inode = dentry->d_inode; + inode = d_inode(dentry); p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->mask); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 34cbbab6abd7..882b1a14bc3e 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -39,7 +39,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); @@ -94,7 +94,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, if (nfserr) goto out; - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { error = -EOPNOTSUPP; goto out_errno; @@ -174,8 +174,8 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, struct dentry *dentry = resp->fh.fh_dentry; p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0 && dentry && dentry->d_inode) { - struct inode *inode = dentry->d_inode; + if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { + struct inode *inode = d_inode(dentry); struct kvec *head = rqstp->rq_res.head; unsigned int base; int n; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 12f2aab4f614..7b755b7f785c 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -166,7 +166,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, rqstp->rq_vec, argp->vlen, &resp->count); if (nfserr == 0) { - struct inode *inode = resp->fh.fh_dentry->d_inode; + struct inode *inode = d_inode(resp->fh.fh_dentry); resp->eof = (argp->offset + resp->count) >= inode->i_size; } @@ -551,7 +551,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, * different read/write sizes for file systems known to have * problems with large blocks */ if (nfserr == 0) { - struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; /* Note that we don't care for remote fs's here */ if (sb->s_magic == MSDOS_SUPER_MAGIC) { @@ -587,7 +587,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); if (nfserr == 0) { - struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; /* Note that we don't care for remote fs's here */ switch (sb->s_magic) { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 39c5eb3ad33a..e4b2b4322553 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -146,7 +146,7 @@ static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) default: case FSIDSOURCE_DEV: p = xdr_encode_hyper(p, (u64)huge_encode_dev - (fhp->fh_dentry->d_inode->i_sb->s_dev)); + (d_inode(fhp->fh_dentry)->i_sb->s_dev)); break; case FSIDSOURCE_FSID: p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); @@ -203,14 +203,14 @@ static __be32 * encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && dentry->d_inode) { + if (dentry && d_really_is_positive(dentry)) { __be32 err; struct kstat stat; err = fh_getattr(fhp, &stat); if (!err) { *p++ = xdr_one; /* attributes follow */ - lease_get_mtime(dentry->d_inode, 
&stat.mtime); + lease_get_mtime(d_inode(dentry), &stat.mtime); return encode_fattr3(rqstp, p, fhp, &stat); } } @@ -233,7 +233,7 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && dentry->d_inode && fhp->fh_post_saved) { + if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { if (fhp->fh_pre_saved) { *p++ = xdr_one; p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); @@ -260,11 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; + fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; if (err) { fhp->fh_post_saved = 0; /* Grab the ctime anyway - set_change_info might use it */ - fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; } else fhp->fh_post_saved = 1; } @@ -628,7 +628,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_attrstat *resp) { if (resp->status == 0) { - lease_get_mtime(resp->fh.fh_dentry->d_inode, + lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime); p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); } @@ -828,7 +828,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, return rv; if (d_mountpoint(dchild)) goto out; - if (!dchild->d_inode) + if (d_really_is_negative(dchild)) goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); out: diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 59fd76651781..67242bf7c6cc 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -139,7 +139,7 @@ int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error = 0; struct posix_acl *pacl = NULL, *dpacl = NULL; unsigned int flags = 0; @@ -499,43 +499,13 @@ static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_s state->mask.allow |= astate->allow; } -/* - * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS, - * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate - * to traditional read/write/execute permissions. - * - * It's problematic to reject acls that use certain mode bits, because it - * places the burden on users to learn the rules about which bits one - * particular server sets, without giving the user a lot of help--we return an - * error that could mean any number of different things. To make matters - * worse, the problematic bits might be introduced by some application that's - * automatically mapping from some other acl model. - * - * So wherever possible we accept anything, possibly erring on the side of - * denying more permissions than necessary. 
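fill_post_wcc() above snapshots the inode attributes after an operation completes (falling back to just grabbing the ctime when the getattr fails), and encode_wcc_data() pairs that with the pre-operation snapshot so NFSv3 clients can tell whether anyone else changed the directory in between. The bracketing pattern, reduced to a sketch with stand-in types and a stubbed getattr:

    #include <stdbool.h>
    #include <time.h>

    struct attrs { struct timespec ctime; long long size; };

    struct wcc_data {
            struct attrs pre, post;
            bool pre_saved, post_saved;
    };

    static void get_attrs(struct attrs *a) { /* stand-in for fh_getattr() */ }

    /* Bracket a directory-modifying operation with attribute snapshots. */
    static void apply_with_wcc(struct wcc_data *w, void (*op)(void))
    {
            get_attrs(&w->pre);
            w->pre_saved = true;
            op();
            get_attrs(&w->post);
            w->post_saved = true;
    }

If both snapshots are valid the server encodes them as weak cache consistency data; if not, it encodes whatever it has, which is the pre_saved/post_saved dance visible in encode_wcc_data().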
- *
- * However we do reject *explicit* DENY's of a few bits representing
- * permissions we could never deny:
- */
-
-static inline int check_deny(u32 mask, int isowner)
-{
-        if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
-                return -EINVAL;
-        if (!isowner)
-                return 0;
-        if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
-                return -EINVAL;
-        return 0;
-}
-
 static struct posix_acl *
 posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 {
         struct posix_acl_entry *pace;
         struct posix_acl *pacl;
         int nace;
-        int i, error = 0;
+        int i;

         /*
          * ACLs with no ACEs are treated differently in the inheritable
@@ -560,17 +530,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)

         pace = pacl->a_entries;
         pace->e_tag = ACL_USER_OBJ;
-        error = check_deny(state->owner.deny, 1);
-        if (error)
-                goto out_err;
         low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);

         for (i=0; i < state->users->n; i++) {
                 pace++;
                 pace->e_tag = ACL_USER;
-                error = check_deny(state->users->aces[i].perms.deny, 0);
-                if (error)
-                        goto out_err;
                 low_mode_from_nfs4(state->users->aces[i].perms.allow,
                                         &pace->e_perm, flags);
                 pace->e_uid = state->users->aces[i].uid;
@@ -579,18 +543,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)

         pace++;
         pace->e_tag = ACL_GROUP_OBJ;
-        error = check_deny(state->group.deny, 0);
-        if (error)
-                goto out_err;
         low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
         add_to_mask(state, &state->group);

         for (i=0; i < state->groups->n; i++) {
                 pace++;
                 pace->e_tag = ACL_GROUP;
-                error = check_deny(state->groups->aces[i].perms.deny, 0);
-                if (error)
-                        goto out_err;
                 low_mode_from_nfs4(state->groups->aces[i].perms.allow,
                                         &pace->e_perm, flags);
                 pace->e_gid = state->groups->aces[i].gid;
@@ -605,15 +563,9 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)

         pace++;
         pace->e_tag = ACL_OTHER;
-        error = check_deny(state->other.deny, 0);
-        if (error)
-                goto out_err;
         low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);

         return pacl;
-out_err:
-        posix_acl_release(pacl);
-        return ERR_PTR(error);
 }

 static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
@@ -828,7 +780,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
                 return error;

         dentry = fhp->fh_dentry;
-        inode = dentry->d_inode;
+        inode = d_inode(dentry);
         if (!inode->i_op->set_acl || !IS_POSIXACL(inode))
                 return nfserr_attrnotsupp;

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 92b9d97aff4f..864e2003e8de 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -52,7 +52,7 @@ static inline void
 nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
 {
-        struct inode *inode = resfh->fh_dentry->d_inode;
+        struct inode *inode = d_inode(resfh->fh_dentry);
         int status;

         mutex_lock(&inode->i_mutex);
@@ -110,7 +110,7 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
          * in current environment or not.
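The deleted check_deny() enforced EINVAL for explicit DENY entries naming bits the server could never enforce anyway; as the surviving comment argues, that mostly punished ACL-mapping clients with an opaque error. With it gone, unenforceable bits are simply not translated, the same policy the allow path already used. A condensed illustration of that "translate what maps, ignore the rest" approach (bit values chosen to match the RFC 3530 ACE masks, but treat them as illustrative):

    #include <stdint.h>

    #define ACE_READ_DATA        0x00000001u
    #define ACE_WRITE_DATA       0x00000002u
    #define ACE_EXECUTE          0x00000020u
    #define ACE_READ_ATTRIBUTES  0x00000080u   /* unenforceable */
    #define ACE_READ_ACL         0x00020000u   /* unenforceable */

    /* Map an NFSv4 ACE mask onto rwx bits; bits with no POSIX
     * equivalent fall through without causing an error. */
    static unsigned int ace_mask_to_rwx(uint32_t mask)
    {
            unsigned int rwx = 0;

            if (mask & ACE_READ_DATA)
                    rwx |= 4;
            if (mask & ACE_WRITE_DATA)
                    rwx |= 2;
            if (mask & ACE_EXECUTE)
                    rwx |= 1;
            return rwx;
    }

A second effect of the removal is that posix_state_to_acl() can no longer fail, which is why its error local, the out_err unwind label and the ERR_PTR() return all disappear in the hunks above.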
*/ if (bmval[0] & FATTR4_WORD0_ACL) { - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_inode(dentry))) return nfserr_attrnotsupp; } @@ -209,7 +209,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) { - umode_t mode = fh->fh_dentry->d_inode->i_mode; + umode_t mode = d_inode(fh->fh_dentry)->i_mode; if (S_ISREG(mode)) return nfs_ok; @@ -470,7 +470,7 @@ out: fh_put(resfh); kfree(resfh); } - nfsd4_cleanup_open_state(cstate, open, status); + nfsd4_cleanup_open_state(cstate, open); nfsd4_bump_seqid(cstate, status); return status; } @@ -881,7 +881,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &exp, &dentry); if (err) return err; - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; } else @@ -1030,6 +1030,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return status; } + if (!file) + return nfserr_bad_stateid; status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, fallocate->falloc_offset, @@ -1069,6 +1071,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; } + if (!file) + return nfserr_bad_stateid; switch (seek->seek_whence) { case NFS4_CONTENT_DATA: @@ -1308,7 +1312,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) goto out_put_stid; - nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, + nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), current_fh, lgp); if (nfserr) goto out_put_stid; @@ -1342,7 +1346,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); if (!ops) goto out; - inode = current_fh->fh_dentry->d_inode; + inode = d_inode(current_fh->fh_dentry); nfserr = nfserr_inval; if (new_size <= seg->offset) { @@ -1815,7 +1819,7 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, bmap0 &= ~FATTR4_WORD0_FILEHANDLE; } if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { - ret += NFSD4_MAX_SEC_LABEL_LEN + 12; + ret += NFS4_MAXLABELLEN + 12; bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; } /* @@ -2282,13 +2286,13 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_allocate, .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_ALLOCATE", - .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_DEALLOCATE] = { .op_func = (nfsd4op_func)nfsd4_deallocate, .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_DEALLOCATE", - .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 1c307f02baa8..d88ea7b9a85c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,14 +192,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } - if (dentry->d_inode) + if (d_really_is_positive(dentry)) /* * In the 4.1 case, where we're called from * reclaim_complete(), records from the 
previous reboot @@ -209,11 +209,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -285,7 +285,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); while (!list_empty(&ctx.names)) { struct name_list *entry; entry = list_entry(ctx.names.next, struct name_list, list); @@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); nfs4_reset_creds(original_cred); return status; } @@ -316,20 +316,20 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } status = -ENOENT; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(dir->d_inode, dentry); + status = vfs_rmdir(d_inode(dir), dentry); out: dput(dentry); out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); return status; } @@ -385,7 +385,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(child->d_name.name, nn)) return 0; - status = vfs_rmdir(parent->d_inode, child); + status = vfs_rmdir(d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 326a545ea7b2..38f2d7abe3a7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1139,7 +1139,7 @@ hash_sessionid(struct nfs4_sessionid *sessionid) return sid->sequence % SESSION_HASH_SIZE; } -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { @@ -4049,7 +4049,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfserr_bad_stateid; if (nfsd4_is_deleg_cur(open)) goto out; - status = nfserr_jukebox; } /* @@ -4118,7 +4117,7 @@ out: } void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, - struct nfsd4_open *open, __be32 status) + struct nfsd4_open *open) { if (open->op_openowner) { struct nfs4_stateowner *so = &open->op_openowner->oo_owner; @@ -4473,7 +4472,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; struct svc_fh *current_fh = &cstate->current_fh; - struct inode *ino = current_fh->fh_dentry->d_inode; + struct inode *ino = d_inode(current_fh->fh_dentry); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct file *file = NULL; __be32 status; @@ -5171,7 +5170,7 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_file *fi = ost->st_stid.sc_file; struct nfs4_openowner 
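nfsd4_create_clid_dir() and nfsd4_unlink_clid_dir() above both follow the standard VFS recipe for directory modification: take the parent's i_mutex, do the lookup under the lock, check whether the resulting dentry is positive or negative, and only then create or remove. A compilable miniature of the create side (a pthread mutex standing in for i_mutex; the helpers are stubs):

    #include <pthread.h>
    #include <stdbool.h>

    struct dir {
            pthread_mutex_t lock;           /* stands in for i_mutex */
    };

    static bool name_exists(struct dir *d, const char *name) { return false; }
    static int do_mkdir(struct dir *d, const char *name)     { return 0; }

    /* Create name under d; an already-existing entry is treated as
     * success, matching the "be forgiving" comment in the patch. */
    static int create_locked(struct dir *d, const char *name)
    {
            int ret = 0;

            pthread_mutex_lock(&d->lock);
            if (!name_exists(d, name))
                    ret = do_mkdir(d, name);
            pthread_mutex_unlock(&d->lock);
            return ret;
    }

Holding the lock is what makes the exists-check and the mkdir a single atomic step with respect to concurrent lookups, which is the whole point of the lookup_one_len()-under-i_mutex pattern.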
*oo = openowner(ost->st_stateowner); struct nfs4_client *cl = oo->oo_owner.so_client; - struct inode *inode = cstate->current_fh.fh_dentry->d_inode; + struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; unsigned int strhashval; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5fb7e78169a6..158badf945df 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -424,7 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, len += 4; dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); - if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) + if (dummy32 > NFS4_MAXLABELLEN) return nfserr_badlabel; len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -2020,7 +2020,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr, * dentries/path components in an array. */ for (;;) { - if (cur.dentry == root->dentry && cur.mnt == root->mnt) + if (path_equal(&cur, root)) break; if (cur.dentry == cur.mnt->mnt_root) { if (follow_up(&cur)) @@ -2292,7 +2292,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, #ifdef CONFIG_NFSD_V4_SECURITY_LABEL if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { - err = security_inode_getsecctx(dentry->d_inode, + err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); contextsupport = (err == 0); if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { @@ -2384,7 +2384,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; - p = encode_change(p, &stat, dentry->d_inode); + p = encode_change(p, &stat, d_inode(dentry)); } if (bmval0 & FATTR4_WORD0_SIZE) { p = xdr_reserve_space(xdr, 8); @@ -2807,7 +2807,7 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { /* * nfsd_buffered_readdir drops the i_mutex between * readdir and calling this callback, leaving a window @@ -3324,7 +3324,7 @@ static __be32 nfsd4_encode_splice_read( } eof = (read->rd_offset + maxcount >= - read->rd_fhp->fh_dentry->d_inode->i_size); + d_inode(read->rd_fhp->fh_dentry)->i_size); *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3401,7 +3401,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); eof = (read->rd_offset + maxcount >= - read->rd_fhp->fh_dentry->d_inode->i_size); + d_inode(read->rd_fhp->fh_dentry)->i_size); tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); @@ -3422,6 +3422,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, unsigned long maxcount; struct xdr_stream *xdr = &resp->xdr; struct file *file = read->rd_filp; + struct svc_fh *fhp = read->rd_fhp; int starting_len = xdr->buf->len; struct raparms *ra; __be32 *p; @@ -3445,12 +3446,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (!read->rd_filp) { + if (read->rd_filp) + err = nfsd_permission(resp->rqstp, fhp->fh_export, + fhp->fh_dentry, + NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); + else err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, &file, &ra); - if (err) - goto err_truncate; - } + if (err) + goto err_truncate; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, 
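Both read encoders above compute the reply's EOF flag the same way: the read reached end-of-file when the requested offset plus the byte count actually encoded meets or passes i_size. In isolation:

    #include <stdbool.h>
    #include <stdint.h>

    /* NFSv4 READ reply eof bit. */
    static bool read_hits_eof(uint64_t offset, uint64_t count, uint64_t i_size)
    {
            return offset + count >= i_size;
    }

Using >= rather than > means a read whose last byte is the final byte of the file already reports eof, saving the client a follow-up zero-byte READ.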
&resp->rqstp->rq_flags)) err = nfsd4_encode_splice_read(resp, read, file, maxcount); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index aa47d75ddb26..9690cb4dd588 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1250,15 +1250,15 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = register_cld_notifier(); - if (retval) - return retval; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_unregister_notifier; - retval = nfsd4_init_slabs(); + return retval; + retval = register_cld_notifier(); if (retval) goto out_unregister_pernet; + retval = nfsd4_init_slabs(); + if (retval) + goto out_unregister_notifier; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; @@ -1290,10 +1290,10 @@ out_exit_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); -out_unregister_pernet: - unregister_pernet_subsys(&nfsd_net_ops); out_unregister_notifier: unregister_cld_notifier(); +out_unregister_pernet: + unregister_pernet_subsys(&nfsd_net_ops); return retval; } @@ -1308,8 +1308,8 @@ static void __exit exit_nfsd(void) nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); - unregister_pernet_subsys(&nfsd_net_ops); unregister_cld_notifier(); + unregister_pernet_subsys(&nfsd_net_ops); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 565c4da1a9eb..cf980523898b 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -24,7 +24,7 @@ #include "export.h" #undef ifdebug -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG # define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) #else # define ifdebug(flag) if (0) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index e9fa966fc37f..350041a40fe5 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -38,7 +38,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(parent->d_inode, MAY_EXEC); + err = inode_permission(d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); break; @@ -340,7 +340,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) if (error) goto out; - error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); + error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type); if (error) goto out; @@ -412,8 +412,8 @@ static inline void _fh_update_old(struct dentry *dentry, struct svc_export *exp, struct knfsd_fh *fh) { - fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino); - fh->ofh_generation = dentry->d_inode->i_generation; + fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino); + fh->ofh_generation = d_inode(dentry)->i_generation; if (d_is_dir(dentry) || (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) fh->ofh_dirino = 0; @@ -426,7 +426,7 @@ static bool is_root_export(struct svc_export *exp) static struct super_block *exp_sb(struct svc_export *exp) { - return exp->ex_path.dentry->d_inode->i_sb; + return d_inode(exp->ex_path.dentry)->i_sb; } static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) @@ -520,12 +520,12 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, * */ - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); dev_t ex_dev = exp_sb(exp)->s_dev; dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", MAJOR(ex_dev), MINOR(ex_dev), - (long) exp->ex_path.dentry->d_inode->i_ino, + (long) 
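The init_nfsd() reshuffle above restores the invariant that makes goto-based error handling work: teardown must run in exactly the reverse order of setup, on the failure path and in exit_nfsd() alike (before the fix, the CLD notifier and the pernet subsys were torn down in registration order). The idiom in the small, with stub setup/teardown functions:

    static int  setup_a(void)    { return 0; }
    static int  setup_b(void)    { return 0; }
    static int  setup_c(void)    { return 0; }
    static void teardown_a(void) { }
    static void teardown_b(void) { }

    static int init_all(void)
    {
            int err;

            err = setup_a();
            if (err)
                    return err;
            err = setup_b();
            if (err)
                    goto out_a;
            err = setup_c();
            if (err)
                    goto out_b;
            return 0;

    out_b:                  /* unwind strictly in reverse order */
            teardown_b();
    out_a:
            teardown_a();
            return err;
    }

Keeping the labels sorted in reverse setup order, as the patch does for out_unregister_notifier and out_unregister_pernet, means a new setup step only ever adds one goto and one label.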
d_inode(exp->ex_path.dentry)->i_ino, dentry, (inode ? inode->i_ino : 0)); @@ -558,7 +558,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; fhp->fh_handle.ofh_xino = - ino_t_to_u32(exp->ex_path.dentry->d_inode->i_ino); + ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino); fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); if (inode) _fh_update_old(dentry, exp, &fhp->fh_handle); @@ -570,7 +570,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, mk_fsid(fhp->fh_handle.fh_fsid_type, fhp->fh_handle.fh_fsid, ex_dev, - exp->ex_path.dentry->d_inode->i_ino, + d_inode(exp->ex_path.dentry)->i_ino, exp->ex_fsid, exp->ex_uuid); if (inode) @@ -597,7 +597,7 @@ fh_update(struct svc_fh *fhp) goto out_bad; dentry = fhp->fh_dentry; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_negative; if (fhp->fh_handle.fh_version != 1) { _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index f22920442172..1e90dad4926b 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -225,7 +225,7 @@ fill_pre_wcc(struct svc_fh *fhp) { struct inode *inode; - inode = fhp->fh_dentry->d_inode; + inode = d_inode(fhp->fh_dentry); if (!fhp->fh_pre_saved) { fhp->fh_pre_mtime = inode->i_mtime; fhp->fh_pre_ctime = inode->i_ctime; @@ -264,7 +264,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) return; } - inode = dentry->d_inode; + inode = d_inode(dentry); mutex_lock_nested(&inode->i_mutex, subclass); fill_pre_wcc(fhp); fhp->fh_locked = 1; @@ -284,7 +284,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); fhp->fh_locked = 0; } } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b8680738f588..aecbcd34d336 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -223,7 +223,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, } fh_init(newfhp, NFS_FHSIZE); nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); - if (!nfserr && !dchild->d_inode) + if (!nfserr && d_really_is_negative(dchild)) nfserr = nfserr_noent; dput(dchild); if (nfserr) { @@ -241,7 +241,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, } } - inode = newfhp->fh_dentry->d_inode; + inode = d_inode(newfhp->fh_dentry); /* Unfudge the mode bits */ if (attr->ia_valid & ATTR_MODE) { diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 412d7061f9e5..79d964aa8079 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -187,7 +187,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl((u32) stat->ino); *p++ = htonl((u32) stat->atime.tv_sec); *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); - lease_get_mtime(dentry->d_inode, &time); + lease_get_mtime(d_inode(dentry), &time); *p++ = htonl((u32) time.tv_sec); *p++ = htonl(time.tv_nsec ? 
time.tv_nsec / 1000 : 0); *p++ = htonl((u32) stat->ctime.tv_sec); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 368526582429..84d770be056e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -174,7 +174,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; - return dentry->d_inode != NULL; + return d_inode(dentry) != NULL; } __be32 @@ -270,7 +270,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, * dentry may be negative, it may need to be updated. */ err = fh_compose(resfh, exp, dentry, fhp); - if (!err && !dentry->d_inode) + if (!err && d_really_is_negative(dentry)) err = nfserr_noent; out: dput(dentry); @@ -284,7 +284,7 @@ out: static int commit_metadata(struct svc_fh *fhp) { - struct inode *inode = fhp->fh_dentry->d_inode; + struct inode *inode = d_inode(fhp->fh_dentry); const struct export_operations *export_ops = inode->i_sb->s_export_op; if (!EX_ISSYNC(fhp->fh_export)) @@ -364,7 +364,7 @@ static __be32 nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { - struct inode *inode = fhp->fh_dentry->d_inode; + struct inode *inode = d_inode(fhp->fh_dentry); int host_err; if (iap->ia_size < inode->i_size) { @@ -426,7 +426,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } dentry = fhp->fh_dentry; - inode = dentry->d_inode; + inode = d_inode(dentry); /* Ignore any mode updates on symlinks */ if (S_ISLNK(inode->i_mode)) @@ -495,7 +495,7 @@ out: */ int nfsd4_is_junction(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode == NULL) return 0; @@ -521,9 +521,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return nfserrno(host_error); } #else @@ -706,7 +706,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); /* Disallow write access to files with the append-only bit set * or any access when mandatory locking enabled @@ -1211,7 +1211,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); err = nfserr_notdir; if (!dirp->i_op->lookup) @@ -1250,7 +1250,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * Make sure the child dentry is still negative ... */ err = nfserr_exist; - if (dchild->d_inode) { + if (d_really_is_positive(dchild)) { dprintk("nfsd_create: dentry %pd/%pd not negative!\n", dentry, dchild); goto out; @@ -1353,7 +1353,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); /* Get all the sanity checks out of the way before * we lock the parent. 
*/ @@ -1376,7 +1376,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; /* If file doesn't exist, check for permissions to create one */ - if (!dchild->d_inode) { + if (d_really_is_negative(dchild)) { err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1397,7 +1397,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - if (dchild->d_inode) { + if (d_really_is_positive(dchild)) { err = 0; switch (createmode) { @@ -1420,17 +1420,17 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, } break; case NFS3_CREATE_EXCLUSIVE: - if ( dchild->d_inode->i_mtime.tv_sec == v_mtime - && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) { + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { if (created) *created = 1; break; } case NFS4_CREATE_EXCLUSIVE4_1: - if ( dchild->d_inode->i_mtime.tv_sec == v_mtime - && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) { + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { if (created) *created = 1; goto set_attr; @@ -1513,7 +1513,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); err = nfserr_inval; if (!inode->i_op->readlink) @@ -1576,7 +1576,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = vfs_symlink(dentry->d_inode, dnew, path); + host_err = vfs_symlink(d_inode(dentry), dnew, path); err = nfserrno(host_err); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1632,7 +1632,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; - dirp = ddir->d_inode; + dirp = d_inode(ddir); dnew = lookup_one_len(name, ddir, len); host_err = PTR_ERR(dnew); @@ -1642,7 +1642,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; err = nfserr_noent; - if (!dold->d_inode) + if (d_really_is_negative(dold)) goto out_dput; host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { @@ -1689,10 +1689,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; fdentry = ffhp->fh_dentry; - fdir = fdentry->d_inode; + fdir = d_inode(fdentry); tdentry = tfhp->fh_dentry; - tdir = tdentry->d_inode; + tdir = d_inode(tdentry); err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) @@ -1717,7 +1717,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out_nfserr; host_err = -ENOENT; - if (!odentry->d_inode) + if (d_really_is_negative(odentry)) goto out_dput_old; host_err = -EINVAL; if (odentry == trap) @@ -1790,21 +1790,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); rdentry = lookup_one_len(fname, dentry, flen); host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) goto out_nfserr; - if (!rdentry->d_inode) { + if (d_really_is_negative(rdentry)) { dput(rdentry); err = nfserr_noent; goto out; } if (!type) - type = rdentry->d_inode->i_mode & S_IFMT; + type = d_inode(rdentry)->i_mode & S_IFMT; if (type != S_IFDIR) 
host_err = vfs_unlink(dirp, rdentry, NULL); @@ -2015,7 +2015,7 @@ __be32 nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct dentry *dentry, int acc) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err; if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 0bda93e58e1b..f982ae84f0cd 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -40,7 +40,6 @@ #include "state.h" #include "nfsd.h" -#define NFSD4_MAX_SEC_LABEL_LEN 2048 #define NFSD4_MAX_TAGLEN 128 #define XDR_LEN(n) (((n) + 3) & ~3) @@ -632,7 +631,7 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); cinfo->atomic = fhp->fh_post_saved; - cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); cinfo->before_change = fhp->fh_pre_change; cinfo->after_change = fhp->fh_post_change; @@ -683,7 +682,7 @@ extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, - struct nfsd4_open *open, __be32 status); + struct nfsd4_open *open); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 197a63e9d102..0ee0bed3649b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -435,7 +435,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, */ int nilfs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = nilfs_chunk_size(dir); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index be936df4ba73..258d9fe2521a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -835,7 +835,7 @@ void nilfs_evict_inode(struct inode *inode) int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) { struct nilfs_transaction_info ti; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int err; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 0f84b257932c..22180836ec22 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -192,7 +192,7 @@ out_fail: static int nilfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct nilfs_transaction_info ti; int err; @@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto out; - inode = dentry->d_inode; + inode = d_inode(dentry); err = -EIO; if (le64_to_cpu(de->inode) != inode->i_ino) goto out; @@ -318,7 +318,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) if (!err) { nilfs_mark_inode_dirty(dir); - nilfs_mark_inode_dirty(dentry->d_inode); + nilfs_mark_inode_dirty(d_inode(dentry)); err = nilfs_transaction_commit(dir->i_sb); } else nilfs_transaction_abort(dir->i_sb); @@ -328,7 +328,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct 
inode *inode = d_inode(dentry); struct nilfs_transaction_info ti; int err; @@ -358,8 +358,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct nilfs_dir_entry *dir_de = NULL; struct page *old_page; @@ -453,13 +453,13 @@ static struct dentry *nilfs_get_parent(struct dentry *child) struct qstr dotdot = QSTR_INIT("..", 2); struct nilfs_root *root; - ino = nilfs_inode_by_name(child->d_inode, &dotdot); + ino = nilfs_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - root = NILFS_I(child->d_inode)->i_root; + root = NILFS_I(d_inode(child))->i_root; - inode = nilfs_iget(child->d_inode->i_sb, root, ino); + inode = nilfs_iget(d_inode(child)->i_sb, root, ino); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c1725f20a9d1..f47585bfeb01 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -610,7 +610,7 @@ static int nilfs_unfreeze(struct super_block *sb) static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; struct the_nilfs *nilfs = root->nilfs; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); unsigned long long blocks; @@ -681,7 +681,7 @@ static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; if (!nilfs_test_opt(nilfs, BARRIER)) seq_puts(seq, ",nobarrier"); @@ -1190,7 +1190,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; - root = NILFS_I(sb->s_root->d_inode)->i_root; + root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); if (err) goto restore_opts; diff --git a/fs/nsfs.c b/fs/nsfs.c index af1b24fa899d..99521e7c492b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,7 +13,7 @@ static const struct file_operations ns_file_operations = { static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", @@ -22,7 +22,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) static void ns_prune_dentry(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode) { struct ns_common *ns = inode->i_private; atomic_long_set(&ns->stashed, 0); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 1d0c21df0d80..d284f07eda77 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2889,7 +2889,7 @@ void ntfs_truncate_vfs(struct inode *vi) { */ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *vi = dentry->d_inode; + struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index b3973c2fd190..0f35b80d17fe 100644 --- 
a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -292,14 +292,14 @@ const struct inode_operations ntfs_dir_inode_ops = { * The code is based on the ext3 ->get_parent() implementation found in * fs/ext3/namei.c::ext3_get_parent(). * - * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down. + * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. * * Return the dentry of the parent directory on success or the error code on * error (IS_ERR() is true). */ static struct dentry *ntfs_get_parent(struct dentry *child_dent) { - struct inode *vi = child_dent->d_inode; + struct inode *vi = d_inode(child_dent); ntfs_inode *ni = NTFS_I(vi); MFT_RECORD *mrec; ntfs_attr_search_ctx *ctx; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 4fda7a5f3088..290373024d9d 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -42,8 +42,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) { unsigned long gen = - OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; - BUG_ON(dentry->d_inode); + OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; + BUG_ON(d_inode(dentry)); dentry->d_fsdata = (void *)gen; } @@ -57,7 +57,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, @@ -71,7 +71,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) unsigned long gen = (unsigned long) dentry->d_fsdata; unsigned long pgen; spin_lock(&dentry->d_lock); - pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; spin_unlock(&dentry->d_lock); trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, dentry->d_name.name, @@ -146,7 +146,7 @@ static int ocfs2_match_dentry(struct dentry *dentry, if (skip_unhashed && d_unhashed(dentry)) return 0; - parent = dentry->d_parent->d_inode; + parent = d_inode(dentry->d_parent); /* Negative parent dentry? 
*/ if (!parent) return 0; @@ -243,7 +243,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (!inode) return 0; - if (!dentry->d_inode && dentry->d_fsdata) { + if (d_really_is_negative(dentry) && dentry->d_fsdata) { /* Converting a negative dentry to positive Clear dentry->d_fsdata */ dentry->d_fsdata = dl = NULL; @@ -446,7 +446,7 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, { int ret; struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); /* * Move within the same directory, so the actual lock info won't diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index f0344b75b14d..3d8639f38973 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -72,7 +72,7 @@ static inline int ocfs2_add_entry(handle_t *handle, struct buffer_head *parent_fe_bh, struct ocfs2_dir_lookup_result *lookup) { - return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + return __ocfs2_add_entry(handle, d_inode(dentry->d_parent), dentry->d_name.name, dentry->d_name.len, inode, blkno, parent_fe_bh, lookup); } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 061ba6a91bf2..b5cf27dcb18a 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -208,7 +208,7 @@ static int dlmfs_file_release(struct inode *inode, static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; error = inode_change_ok(inode, attr); @@ -549,7 +549,7 @@ static int dlmfs_unlink(struct inode *dir, struct dentry *dentry) { int status; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mlog(0, "unlink inode %lu\n", inode->i_ino); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 540dc4bdd042..827fc9809bc2 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -147,7 +147,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) int status; u64 blkno; struct dentry *parent; - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 913fc250d85a..d8b670cbd909 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1126,7 +1126,7 @@ out: int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; @@ -1275,8 +1275,8 @@ int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; - struct super_block *sb = dentry->d_inode->i_sb; + struct inode *inode = d_inode(dentry); + struct super_block *sb = d_inode(dentry)->i_sb; struct ocfs2_super *osb = sb->s_fs_info; int err; @@ -2114,7 +2114,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, { int ret = 0, meta_level = 0; struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); loff_t end; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index be71ca0937f7..b254416dc8d9 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1209,7 +1209,7 @@ int 
ocfs2_drop_inode(struct inode *inode) */ int ocfs2_inode_revalidate(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int status = 0; trace_ocfs2_inode_revalidate(inode, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 09f90cbf0e24..176fe6afd94e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -689,8 +689,8 @@ static int ocfs2_link(struct dentry *old_dentry, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; - struct inode *old_dir = old_dentry->d_parent->d_inode; + struct inode *inode = d_inode(old_dentry); + struct inode *old_dir = d_inode(old_dentry->d_parent); int err; struct buffer_head *fe_bh = NULL; struct buffer_head *old_dir_bh = NULL; @@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir, int status; int child_locked = 0; bool is_unlinkable = false; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); u64 blkno; @@ -898,7 +898,7 @@ static int ocfs2_unlink(struct inode *dir, dquot_initialize(dir); - BUG_ON(dentry->d_parent->d_inode != dir); + BUG_ON(d_inode(dentry->d_parent) != dir); if (inode == osb->root_inode) return -EPERM; @@ -1209,8 +1209,8 @@ static int ocfs2_rename(struct inode *old_dir, { int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct inode *orphan_dir = NULL; struct ocfs2_dinode *newfe = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; @@ -1454,7 +1454,7 @@ static int ocfs2_rename(struct inode *old_dir, should_add_orphan = true; } } else { - BUG_ON(new_dentry->d_parent->d_inode != new_dir); + BUG_ON(d_inode(new_dentry->d_parent) != new_dir); status = ocfs2_check_dir_for_entry(new_dir, new_dentry->d_name.name, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index df3a500789c7..d8c6af101f3f 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4194,7 +4194,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, bool preserve) { int ret; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct buffer_head *new_bh = NULL; if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { @@ -4263,7 +4263,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { int error; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; struct posix_acl *default_acl, *acl; @@ -4357,7 +4357,7 @@ out: /* copied from may_create in VFS. 
*/ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -4375,7 +4375,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; if (!inode) @@ -4463,7 +4463,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, } error = ocfs2_vfs_reflink(old_path.dentry, - new_path.dentry->d_inode, + d_inode(new_path.dentry), new_dentry, preserve); out_dput: done_path_create(&new_path, new_dentry); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ca7533be479..d03bfbf3d27d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1020,7 +1020,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, int ret = 0, i_ret = 0, b_ret = 0; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry)); if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) return -EOPNOTSUPP; @@ -1028,7 +1028,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return ret; - ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0); if (ret < 0) { mlog_errno(ret); return ret; @@ -1037,7 +1037,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, di = (struct ocfs2_dinode *)di_bh->b_data; down_read(&oi->ip_xattr_sem); - i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size); if (i_ret < 0) b_ret = 0; else { @@ -1045,13 +1045,13 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, buffer += i_ret; size -= i_ret; } - b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + b_ret = ocfs2_xattr_block_list(d_inode(dentry), di, buffer, size); if (b_ret < 0) i_ret = 0; } up_read(&oi->ip_xattr_sem); - ocfs2_inode_unlock(dentry->d_inode, 0); + ocfs2_inode_unlock(d_inode(dentry), 0); brelse(di_bh); @@ -7257,7 +7257,7 @@ static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -7267,7 +7267,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7347,7 +7347,7 @@ static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -7357,7 +7357,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7399,7 +7399,7 @@ static int ocfs2_xattr_user_get(struct dentry *dentry, const char 
*name, return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, buffer, size); } @@ -7413,7 +7413,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 1b8e9e8405b2..f833bf8d5792 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -110,7 +110,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb) static int omfs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct omfs_inode *oi; @@ -155,7 +155,7 @@ out: static int omfs_delete_entry(struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct inode *dirty; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -237,7 +237,7 @@ static int omfs_dir_is_empty(struct inode *inode) static int omfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int ret; @@ -373,8 +373,8 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); int err; if (new_inode) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index f993be7f2156..d9e26cfbb793 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -346,7 +346,7 @@ const struct file_operations omfs_file_operations = { static int omfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/open.c b/fs/open.c index 6796f04d6032..98e5a52dc68c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -231,8 +231,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~FALLOC_FL_SUPPORTED_MASK) return -EOPNOTSUPP; /* Punch hole and zero range are mutually exclusive */ @@ -250,6 +249,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_COLLAPSE_RANGE)) return -EINVAL; + /* Insert range should only be used exclusively. 
*/ + if ((mode & FALLOC_FL_INSERT_RANGE) && + (mode & ~FALLOC_FL_INSERT_RANGE)) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; diff --git a/fs/pipe.c b/fs/pipe.c index 822da5b7cff0..8865f7963700 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -627,7 +627,7 @@ static struct vfsmount *pipe_mnt __read_mostly; static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", - dentry->d_inode->i_ino); + d_inode(dentry)->i_ino); } static const struct dentry_operations pipefs_dentry_operations = { diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3a48bb789c9f..84bb65b83570 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -774,12 +774,12 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name, struct posix_acl *acl; int error; - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) return -EOPNOTSUPP; - acl = get_acl(dentry->d_inode, type); + acl = get_acl(d_backing_inode(dentry), type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -795,7 +795,7 @@ static int posix_acl_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; int ret; @@ -834,7 +834,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, const char *xname; size_t size; - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) return -EOPNOTSUPP; diff --git a/fs/proc/base.c b/fs/proc/base.c index 7a3b82f986dd..093ca14f5701 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -169,7 +169,7 @@ static int get_task_root(struct task_struct *task, struct path *root) static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(dentry->d_inode); + struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { @@ -186,7 +186,7 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path) static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(dentry->d_inode); + struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { @@ -514,7 +514,7 @@ static int proc_fd_access_allowed(struct inode *inode) int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; @@ -1362,7 +1362,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1382,7 +1382,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct path path; int error = -EACCES; @@ -1427,7 +1427,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; - struct inode *inode = dentry->d_inode; + struct inode *inode = 
d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ @@ -1497,7 +1497,7 @@ out_unlock: int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct task_struct *task; const struct cred *cred; struct pid_namespace *pid = dentry->d_sb->s_fs_info; @@ -1554,7 +1554,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); if (task) { @@ -1588,7 +1588,7 @@ int pid_delete_dentry(const struct dentry *dentry) * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return proc_inode_is_dead(dentry->d_inode); + return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = @@ -1626,12 +1626,12 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_alloc(dir, &qname); if (!child) goto end_instantiate; - if (instantiate(dir->d_inode, child, task, ptr) < 0) { + if (instantiate(d_inode(dir), child, task, ptr) < 0) { dput(child); goto end_instantiate; } } - inode = child->d_inode; + inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); @@ -1674,7 +1674,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_notask; } - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; @@ -1727,7 +1727,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path) int rc; rc = -ENOENT; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (!task) goto out; @@ -2863,13 +2863,13 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = ns->proc_self->d_inode; + struct inode *inode = d_inode(ns->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = ns->proc_thread_self->d_inode; + struct inode *inode = d_inode(ns->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; @@ -3188,7 +3188,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(inode, stat); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index af84ad04df77..6e5fcd00733e 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -91,7 +91,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); fd = proc_fd(inode); @@ -151,14 +151,14 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) struct task_struct *task; int ret = -ENOENT; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (task) { files = get_files_struct(task); put_task_struct(task); } if (files) { - int fd = proc_fd(dentry->d_inode); + int fd = proc_fd(d_inode(dentry)); struct file *fd_file; spin_lock(&files->file_lock); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index be65b2082135..df6327a2b865 
100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -101,7 +101,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; @@ -120,7 +120,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); if (de && de->nlink) set_nlink(inode, de->nlink); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7697b6621cfd..8272aaba1bb0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -396,7 +396,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = { static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct proc_dir_entry *pde = PDE(dentry->d_inode); + struct proc_dir_entry *pde = PDE(d_inode(dentry)); if (unlikely(!use_pde(pde))) return ERR_PTR(-EINVAL); nd_set_link(nd, pde->data); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index c9eac4563fa8..e512642dbbdc 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -32,7 +32,7 @@ static const struct proc_ns_operations *ns_entries[] = { static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; @@ -53,7 +53,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; char name[50]; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 1bde894bc624..350984a19c83 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -142,7 +142,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct net *net; net = get_proc_task_net(inode); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f92d5dd578a4..fea2561d773b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -604,7 +604,7 @@ static bool proc_sys_fill_cache(struct file *file, return false; } } - inode = child->d_inode; + inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); @@ -710,7 +710,7 @@ static int proc_sys_permission(struct inode *inode, int mask) static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) @@ -727,7 +727,7 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; @@ 
-773,12 +773,12 @@ static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; - return !PROC_I(dentry->d_inode)->sysctl->unregistering; + return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int proc_sys_delete(const struct dentry *dentry) { - return !!PROC_I(dentry->d_inode)->sysctl->unregistering; + return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) @@ -805,7 +805,7 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *de /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ /* AV: can it, indeed? */ - inode = ACCESS_ONCE(dentry->d_inode); + inode = d_inode_rcu(dentry); if (!inode) return 1; if (name->len != len) diff --git a/fs/proc/root.c b/fs/proc/root.c index e74ac9f1a2c0..b7fa4bfe896a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -195,7 +195,7 @@ void __init proc_root_init(void) static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat ) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 4348bb8907c2..6195b4a7c3b1 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -46,7 +46,7 @@ static unsigned self_inum; int proc_setup_self(struct super_block *s) { - struct inode *root_inode = s->s_root->d_inode; + struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = s->s_fs_info; struct dentry *self; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 59075b509df3..a8371993b4fb 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -47,7 +47,7 @@ static unsigned thread_self_inum; int proc_setup_thread_self(struct super_block *s) { - struct inode *root_inode = s->s_root->d_inode; + struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56e1ffda4d89..dc43b5f29305 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -190,7 +190,7 @@ static const struct file_operations pstore_file_operations = { */ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { - struct pstore_private *p = dentry->d_inode->i_private; + struct pstore_private *p = d_inode(dentry)->i_private; int err; err = pstore_check_syslog_permissions(p); @@ -199,7 +199,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (p->psi->erase) p->psi->erase(p->type, p->id, p->count, - dentry->d_inode->i_ctime, p->psi); + d_inode(dentry)->i_ctime, p->psi); else return -EPERM; @@ -376,7 +376,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_alloc_name(root, name); if (!dentry) @@ -396,12 +396,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return 0; fail_lockedalloc: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); kfree(private); fail_alloc: iput(inode); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 44e73923670d..32d2e1a9774c 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -182,7 +182,7 @@ 
static const char *qnx6_checkroot(struct super_block *s) static char match_root[2][3] = {".\0\0", "..\0"}; int i, error = 0; struct qnx6_dir_entry *dir_entry; - struct inode *root = s->s_root->d_inode; + struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; struct page *page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ecc25cf0ee6e..20d1f74561cf 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2328,7 +2328,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) error = -EXDEV; else - error = vfs_load_quota_inode(path->dentry->d_inode, type, + error = vfs_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; @@ -2392,20 +2392,20 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&sb->s_root->d_inode->i_mutex); + mutex_lock(&d_inode(sb->s_root)->i_mutex); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&sb->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(sb->s_root)->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { error = -ENOENT; goto out; } error = security_quota_on(dentry); if (!error) - error = vfs_load_quota_inode(dentry->d_inode, type, format_id, + error = vfs_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); out: diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 0b38befa69f3..ba1323a94924 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -163,7 +163,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) */ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int old_ia_valid = ia->ia_valid; int ret = 0; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 0a7dc941aaf4..4a024e2ceb9f 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -53,8 +53,8 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) { struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; - return (privroot->d_inode && - deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); + return (d_really_is_positive(privroot) && + deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid); } int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 742242b60972..f6f2fbad9777 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3308,7 +3308,7 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index cd11358b10c7..b55a074653d7 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -400,7 +400,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child) struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); if (dir->i_nlink == 0) { return 
ERR_PTR(-ENOENT); @@ -917,7 +917,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; } - inode = dentry->d_inode; + inode = d_inode(dentry); reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -987,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) dquot_initialize(dir); - inode = dentry->d_inode; + inode = d_inode(dentry); /* * in this transaction we can be doing at max two balancings and @@ -1174,7 +1174,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct reiserfs_transaction_handle th; /* * We need blocks for transaction + update of quotas for @@ -1311,8 +1311,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(old_dir); dquot_initialize(new_dir); - old_inode = old_dentry->d_inode; - new_dentry_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_dentry_inode = d_inode(new_dentry); /* * make sure that oldname still exists and points to an object we diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 68b5f182984e..0111ad0466ed 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1687,7 +1687,7 @@ static __u32 find_hash_out(struct super_block *s) __u32 hash = DEFAULT_HASH; __u32 deh_hashval, teahash, r5hash, yurahash; - inode = s->s_root->d_inode; + inode = d_inode(s->s_root); make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); retval = search_by_entry_key(s, &key, &path, &de); @@ -2347,7 +2347,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, err = -EXDEV; goto out; } - inode = path->dentry->d_inode; + inode = d_inode(path->dentry); /* * We must not pack tails for quota files on reiserfs for quota * IO to work diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4e781e697c90..e87f9b52bf06 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -87,9 +87,9 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); if (!error) d_delete(dentry); @@ -102,11 +102,11 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) - dentry->d_inode->i_flags |= S_DEAD; - mutex_unlock(&dentry->d_inode->i_mutex); + d_inode(dentry)->i_flags |= S_DEAD; + mutex_unlock(&d_inode(dentry)->i_mutex); if (!error) d_delete(dentry); @@ -120,26 +120,26 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) struct dentry *privroot = REISERFS_SB(sb)->priv_root; struct dentry *xaroot; - if (!privroot->d_inode) + if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) xaroot = ERR_PTR(-ENODATA); - else if (!xaroot->d_inode) { + else if (d_really_is_negative(xaroot)) { int err = -ENODATA; if 
(xattr_may_create(flags)) - err = xattr_mkdir(privroot->d_inode, xaroot, 0700); + err = xattr_mkdir(d_inode(privroot), xaroot, 0700); if (err) { dput(xaroot); xaroot = ERR_PTR(err); } } - mutex_unlock(&privroot->d_inode->i_mutex); + mutex_unlock(&d_inode(privroot)->i_mutex); return xaroot; } @@ -156,21 +156,21 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); - if (!IS_ERR(xadir) && !xadir->d_inode) { + if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { int err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_mkdir(xaroot->d_inode, xadir, 0700); + err = xattr_mkdir(d_inode(xaroot), xadir, 0700); if (err) { dput(xadir); xadir = ERR_PTR(err); } } - mutex_unlock(&xaroot->d_inode->i_mutex); + mutex_unlock(&d_inode(xaroot)->i_mutex); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); + WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -207,7 +207,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { return PTR_ERR(dentry); - } else if (!dentry->d_inode) { + } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ reiserfs_error(dentry->d_sb, "xattr-20003", "Corrupted directory: xattr %pd listed but " @@ -249,16 +249,16 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_ERR(dir)) { err = PTR_ERR(dir); goto out; - } else if (!dir->d_inode) { + } else if (d_really_is_negative(dir)) { err = 0; goto out_dir; } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); buf.xadir = dir; while (1) { - err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); if (err) break; if (!buf.count) @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, + mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&dir->d_parent->d_inode->i_mutex); + mutex_unlock(&d_inode(dir->d_parent)->i_mutex); err = jerror ?: err; } } @@ -319,7 +319,7 @@ out: static int delete_one_xattr(struct dentry *dentry, void *data) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); /* This is the xattr dir, handle specially. 
*/ if (d_is_dir(dentry)) @@ -384,27 +384,27 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); goto out; } - if (xafile->d_inode && (flags & XATTR_CREATE)) + if (d_really_is_positive(xafile) && (flags & XATTR_CREATE)) err = -EEXIST; - if (!xafile->d_inode) { + if (d_really_is_negative(xafile)) { err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_create(xadir->d_inode, xafile, + err = xattr_create(d_inode(xadir), xafile, 0700|S_IFREG); } if (err) dput(xafile); out: - mutex_unlock(&xadir->d_inode->i_mutex); + mutex_unlock(&d_inode(xadir)->i_mutex); dput(xadir); if (err) return ERR_PTR(err); @@ -469,21 +469,21 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_dput; } - if (dentry->d_inode) { - err = xattr_unlink(xadir->d_inode, dentry); + if (d_really_is_positive(dentry)) { + err = xattr_unlink(d_inode(xadir), dentry); update_ctime(inode); } dput(dentry); out_dput: - mutex_unlock(&xadir->d_inode->i_mutex); + mutex_unlock(&d_inode(xadir)->i_mutex); dput(xadir); return err; } @@ -533,7 +533,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, else chunk = buffer_size - buffer_pos; - page = reiserfs_get_page(dentry->d_inode, file_pos); + page = reiserfs_get_page(d_inode(dentry), file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); goto out_unlock; @@ -573,18 +573,18 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, } new_size = buffer_size + sizeof(struct reiserfs_xattr_header); - if (!err && new_size < i_size_read(dentry->d_inode)) { + if (!err && new_size < i_size_read(d_inode(dentry))) { struct iattr newattrs = { .ia_ctime = current_fs_time(inode->i_sb), .ia_size = new_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - inode_dio_wait(dentry->d_inode); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); } else update_ctime(inode); out_unlock: @@ -657,7 +657,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, down_read(&REISERFS_I(inode)->i_xattr_sem); - isize = i_size_read(dentry->d_inode); + isize = i_size_read(d_inode(dentry)); /* Just return the size needed */ if (buffer == NULL) { @@ -680,7 +680,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, else chunk = isize - file_pos; - page = reiserfs_get_page(dentry->d_inode, file_pos); + page = reiserfs_get_page(d_inode(dentry), file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); goto out_unlock; @@ -775,7 +775,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->get(dentry, 
name, buffer, size, handler->flags); @@ -784,7 +784,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, /* * Inode operation setxattr() * - * dentry->d_inode->i_mutex down + * d_inode(dentry)->i_mutex down */ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, @@ -794,7 +794,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->set(dentry, name, value, size, flags, handler->flags); @@ -803,7 +803,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, /* * Inode operation removexattr() * - * dentry->d_inode->i_mutex down + * d_inode(dentry)->i_mutex down */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { @@ -811,7 +811,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); @@ -875,14 +875,14 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) .size = buffer ? size : 0, }; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return -EINVAL; if (!dentry->d_sb->s_xattr || - get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; - dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE); + dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); if (err == -ENODATA) @@ -890,9 +890,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); + mutex_unlock(&d_inode(dir)->i_mutex); if (!err) err = buf.pos; @@ -905,12 +905,12 @@ out: static int create_privroot(struct dentry *dentry) { int err; - struct inode *inode = dentry->d_parent->d_inode; + struct inode *inode = d_inode(dentry->d_parent); WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); err = xattr_mkdir(inode, dentry, 0700); - if (err || !dentry->d_inode) { + if (err || d_really_is_negative(dentry)) { reiserfs_warning(dentry->d_sb, "jdm-20006", "xattrs/ACLs enabled and couldn't " "find/create .reiserfs_priv. 
" @@ -918,7 +918,7 @@ static int create_privroot(struct dentry *dentry) return -EOPNOTSUPP; } - dentry->d_inode->i_flags |= S_PRIVATE; + d_inode(dentry)->i_flags |= S_PRIVATE; reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " "storage.\n", PRIVROOT_NAME); @@ -997,17 +997,17 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&s->s_root->d_inode->i_mutex); + mutex_lock(&d_inode(s->s_root)->i_mutex); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (dentry->d_inode) - dentry->d_inode->i_flags |= S_PRIVATE; + if (d_really_is_positive(dentry)) + d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&s->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(s->s_root)->i_mutex); return err; } @@ -1026,15 +1026,15 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (err) goto error; - if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { - mutex_lock(&s->s_root->d_inode->i_mutex); + if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + mutex_lock(&d_inode(s->s_root)->i_mutex); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&s->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(s->s_root)->i_mutex); } - if (privroot->d_inode) { + if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&privroot->d_inode->i_mutex); + mutex_lock(&d_inode(privroot)->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1045,7 +1045,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&privroot->d_inode->i_mutex); + mutex_unlock(&d_inode(privroot)->i_mutex); } error: diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index f620e9678dd5..15dde6262c00 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -78,7 +78,7 @@ static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - if (!REISERFS_SB(inode->i_sb)->xattr_root->d_inode) + if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root)) nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index e7f8939a4cb5..9a3b0616f283 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -15,10 +15,10 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -28,10 +28,10 @@ security_set(struct dentry *dentry, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t security_list(struct dentry *dentry, char *list, size_t list_len, @@ -39,7 +39,7 @@ static size_t security_list(struct dentry *dentry, char *list, 
size_t list_len, { const size_t len = namelen + 1; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return 0; if (list && len <= list_len) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5eeb0c48ba46..e4f1343714e0 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -14,10 +14,10 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -27,10 +27,10 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, @@ -38,7 +38,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, { const size_t len = name_len + 1; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return 0; if (list && len <= list_size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index e50eab046471..d0b08d3e5689 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -15,7 +15,7 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, return -EINVAL; if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -27,7 +27,7 @@ user_set(struct dentry *dentry, const char *name, const void *buffer, if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t user_list(struct dentry *dentry, char *list, size_t list_size, diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 5e1101ff276f..8073b6532cf0 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -110,7 +110,7 @@ static struct dentry *squashfs_fh_to_parent(struct super_block *sb, static struct dentry *squashfs_get_parent(struct dentry *child) { - struct inode *inode = child->d_inode; + struct inode *inode = d_inode(child); unsigned int parent_ino = squashfs_i(inode)->parent; return squashfs_export_iget(inode->i_sb, parent_ino); diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 92fcde7b4d61..e5e0ddf5b143 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -39,7 +39,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int); ssize_t squashfs_listxattr(struct dentry *d, char *buffer, size_t buffer_size) { - struct inode *inode = d->d_inode; + struct inode *inode = d_inode(d); struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) @@ -229,7 +229,7 @@ static int squashfs_user_get(struct dentry 
*d, const char *name, void *buffer, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name, buffer, size); } @@ -259,7 +259,7 @@ static int squashfs_trusted_get(struct dentry *d, const char *name, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name, buffer, size); } @@ -286,7 +286,7 @@ static int squashfs_security_get(struct dentry *d, const char *name, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name, buffer, size); } diff --git a/fs/stat.c b/fs/stat.c index 19636af5e75c..cccc1aab9a8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(generic_fillattr); */ int vfs_getattr_nosec(struct path *path, struct kstat *stat) { - struct inode *inode = path->dentry->d_inode; + struct inode *inode = d_backing_inode(path->dentry); if (inode->i_op->getattr) return inode->i_op->getattr(path->mnt, path->dentry, stat); @@ -326,7 +326,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, retry: error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); if (!error) { - struct inode *inode = path.dentry->d_inode; + struct inode *inode = d_backing_inode(path.dentry); error = empty ? -ENOENT : -EINVAL; if (inode->i_op->readlink) { diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index d42291d08215..8f3555f00c54 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -132,7 +132,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct inode * dir = dentry->d_parent->d_inode; + struct inode * dir = d_inode(dentry->d_parent); unsigned long start, n; unsigned long npages = dir_pages(dir); struct page *page = NULL; @@ -176,7 +176,7 @@ found: int sysv_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct page *page = NULL; diff --git a/fs/sysv/file.c b/fs/sysv/file.c index a48e30410ad1..82ddc09061e2 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -30,7 +30,7 @@ const struct file_operations sysv_file_operations = { static int sysv_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 66bc316927e8..2fde40acf024 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -443,7 +443,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct super_block *s = dentry->d_sb; - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 731b2bbcaab3..11e83ed0b4bf 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -118,7 +118,7 @@ out_fail: static int sysv_link(struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) { - struct inode *inode = 
old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -166,7 +166,7 @@ out_dir: static int sysv_unlink(struct inode * dir, struct dentry * dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct page * page; struct sysv_dir_entry * de; int err = -ENOENT; @@ -187,7 +187,7 @@ out: static int sysv_rmdir(struct inode * dir, struct dentry * dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = -ENOTEMPTY; if (sysv_empty_dir(inode)) { @@ -208,8 +208,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct sysv_dir_entry * dir_de = NULL; struct page * old_page; diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c index 00d2f8a43e4e..d3fa0d703314 100644 --- a/fs/sysv/symlink.c +++ b/fs/sysv/symlink.c @@ -10,7 +10,7 @@ static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, (char *)SYSV_I(dentry->d_inode)->i_data); + nd_set_link(nd, (char *)SYSV_I(d_inode(dentry))->i_data); return NULL; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 02d1ee778df0..27060fc855d4 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -499,7 +499,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_inode *dir_ui = ubifs_inode(dir); int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); @@ -554,7 +554,7 @@ out_cancel: static int ubifs_unlink(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_inode *dir_ui = ubifs_inode(dir); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; @@ -646,7 +646,7 @@ static int check_dir_empty(struct ubifs_info *c, struct inode *dir) static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -662,7 +662,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino, dir->i_ino); ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - err = check_dir_empty(c, dentry->d_inode); + err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -970,8 +970,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct ubifs_info *c = old_dir->i_sb->s_fs_info; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); int err, release, sync = 0, 
move = (new_dir != old_dir); int is_dir = S_ISDIR(old_inode->i_mode); @@ -1136,7 +1136,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { loff_t size; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3ba3fef64e9e..35efc103c39c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1257,7 +1257,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, int ubifs_setattr(struct dentry *dentry, struct iattr *attr) { int err; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_info *c = inode->i_sb->s_fs_info; dbg_gen("ino %lu, mode %#x, ia_valid %#x", @@ -1302,7 +1302,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); + struct ubifs_inode *ui = ubifs_inode(d_inode(dentry)); nd_set_link(nd, ui->data); return NULL; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 90ae1a8439d9..0b9da5b6e0f9 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -930,8 +930,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, union ubifs_key key; struct ubifs_dent_node *dent, *dent2; int err, dlen1, dlen2, ilen, lnum, offs, len; - const struct inode *old_inode = old_dentry->d_inode; - const struct inode *new_inode = new_dentry->d_inode; + const struct inode *old_inode = d_inode(old_dentry); + const struct inode *new_inode = d_inode(new_dentry); int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3659b1934500..96f3448b6eb4 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -364,15 +364,15 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", - name, dentry->d_inode->i_ino, dentry, size); + name, d_inode(dentry)->i_ino, dentry, size); - return setxattr(dentry->d_inode, name, value, size, flags); + return setxattr(d_inode(dentry), name, value, size, flags); } ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size) { - struct inode *inode, *host = dentry->d_inode; + struct inode *inode, *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_inode *ui; @@ -432,7 +432,7 @@ out_unlock: ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) { union ubifs_key key; - struct inode *host = dentry->d_inode; + struct inode *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_dent_node *xent, *pxent = NULL; @@ -535,7 +535,7 @@ out_cancel: int ubifs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode, *host = dentry->d_inode; + struct inode *inode, *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; diff --git a/fs/udf/file.c b/fs/udf/file.c index 5dadad9960b9..7a95b8fed302 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -249,7 +249,7 @@ const struct file_operations udf_file_operations = { 
static int udf_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 39661977c89c..5c03f0dfb98b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -551,7 +551,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; @@ -767,7 +767,7 @@ static int empty_dir(struct inode *dir) static int udf_rmdir(struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct udf_fileident_bh fibh; struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; @@ -809,7 +809,7 @@ out: static int udf_unlink(struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct udf_fileident_bh fibh; struct fileIdentDesc *fi; struct fileIdentDesc cfi; @@ -999,7 +999,7 @@ out_no_entry: static int udf_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; @@ -1038,8 +1038,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct udf_fileident_bh ofibh, nfibh; struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; struct fileIdentDesc ocfi, ncfi; @@ -1179,7 +1179,7 @@ static struct dentry *udf_get_parent(struct dentry *child) struct fileIdentDesc cfi; struct udf_fileident_bh fibh; - if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) + if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi)) return ERR_PTR(-EACCES); if (fibh.sbh != fibh.ebh) @@ -1187,7 +1187,7 @@ static struct dentry *udf_get_parent(struct dentry *child) brelse(fibh.sbh); tloc = lelb_to_cpu(cfi.icb.extLocation); - inode = udf_iget(child->d_inode->i_sb, &tloc); + inode = udf_iget(d_inode(child)->i_sb, &tloc); if (IS_ERR(inode)) return ERR_CAST(inode); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 0ecc2cebed8f..1bfe8cabff0f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -311,7 +311,7 @@ found: */ int ufs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block *sb = dir->i_sb; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index fd65deb4b5f0..e491a93a7e9a 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -165,7 +165,7 @@ out_fail: static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; lock_ufs(dir->i_sb); @@ -222,7 +222,7 @@ out_fail: static int 
ufs_unlink(struct inode *dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -244,7 +244,7 @@ out: static int ufs_rmdir (struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err= -ENOTEMPTY; lock_ufs(dir->i_sb); @@ -263,8 +263,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct ufs_dir_entry * dir_de = NULL; struct page *old_page; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 8092d3759a5e..b3bc3e7ae79d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -144,10 +144,10 @@ static struct dentry *ufs_get_parent(struct dentry *child) struct qstr dot_dot = QSTR_INIT("..", 2); ino_t ino; - ino = ufs_inode_by_name(child->d_inode, &dot_dot); + ino = ufs_inode_by_name(d_inode(child), &dot_dot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino)); } static const struct export_operations ufs_export_ops = { diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c index d283628b4778..5b537e2fdda3 100644 --- a/fs/ufs/symlink.c +++ b/fs/ufs/symlink.c @@ -34,7 +34,7 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct ufs_inode_info *p = UFS_I(dentry->d_inode); + struct ufs_inode_info *p = UFS_I(d_inode(dentry)); nd_set_link(nd, (char*)p->i_u1.i_symlink); return NULL; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index f04f89fbd4d9..21154704c168 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -492,7 +492,7 @@ out: int ufs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a6fbf4472017..516162be1398 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -260,6 +260,7 @@ xfs_alloc_fix_len( rlen = rlen - (k - args->mod); else rlen = rlen - args->prod + (args->mod - k); + /* casts to (int) catch length underflows */ if ((int)rlen < (int)args->minlen) return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); @@ -286,7 +287,8 @@ xfs_alloc_fix_minleft( if (diff >= 0) return 1; args->len += diff; /* shrink the allocated space */ - if (args->len >= args->minlen) + /* casts to (int) catch length underflows */ + if ((int)args->len >= (int)args->minlen) return 1; args->agbno = NULLAGBLOCK; return 0; @@ -315,6 +317,9 @@ xfs_alloc_fixup_trees( xfs_agblock_t nfbno2; /* second new free startblock */ xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ + struct xfs_mount *mp; + + mp = cnt_cur->bc_mp; /* * Look up the record in the by-size tree if necessary. 
@@ -323,13 +328,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Look up the record in the by-block tree if necessary. @@ -338,13 +343,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } #ifdef DEBUG @@ -355,7 +360,7 @@ xfs_alloc_fixup_trees( bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, bnoblock->bb_numrecs == cntblock->bb_numrecs); } #endif @@ -386,25 +391,25 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Fix up the by-block btree entry(s). @@ -415,7 +420,7 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } else { /* * Update the by-block entry to start later|be shorter. 
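Every conversion in this file follows the same mechanical pattern: the corruption-check macros now take the mount pointer first, so the resulting error report can name the affected filesystem instead of firing anonymously. Roughly, and paraphrasing the 4.1-era definitions in fs/xfs/xfs_error.h (exact wording may differ), the converted macros expand to:

#define XFS_WANT_CORRUPTED_GOTO(mp, x, l)	\
	{					\
		int fs_is_ok = (x);		\
		ASSERT(fs_is_ok);		\
		if (!fs_is_ok) {		\
			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO",	\
					 XFS_ERRLEVEL_LOW, mp);		\
			error = -EFSCORRUPTED;	\
			goto l;			\
		}				\
	}

#define XFS_WANT_CORRUPTED_RETURN(mp, x)	\
	{					\
		int fs_is_ok = (x);		\
		ASSERT(fs_is_ok);		\
		if (!fs_is_ok) {		\
			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN",	\
					 XFS_ERRLEVEL_LOW, mp);		\
			return -EFSCORRUPTED;	\
		}				\
	}

The goto form assumes a local error variable and a label in scope, which is why every call site already has an error0:/done: target, and why hunks like the one above first hoist a struct xfs_mount pointer out of cnt_cur->bc_mp or args->mp.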
@@ -429,10 +434,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } return 0; } @@ -682,7 +687,7 @@ xfs_alloc_ag_vextent_exact( error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(fbno <= args->agbno); /* @@ -783,7 +788,7 @@ xfs_alloc_find_best_extent( error = xfs_alloc_get_rec(*scur, sbno, slen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); /* @@ -946,7 +951,7 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (ltlen >= args->minlen) break; if ((error = xfs_btree_increment(cnt_cur, 0, &i))) @@ -966,7 +971,7 @@ restart: */ if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, &ltbnoa, &ltlena); if (ltlena < args->minlen) @@ -999,7 +1004,7 @@ restart: cnt_cur->bc_ptrs[0] = besti; if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; if (!xfs_alloc_fix_minleft(args)) { @@ -1088,7 +1093,7 @@ restart: if (bno_cur_lt) { if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, &ltbnoa, &ltlena); if (ltlena >= args->minlen) @@ -1104,7 +1109,7 @@ restart: if (bno_cur_gt) { if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, &gtbnoa, &gtlena); if (gtlena >= args->minlen) @@ -1303,7 +1308,7 @@ restart: error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); @@ -1342,7 +1347,7 @@ restart: * This can't happen in the second case above. 
*/ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen < args->maxlen) { xfs_agblock_t bestfbno; @@ -1362,13 +1367,13 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (flen < bestrlen) break; xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen > bestrlen) { @@ -1383,7 +1388,7 @@ restart: if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); rlen = bestrlen; rbno = bestrbno; flen = bestflen; @@ -1408,7 +1413,7 @@ restart: if (!xfs_alloc_fix_minleft(args)) goto out_nominleft; rlen = args->len; - XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* * Allocate and initialize a cursor for the by-block tree. */ @@ -1422,7 +1427,7 @@ restart: cnt_cur = bno_cur = NULL; args->len = rlen; args->agbno = rbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1467,7 +1472,7 @@ xfs_alloc_ag_vextent_small( if (i) { if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); } /* * Nothing in the btree, try the freelist. Make sure @@ -1493,7 +1498,7 @@ xfs_alloc_ag_vextent_small( } args->len = 1; args->agbno = fbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1579,7 +1584,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1591,7 +1596,8 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + XFS_WANT_CORRUPTED_GOTO(mp, + ltbno + ltlen <= bno, error0); } } /* @@ -1606,7 +1612,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1618,7 +1624,7 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); } } /* @@ -1635,31 +1641,31 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-size entry on the right. 
*/ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-block entry for the right block. */ if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Move the by-block cursor back to the left neighbor. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); #ifdef DEBUG /* * Check that this is the right record: delete didn't @@ -1672,7 +1678,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(mp, i == 1 && xxbno == ltbno && xxlen == ltlen, error0); } @@ -1695,17 +1701,17 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Back up the by-block cursor to the left neighbor, and * update its length. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); nbno = ltbno; nlen = len + ltlen; if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) @@ -1721,10 +1727,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Update the starting block and length of the right * neighbor in the by-block tree. @@ -1743,7 +1749,7 @@ xfs_free_ag_extent( nlen = len; if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; @@ -1752,10 +1758,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 15105dbc9e28..04e79d57bca6 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +/* + * attr3 block 'firstused' conversion helpers. + * + * firstused refers to the offset of the first used byte of the nameval region + * of an attr leaf block. The region starts at the tail of the block and expands + * backwards towards the middle. 
As such, firstused is initialized to the block + * size for an empty leaf block and is reduced from there. + * + * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k. + * The in-core firstused field is 32-bit and thus supports the maximum fsb size. + * The on-disk field is only 16-bit, however, and overflows at 64k. Since this + * only occurs at exactly 64k, we use zero as a magic on-disk value to represent + * the attr block size. The following helpers manage the conversion between the + * in-core and on-disk formats. + */ + +static void +xfs_attr3_leaf_firstused_from_disk( + struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + hdr3 = (struct xfs_attr3_leaf_hdr *) from; + to->firstused = be16_to_cpu(hdr3->firstused); + } else { + to->firstused = be16_to_cpu(from->hdr.firstused); + } + + /* + * Convert from the magic fsb size value to actual blocksize. This + * should only occur for empty blocks when the block size overflows + * 16-bits. + */ + if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) { + ASSERT(!to->count && !to->usedbytes); + ASSERT(geo->blksize > USHRT_MAX); + to->firstused = geo->blksize; + } +} + +static void +xfs_attr3_leaf_firstused_to_disk( + struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + uint32_t firstused; + + /* magic value should only be seen on disk */ + ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF); + + /* + * Scale down the 32-bit in-core firstused value to the 16-bit on-disk + * value. This only overflows at the max supported value of 64k. Use the + * magic on-disk value to represent block size in this case. 
+ */ + firstused = from->firstused; + if (firstused > USHRT_MAX) { + ASSERT(from->firstused == geo->blksize); + firstused = XFS_ATTR3_LEAF_NULLOFF; + } + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + hdr3 = (struct xfs_attr3_leaf_hdr *) to; + hdr3->firstused = cpu_to_be16(firstused); + } else { + to->hdr.firstused = cpu_to_be16(firstused); + } +} + void xfs_attr3_leaf_hdr_from_disk( + struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from) { @@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(hdr3->info.hdr.magic); to->count = be16_to_cpu(hdr3->count); to->usedbytes = be16_to_cpu(hdr3->usedbytes); - to->firstused = be16_to_cpu(hdr3->firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = hdr3->holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(from->hdr.info.magic); to->count = be16_to_cpu(from->hdr.count); to->usedbytes = be16_to_cpu(from->hdr.usedbytes); - to->firstused = be16_to_cpu(from->hdr.firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = from->hdr.holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk( void xfs_attr3_leaf_hdr_to_disk( + struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from) { - int i; + int i; ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || from->magic == XFS_ATTR3_LEAF_MAGIC); @@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk( hdr3->info.hdr.magic = cpu_to_be16(from->magic); hdr3->count = cpu_to_be16(from->count); hdr3->usedbytes = cpu_to_be16(from->usedbytes); - hdr3->firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); hdr3->holes = from->holes; hdr3->pad1 = 0; @@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk( to->hdr.info.magic = cpu_to_be16(from->magic); to->hdr.count = cpu_to_be16(from->count); to->hdr.usedbytes = cpu_to_be16(from->usedbytes); - to->hdr.firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); to->hdr.holes = from->holes; to->hdr.pad1 = 0; @@ -178,7 +254,7 @@ xfs_attr3_leaf_verify( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr3_icleaf_hdr ichdr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; @@ -757,9 +833,10 @@ xfs_attr_shortform_allfit( struct xfs_attr3_icleaf_hdr leafhdr; int bytes; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); bytes = sizeof(struct xfs_attr_sf_hdr); @@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform( memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? 
*/ @@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node( btree = dp->d_ops->node_tree_p(node); leaf = bp2->b_addr; - xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); /* both on-disk, don't endian-flip twice */ @@ -988,7 +1065,7 @@ xfs_attr3_leaf_create( } ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; @@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add( trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args, NULL); @@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add( tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact( ichdr_dst->freemap[0].base; /* write the header back to initialise the underlying buffer */ - xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, @@ -1344,9 +1421,10 @@ xfs_attr_leaf_order( { struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_mount *mp = leaf1_bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); } @@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance( ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2); ASSERT(ichdr2.count == 0); args = state->args; @@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance( ichdr1.count, count); } - xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); - xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2); xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); @@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall( */ blk = &state->path.blk[ state->path.active-1 ]; leaf = blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf); bytes = xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + ichdr.usedbytes; @@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall( if (error) return error; - xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr); bytes = state->args->geo->blksize - (state->args->geo->blksize >> 2) - @@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove( trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - 
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); ASSERT(args->index >= 0 && args->index < ichdr.count); @@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove( tmp = be16_to_cpu(entry->nameidx); } ichdr.firstused = tmp; - if (!ichdr.firstused) - ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; + ASSERT(ichdr.firstused != 0); } else { ichdr.holes = 1; /* mark as needing compaction */ } - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance( drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); - xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); /* @@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance( tmphdr.firstused = state->args->geo->blksize; /* write the header to the temp buffer to initialise it */ - xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr); if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { @@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance( kmem_free(tmp_leaf); } - xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->args->geo->blksize - 1); @@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int( trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); ASSERT(ichdr.count < args->geo->blksize / 8); @@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue( int valuelen; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count < args->geo->blksize / 8); ASSERT(args->index < ichdr.count); @@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash( { struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; + struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr); entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) *count = ichdr.count; @@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag( ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); @@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag( leaf = bp->b_addr; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); #endif @@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags( entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1); ASSERT(args->index < ichdr1.count); ASSERT(args->index >= 0); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, 
leaf2); ASSERT(args->index2 < ichdr2.count); ASSERT(args->index2 >= 0); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index e2929da7c3ba..025c4b820c03 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, +void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); -void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, +void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 61ec015dca16..aeffeaaac0ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset( } } -/* - * Debug/sanity checking code - */ - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && - block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) - return 0; - - if (be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - - return 1; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents( goto error_norelse; } block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; @@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents( xfs_check_block(block, mp, 0, 0); pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree( if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; /* must be at least one entry */ - XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { @@ -1311,14 +1285,12 @@ xfs_bmap_read_extents( if (error) return error; block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); } /* @@ -1345,9 +1317,6 @@ xfs_bmap_read_extents( XFS_ERRLEVEL_LOW, ip->i_mount, block); goto error0; } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); /* * Read-ahead the next leaf block, if any. */ @@ -1755,7 +1724,9 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + struct xfs_mount *mp; + mp = bma->tp ? 
bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); ASSERT(bma->idx >= 0); @@ -1866,15 +1837,15 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1907,7 +1878,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1938,7 +1909,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + @@ -1968,12 +1939,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2001,7 +1972,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -2038,12 +2009,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2084,7 +2055,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -2122,12 +2093,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2191,12 +2162,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2212,9 +2183,8 @@ xfs_bmap_add_extent_delay_real( diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (bma->cur ? 
bma->cur->bc_private.b.allocated : 0)); if (diff > 0) { - error = xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0); + error = xfs_mod_fdblocks(bma->ip->i_mount, + -((int64_t)diff), false); ASSERT(!error); if (error) goto done; @@ -2265,9 +2235,8 @@ xfs_bmap_add_extent_delay_real( temp += bma->cur->bc_private.b.allocated; ASSERT(temp <= da_old); if (temp < da_old) - xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - (int64_t)(da_old - temp), 0); + xfs_mod_fdblocks(bma->ip->i_mount, + (int64_t)(da_old - temp), false); } /* clear out the allocated field, done with it now in any case. */ @@ -2309,6 +2278,7 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + struct xfs_mount *mp = tp->t_mountp; *logflagsp = 0; @@ -2421,19 +2391,19 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + @@ -2464,13 +2434,13 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount, @@ -2499,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, @@ -2532,7 +2502,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount, newext))) @@ -2569,7 +2539,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, 
PREV.br_startblock + new->br_blockcount, @@ -2611,7 +2581,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -2621,7 +2591,7 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2651,7 +2621,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2689,7 +2659,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2699,11 +2669,11 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2737,7 +2707,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* new right extent - oldext */ if ((error = xfs_bmbt_update(cur, r[1].br_startoff, r[1].br_startblock, r[1].br_blockcount, @@ -2749,7 +2719,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startoff - PREV.br_startoff; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -2759,12 +2729,12 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); /* new middle extent - newext */ cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2944,8 +2914,8 @@ xfs_bmap_add_extent_hole_delay( } if (oldlen != newlen) { ASSERT(oldlen > newlen); - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(oldlen - newlen), 0); + xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), + false); /* * Nothing to do for disk quota accounting here. */ @@ -2968,7 +2938,9 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ + struct xfs_mount *mp; + mp = bma->tp ? 
bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -3056,15 +3028,15 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3097,7 +3069,7 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3131,7 +3103,7 @@ xfs_bmap_add_extent_hole_real( right.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -3161,12 +3133,12 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = new->br_state; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; } @@ -4160,18 +4132,15 @@ xfs_bmapi_reserve_delalloc( ASSERT(indlen > 0); if (rt) { - error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); + error = xfs_mod_frextents(mp, -((int64_t)extsz)); } else { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); } if (error) goto out_unreserve_quota; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); if (error) goto out_unreserve_blocks; @@ -4198,9 +4167,9 @@ xfs_bmapi_reserve_delalloc( out_unreserve_blocks: if (rt) - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + xfs_mod_frextents(mp, extsz); else - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); + xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? @@ -4801,7 +4770,7 @@ xfs_bmap_del_extent( got.br_startblock, got.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } da_old = da_new = 0; } else { @@ -4835,7 +4804,7 @@ xfs_bmap_del_extent( } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); break; case 2: @@ -4935,7 +4904,8 @@ xfs_bmap_del_extent( got.br_startblock, temp, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1, done); /* * Update the btree record back * to the original value. @@ -4956,7 +4926,7 @@ xfs_bmap_del_extent( error = -ENOSPC; goto done; } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } else flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -5012,10 +4982,8 @@ xfs_bmap_del_extent( * Nothing to do for disk quota accounting here. 
*/ ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); done: *logflagsp = flags; return error; @@ -5284,14 +5252,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)rtexts, 0); + xfs_mod_frextents(mp, (int64_t)rtexts); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)del.br_blockcount, 0); + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, + false); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5453,6 +5420,7 @@ xfs_bmse_merge( struct xfs_bmbt_irec left; xfs_filblks_t blockcount; int error, i; + struct xfs_mount *mp = ip->i_mount; xfs_bmbt_get_all(gotp, &got); xfs_bmbt_get_all(leftp, &left); @@ -5487,19 +5455,19 @@ xfs_bmse_merge( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); error = xfs_btree_delete(cur, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); left.br_blockcount = blockcount; @@ -5518,50 +5486,92 @@ xfs_bmse_shift_one( int *current_ext, struct xfs_bmbt_rec_host *gotp, struct xfs_btree_cur *cur, - int *logflags) + int *logflags, + enum shift_direction direction) { struct xfs_ifork *ifp; + struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *leftp; + struct xfs_bmbt_rec_host *adj_irecp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_bmbt_irec adj_irec; int error; int i; + int total_extents; + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(gotp, &got); - startoff = got.br_startoff - offset_shift_fsb; /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); - /* - * Check for merge if we've got an extent to the left, otherwise make - * sure there's enough room at the start of the file for the shift. - */ - if (*current_ext) { - /* grab the left extent and check for a large enough hole */ - leftp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(leftp, &left); + if (direction == SHIFT_LEFT) { + startoff = got.br_startoff - offset_shift_fsb; + + /* + * Check for merge if we've got an extent to the left, + * otherwise make sure there's enough room at the start + * of the file for the shift. + */ + if (!*current_ext) { + if (got.br_startoff < offset_shift_fsb) + return -EINVAL; + goto update_current_ext; + } + /* + * grab the left extent and check for a large + * enough hole. 
+ */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); - if (startoff < left.br_startoff + left.br_blockcount) + if (startoff < + adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(&adj_irec, &got, + offset_shift_fsb)) { return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, leftp, cur, - logflags); + *current_ext, gotp, adj_irecp, + cur, logflags); } - } else if (got.br_startoff < offset_shift_fsb) - return -EINVAL; - + } else { + startoff = got.br_startoff + offset_shift_fsb; + /* nothing to move if this is the last extent */ + if (*current_ext >= (total_extents - 1)) + goto update_current_ext; + /* + * If this is not the last extent in the file, make sure there + * is enough room between current extent and next extent for + * accommodating the shift. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); + if (startoff + got.br_blockcount > adj_irec.br_startoff) + return -EINVAL; + /* + * Unlike a left shift (which involves a hole punch), + * a right shift does not modify extent neighbors + * in any way. We should never find mergeable extents + * in this scenario. Check anyways and warn if we + * encounter two extents that could be one. + */ + if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + WARN_ON_ONCE(1); + } /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ - (*current_ext)++; +update_current_ext: + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; if (!cur) { @@ -5573,18 +5583,18 @@ xfs_bmse_shift_one( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); got.br_startoff = startoff; return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); + got.br_blockcount, got.br_state); } /* - * Shift extent records to the left to cover a hole. + * Shift extent records to the left/right to cover/create a hole. * * The maximum number of extents to be shifted in a single operation is - * @num_exts. @start_fsb specifies the file offset to start the shift and the + * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb * is the length by which each extent is shifted. 
If there is no hole to shift + * the extents into, this will be considered an invalid operation and we abort @@ -5594,12 +5604,13 @@ int xfs_bmap_shift_extents( struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, int *done, - xfs_fileoff_t *next_fsb, + xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + enum shift_direction direction, int num_exts) { struct xfs_btree_cur *cur = NULL; @@ -5609,10 +5620,11 @@ xfs_bmap_shift_extents( struct xfs_ifork *ifp; xfs_extnum_t nexts = 0; xfs_extnum_t current_ext; + xfs_extnum_t total_extents; + xfs_extnum_t stop_extent; int error = 0; int whichfork = XFS_DATA_FORK; int logflags = 0; - int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5628,6 +5640,8 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -5645,43 +5659,83 @@ } /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot use the count of real extents here. + * Instead we have to calculate it from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (total_extents == 0) { + *done = 1; + goto del_cursor; + } + + /* + * In case of first right shift, we need to initialize next_fsb + */ + if (*next_fsb == NULLFSBLOCK) { + gotp = xfs_iext_get_ext(ifp, total_extents - 1); + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + if (stop_fsb > *next_fsb) { + *done = 1; + goto del_cursor; + } + } + + /* Lookup the extent index at which we have to stop */ + if (direction == SHIFT_RIGHT) { + gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + /* Make stop_extent exclusive of shift range */ + stop_extent--; + } else + stop_extent = total_extents; + + /* * Look up the extent index for the fsb where we start shifting. We can * henceforth iterate with current_ext as extent list changes are locked * out via ilock. * * gotp can be null in 2 cases: 1) if there are no extents or 2) - * start_fsb lies in a hole beyond which there are no extents. Either + * *next_fsb lies in a hole beyond which there are no extents. Either * way, we are done. */ - gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext); + gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext); if (!gotp) { *done = 1; goto del_cursor; } - /* - * There may be delalloc extents in the data fork before the range we - * are collapsing out, so we cannot use the count of real extents here. - * Instead we have to calculate it from the incore fork. - */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - while (nexts++ < num_exts && current_ext < total_extents) { + /* some sanity checking before we finally start shifting extents */ + if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || + (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { + error = -EIO; + goto del_cursor; + } + + while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - &current_ext, gotp, cur, &logflags); + &current_ext, gotp, cur, &logflags, + direction); if (error) goto del_cursor; + /* + * If there was an extent merge during the shift, the extent + * count can change. 
Update the total and grade the next record. + */ + if (direction == SHIFT_LEFT) { + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + stop_extent = total_extents; + } - /* update total extent count and grab the next record */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (current_ext >= total_extents) + if (current_ext == stop_extent) { + *done = 1; + *next_fsb = NULLFSBLOCK; break; + } gotp = xfs_iext_get_ext(ifp, current_ext); } - /* Check if we are done */ - if (current_ext == total_extents) { - *done = 1; - } else if (next_fsb) { + if (!*done) { xfs_bmbt_get_all(gotp, &got); *next_fsb = got.br_startoff; } @@ -5696,3 +5750,189 @@ del_cursor: return error; } + +/* + * Splits an extent into two extents at split_fsb block such that it is + * the first block of the current_ext. @current_ext is a target extent + * to be split. @split_fsb is a block where the extents is split. + * If split_fsb lies in a hole or the first block of extents, just return 0. + */ +STATIC int +xfs_bmap_split_extent_at( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t split_fsb, + xfs_fsblock_t *firstfsb, + struct xfs_bmap_free *free_list) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec new; /* split extent */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_fsblock_t gotblkcnt; /* new block count for got */ + xfs_extnum_t current_ext; + int error = 0; + int logflags = 0; + int i = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_split_extent_at", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) split_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); + if (!gotp) + return 0; + + xfs_bmbt_get_all(gotp, &got); + + /* + * Check split_fsb lies in a hole or the start boundary offset + * of the extent. 
+ */ + if (got.br_startoff >= split_fsb) + return 0; + + gotblkcnt = split_fsb - got.br_startoff; + new.br_startoff = split_fsb; + new.br_startblock = got.br_startblock + gotblkcnt; + new.br_blockcount = got.br_blockcount - gotblkcnt; + new.br_state = got.br_state; + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstfsb; + cur->bc_private.b.flist = free_list; + cur->bc_private.b.flags = 0; + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, gotblkcnt); + got.br_blockcount = gotblkcnt; + + logflags = XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } else + logflags |= XFS_ILOG_DEXT; + + /* Add new extent */ + current_ext++; + xfs_iext_insert(ip, current_ext, 1, &new, 0); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + cur->bc_rec.b.br_state = new.br_state; + + error = xfs_btree_insert(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + /* + * Convert to a btree if necessary. + */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + } + +del_cursor: + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_split_extent( + struct xfs_inode *ip, + xfs_fileoff_t split_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + int committed; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &firstfsb); + + error = xfs_bmap_split_extent_at(tp, ip, split_fsb, + &firstfsb, &free_list); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b9d8a499d2c4..6aaa0c1c7200 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) */ #define XFS_BMAP_MAX_SHIFT_EXTENTS 1 +enum shift_direction { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + #ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); @@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int num_exts); + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, + int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, enum shift_direction direction, + int num_exts); +int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 81cad433df85..c72283dd8d44 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -168,7 +168,7 @@ xfs_btree_check_lptr( xfs_fsblock_t bno, /* btree block disk address */ int level) /* btree block level */ { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLFSBLOCK && XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); @@ -187,7 +187,7 @@ xfs_btree_check_sptr( { xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLAGBLOCK && bno != 0 && @@ -1825,7 +1825,7 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; @@ -2285,7 +2285,7 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) @@ -3138,7 +3138,7 @@ xfs_btree_insert( goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); level++; 
/* @@ -3582,15 +3582,15 @@ xfs_btree_delrec( * Actually any entry but the first would suffice. */ i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); @@ -3634,12 +3634,12 @@ xfs_btree_delrec( rrecs = xfs_btree_get_numrecs(right); if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); } } @@ -3653,13 +3653,13 @@ xfs_btree_delrec( * previous block. */ i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ left = xfs_btree_get_block(tcur, level, &lbp); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9cb0115c6bd1..2385f8cd08ab 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -538,12 +538,12 @@ xfs_da3_root_split( oldroot = blk1->bp->b_addr; if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { - struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da3_icnode_hdr icnodehdr; - dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); + dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); btree = dp->d_ops->node_tree_p(oldroot); - size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); - level = nodehdr.level; + size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); + level = icnodehdr.level; /* * we are about to copy oldroot to bp, so set up the type diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 0a49b0286372..74bcbabfa523 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr { __uint16_t magic; __uint16_t count; __uint16_t usedbytes; - __uint16_t firstused; + /* + * firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + __uint32_t firstused; __u8 holes; struct { __uint16_t base; @@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr { }; /* + * Special value to represent fs block size in the leaf header firstused field. + * Only used when block size overflows the 2-bytes available on disk. + */ +#define XFS_ATTR3_LEAF_NULLOFF 0 + +/* * Flags used in the leaf_entry[i].flags field. * NOTE: the INCOMPLETE bit must not collide with the flags bits specified * on the system call, they are "or"ed together for various operations. 
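
The only value that can overflow the on-disk 16-bit firstused field is a 64k block size, and then only when firstused equals the block size, which is what the XFS_ATTR3_LEAF_NULLOFF sentinel above encodes. A standalone sketch of the to/from-disk conversion the comment describes; the helper names below are illustrative, not taken from the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define XFS_ATTR3_LEAF_NULLOFF 0    /* sentinel: firstused == block size */

/* to-disk: a 64k block size cannot be represented in 16 bits, so store
 * the sentinel instead (hypothetical helper names). */
static uint16_t firstused_to_disk(uint32_t firstused, uint32_t blocksize)
{
    if (firstused == blocksize && blocksize > UINT16_MAX)
        return XFS_ATTR3_LEAF_NULLOFF;
    assert(firstused <= UINT16_MAX);
    return (uint16_t)firstused;
}

static uint32_t firstused_from_disk(uint16_t ondisk, uint32_t blocksize)
{
    if (ondisk == XFS_ATTR3_LEAF_NULLOFF)
        return blocksize;
    return ondisk;
}

int main(void)
{
    uint32_t bsize = 65536;     /* 64k fsb, the overflow case */
    uint16_t d = firstused_to_disk(bsize, bsize);
    printf("disk=%u incore=%u\n", (unsigned)d,
           firstused_from_disk(d, bsize));
    return 0;
}
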
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 5ff31be9b1cd..de1ea16f5748 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -89,7 +89,7 @@ __xfs_dir3_data_check( * so just ensure that the count falls somewhere inside the * block right now. */ - XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) < ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): @@ -107,21 +107,21 @@ __xfs_dir3_data_check( bf = ops->data_bestfree_p(hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset); freeseen |= 1 << 2; } - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. @@ -134,18 +134,18 @@ __xfs_dir3_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - XFS_WANT_CORRUPTED_RETURN(lastfree == 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(dup->length) <= be16_to_cpu(bf[2].length)); } @@ -160,13 +160,13 @@ __xfs_dir3_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; @@ -183,14 +183,15 @@ __xfs_dir3_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(mp, + i < be32_to_cpu(btp->count)); } p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. 
*/ - XFS_WANT_CORRUPTED_RETURN(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { @@ -198,13 +199,13 @@ __xfs_dir3_data_check( cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); } - XFS_WANT_CORRUPTED_RETURN(count == + XFS_WANT_CORRUPTED_RETURN(mp, count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale)); } return 0; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 8eb718979383..4daaa662337b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -264,68 +264,6 @@ typedef struct xfs_dsb { /* must be padded to 64 bit alignment */ } xfs_dsb_t; -/* - * Sequence number values for the fields. - */ -typedef enum { - XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, - XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, - XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, - XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, - XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, - XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, - XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, - XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, - XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, - XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, - XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, - XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, - XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, - XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, - XFS_SBS_PQUOTINO, XFS_SBS_LSN, - XFS_SBS_FIELDCOUNT -} xfs_sb_field_t; - -/* - * Mask values, defined based on the xfs_sb_field_t values. - * Only define the ones we're using. 
- */ -#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) -#define XFS_SB_UUID XFS_SB_MVAL(UUID) -#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) -#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) -#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) -#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) -#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) -#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) -#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) -#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) -#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) -#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) -#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) -#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) -#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) -#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) -#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \ - XFS_SB_MVAL(BAD_FEATURES2)) -#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) -#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) -#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) -#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) -#define XFS_SB_CRC XFS_SB_MVAL(CRC) -#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) -#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) -#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) -#define XFS_SB_MOD_BITS \ - (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ - XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ - XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \ - XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \ - XFS_SB_PQUOTINO) - /* * Misc. Flags - warning - these will be cleared by xfs_repair unless diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 116ef1ddb3e3..07349a183a11 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -376,7 +376,8 @@ xfs_ialloc_ag_alloc( */ newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && - args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) + percpu_counter_read(&args.mp->m_icount) + newlen > + args.mp->m_maxicount) return -ENOSPC; args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* @@ -700,7 +701,7 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; @@ -724,7 +725,7 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; @@ -783,12 +784,12 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(j == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0); if (rec.ir_freecount > 0) { /* @@ -944,19 +945,19 @@ newino: error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 
1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } alloc_inode: @@ -1016,7 +1017,7 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1); /* * See if we've landed in the parent inode record. The finobt @@ -1039,10 +1040,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; - XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur); } - XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur); if (i == 1 && j == 1) { /* * Both the left and right records are valid. Choose the closer @@ -1095,7 +1096,7 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } } @@ -1106,12 +1107,12 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } @@ -1133,19 +1134,19 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) && (rec.ir_freecount == frec->ir_freecount)); return xfs_inobt_update(cur, &rec); @@ -1340,7 +1341,8 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { + percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos > + mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1475,14 +1477,14 @@ xfs_difree_inobt( __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Get the offset in the inode chunk. */ @@ -1592,7 +1594,7 @@ xfs_difree_finobt( * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. 
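
The XFS_WANT_CORRUPTED_RETURN/GOTO conversions running through these hunks are mechanical: every call site now passes the mount so the corruption report can identify the filesystem instead of passing NULL. A minimal userspace model of the macro's shape, simplified from the real definition (which also asserts and annotates the failure branch unlikely()):

#include <stdio.h>

struct xfs_mount { const char *m_fsname; };

#define EFSCORRUPTED 117    /* illustrative errno value */

#define XFS_WANT_CORRUPTED_RETURN(mp, x)                        \
do {                                                            \
    if (!(x)) {                                                 \
        fprintf(stderr, "XFS (%s): corruption detected\n",      \
                (mp)->m_fsname);                                \
        return -EFSCORRUPTED;                                   \
    }                                                           \
} while (0)

static int check(struct xfs_mount *mp, int i)
{
    XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
    return 0;
}

int main(void)
{
    struct xfs_mount m = { "sda1" };
    printf("%d\n", check(&m, 0));   /* reports, then returns -117 */
    return 0;
}
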
*/ - XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, ibtrec->ir_free, &i); @@ -1613,12 +1615,12 @@ xfs_difree_finobt( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; - XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) && (rec.ir_freecount == ibtrec->ir_freecount), error); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b0a5fe95a3e2..dc4bfc5d88fc 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -111,14 +111,6 @@ xfs_mount_validate_sb( bool check_inprogress, bool check_version) { - - /* - * If the log device and data device have the - * same device number, the log is internal. - * Consequently, the sb_logstart should be non-zero. If - * we have a zero sb_logstart in this case, we may be trying to mount - * a volume filesystem in a non-volume manner. - */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; @@ -743,17 +735,15 @@ xfs_initialize_perag_data( btree += pag->pagf_btreeblks; xfs_perag_put(pag); } - /* - * Overwrite incore superblock counters with just-read data - */ + + /* Overwrite incore superblock counters with just-read data */ spin_lock(&mp->m_sb_lock); sbp->sb_ifree = ifree; sbp->sb_icount = ialloc; sbp->sb_fdblocks = bfree + bfreelst + btree; spin_unlock(&mp->m_sb_lock); - /* Fixup the per-cpu counters as well. */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); return 0; } @@ -771,6 +761,10 @@ xfs_log_sb( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1d8eef9cf0f5..a56960dd1684 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1232,6 +1232,117 @@ xfs_vm_releasepage( return try_to_free_buffers(page); } +/* + * When we map a DIO buffer, we may need to attach an ioend that describes the + * type of write IO we are doing. This passes to the completion function the + * operations it needs to perform. If the mapping is for an overwrite wholly + * within the EOF then we don't need an ioend and so we don't allocate one. + * This avoids the unnecessary overhead of allocating and freeing ioends for + * workloads that don't require transactions on IO completion. + * + * If we get multiple mappings in a single IO, we might be mapping different + * types. 
But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. Hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+	struct inode		*inode,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	struct xfs_ioend	*ioend;
+	xfs_off_t		size = bh_result->b_size;
+	int			type;
+
+	if (ISUNWRITTEN(imap))
+		type = XFS_IO_UNWRITTEN;
+	else
+		type = XFS_IO_OVERWRITE;
+
+	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+	if (bh_result->b_private) {
+		ioend = bh_result->b_private;
+		ASSERT(ioend->io_size > 0);
+		ASSERT(offset >= ioend->io_offset);
+		if (offset + size > ioend->io_offset + ioend->io_size)
+			ioend->io_size = offset - ioend->io_offset + size;
+
+		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+			ioend->io_type = XFS_IO_UNWRITTEN;
+
+		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+					      ioend->io_size, ioend->io_type,
+					      imap);
+	} else if (type == XFS_IO_UNWRITTEN ||
+		   offset + size > i_size_read(inode)) {
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_size = size;
+
+		bh_result->b_private = ioend;
+		set_buffer_defer_completion(bh_result);
+
+		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+					   imap);
+	} else {
+		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+					    imap);
+	}
+}
+
+/*
+ * If this is O_DIRECT or the mpage code calling, tell them how large the
+ * mapping is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the
+ * mapping for blocks beyond EOF must be marked new so that sub block regions
+ * can be correctly zeroed. We can't do this for mappings within EOF unless
+ * the mapping was just allocated or is unwritten, otherwise the callers would
+ * overwrite existing data with zeros. Hence we have to split the mapping into
+ * a range up to and including EOF, and a second mapping for beyond EOF.
+ */ +static void +xfs_map_trim_size( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset, + ssize_t size) +{ + xfs_off_t mapping_size; + + mapping_size = imap->br_startoff + imap->br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; +} + STATIC int __xfs_get_blocks( struct inode *inode, @@ -1320,31 +1431,37 @@ __xfs_get_blocks( xfs_iunlock(ip, lockmode); } - - trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_alloc(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_DELALLOC, &imap); } else if (nimaps) { - trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_found(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_OVERWRITE, &imap); xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } + /* trim mapping down to size requested */ + if (direct || size > (1 << inode->i_blkbits)) + xfs_map_trim_size(inode, iblock, bh_result, + &imap, offset, size); + + /* + * For unwritten extents do not report a disk address in the buffered + * read case (treat as if we're reading into a hole). + */ if (imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK) { - /* - * For unwritten extents do not report a disk address on - * the read case (treat as if we're reading into a hole). - */ - if (create || !ISUNWRITTEN(&imap)) - xfs_map_buffer(inode, bh_result, &imap, offset); - if (create && ISUNWRITTEN(&imap)) { - if (direct) { - bh_result->b_private = inode; - set_buffer_defer_completion(bh_result); - } + imap.br_startblock != DELAYSTARTBLOCK && + (create || !ISUNWRITTEN(&imap))) { + xfs_map_buffer(inode, bh_result, &imap, offset); + if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); - } + /* direct IO needs special help */ + if (create && direct) + xfs_map_direct(inode, bh_result, &imap, offset); } /* @@ -1377,39 +1494,6 @@ __xfs_get_blocks( } } - /* - * If this is O_DIRECT or the mpage code calling tell them how large - * the mapping is, so that we can avoid repeated get_blocks calls. - * - * If the mapping spans EOF, then we have to break the mapping up as the - * mapping for blocks beyond EOF must be marked new so that sub block - * regions can be correctly zeroed. We can't do this for mappings within - * EOF unless the mapping was just allocated or is unwritten, otherwise - * the callers would overwrite existing data with zeros. Hence we have - * to split the mapping into a range up to and including EOF, and a - * second mapping for beyond EOF. 
- */ - if (direct || size > (1 << inode->i_blkbits)) { - xfs_off_t mapping_size; - - mapping_size = imap.br_startoff + imap.br_blockcount - iblock; - mapping_size <<= inode->i_blkbits; - - ASSERT(mapping_size > 0); - if (mapping_size > size) - mapping_size = size; - if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { - /* limit mapping to block that spans EOF */ - mapping_size = roundup_64(i_size_read(inode) - offset, - 1 << inode->i_blkbits); - } - if (mapping_size > LONG_MAX) - mapping_size = LONG_MAX; - - bh_result->b_size = mapping_size; - } - return 0; out_unlock: @@ -1440,9 +1524,11 @@ xfs_get_blocks_direct( /* * Complete a direct I/O write request. * - * If the private argument is non-NULL __xfs_get_blocks signals us that we - * need to issue a transaction to convert the range from unwritten to written - * extents. + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). */ STATIC void xfs_end_io_direct_write( @@ -1454,43 +1540,71 @@ xfs_end_io_direct_write( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + struct xfs_ioend *ioend = private; - if (XFS_FORCED_SHUTDOWN(mp)) + trace_xfs_gbmap_direct_endio(ip, offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); return; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_end_io; /* - * While the generic direct I/O code updates the inode size, it does - * so only after the end_io handler is called, which means our - * end_io handler thinks the on-disk size is outside the in-core - * size. To prevent this just update it a little bit earlier here. + * dio completion end_io functions are only called on writes if more + * than 0 bytes was written. */ + ASSERT(size > 0); + + /* + * The ioend only maps whole blocks, while the IO may be sector aligned. + * Hence the ioend offset/size may not match the IO offset/size exactly. + * Because we don't map overwrites within EOF into the ioend, the offset + * may not match, but only if the endio spans EOF. Either way, write + * the IO sizes into the ioend so that completion processing does the + * right thing. + */ + ASSERT(offset + size <= ioend->io_offset + ioend->io_size); + ioend->io_size = size; + ioend->io_offset = offset; + + /* + * The ioend tells us whether we are doing unwritten extent conversion + * or an append transaction that updates the on-disk file size. These + * cases are the only cases where we should *potentially* be needing + * to update the VFS inode size. + * + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. 
+ */ + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); /* - * For direct I/O we do not know if we need to allocate blocks or not, - * so we can't preallocate an append transaction, as that results in - * nested reservations and log space deadlocks. Hence allocate the - * transaction here. While this is sub-optimal and can block IO - * completion for some time, we're stuck with doing it this way until - * we can pass the ioend to the direct IO allocation callbacks and - * avoid nesting that way. + * If we are doing an append IO that needs to update the EOF on disk, + * do the transaction reserve now so we can use common end io + * processing. Stashing the error (if there is one) in the ioend will + * result in the ioend processing passing on the error if it is + * possible as we can't return it from here. */ - if (private && size > 0) { - xfs_iomap_write_unwritten(ip, offset, size); - } else if (offset + size > ip->i_d.di_size) { - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return; - } + if (ioend->io_type == XFS_IO_OVERWRITE) + ioend->io_error = xfs_setfilesize_trans_alloc(ioend); - xfs_setfilesize(ip, tp, offset, size); - } +out_end_io: + xfs_end_io(&ioend->io_work); + return; } STATIC ssize_t diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 83af4c149635..f9c1c64782d3 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive( int size; int tmp; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* * Count the number of "remote" value extents. 
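
Taken together, the aops.c changes above hang at most one ioend off bh_result->b_private and grow it as further mappings arrive, with the unwritten type sticky once set, exactly the a)/b) invariants stated in the xfs_map_direct() comment. A standalone model of just that merging rule; it drops the "no ioend for overwrites wholly within EOF" fast path and reduces the kernel structures to plain types:

#include <stdio.h>

enum io_type { IO_OVERWRITE, IO_UNWRITTEN };

struct ioend { long long offset, size; enum io_type type; };

static void map_one(struct ioend *io, long long off, long long sz,
                    enum io_type type)
{
    if (io->size == 0) {                    /* first mapping starts it */
        io->offset = off;
        io->size = sz;
        io->type = type;
        return;
    }
    if (off + sz > io->offset + io->size)   /* grow to span the mapping */
        io->size = off - io->offset + sz;
    if (type == IO_UNWRITTEN)               /* "permanently marked" */
        io->type = IO_UNWRITTEN;
}

int main(void)
{
    struct ioend io = { 0, 0, IO_OVERWRITE };

    map_one(&io, 0, 4096, IO_OVERWRITE);
    map_one(&io, 4096, 8192, IO_UNWRITTEN);
    map_one(&io, 12288, 4096, IO_OVERWRITE);
    printf("ioend: off=%lld size=%lld type=%s\n", io.offset, io.size,
           io.type == IO_UNWRITTEN ? "unwritten" : "overwrite");
    return 0;
}
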
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a43d370d2c58..65fb37a18e92 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; trace_xfs_attr_node_list(context); @@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (cursor->hashval > be32_to_cpu( entries[leafhdr.count - 1].hashval)) { @@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) xfs_trans_brelse(NULL, bp); return error; } - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; @@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int( struct xfs_attr_leaf_entry *entry; int retval; int i; + struct xfs_mount *mp = context->dp->i_mount; trace_xfs_attr_list_leaf(context); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); cursor = context->cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 22a5dcb70b32..a52bbd3abc7d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1376,22 +1376,19 @@ out: } /* - * xfs_collapse_file_space() - * This routine frees disk space and shift extent for the given file. - * The first thing we do is to free data blocks in the specified range - * by calling xfs_free_file_space(). It would also sync dirty data - * and invalidate page cache over the region on which collapse range - * is working. And Shift extent records to the left to cover a hole. - * RETURNS: - * 0 on success - * errno on error - * + * @next_fsb will keep track of the extent currently undergoing shift. + * @stop_fsb will keep track of the extent at which we have to stop. + * If we are shifting left, we will start with block (offset + len) and + * shift each extent till last extent. + * If we are shifting right, we will start with last extent inside file space + * and continue until we reach the block corresponding to offset. */ -int -xfs_collapse_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) +static int +xfs_shift_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + enum shift_direction direction) { int done = 0; struct xfs_mount *mp = ip->i_mount; @@ -1400,21 +1397,26 @@ xfs_collapse_file_space( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; int committed; - xfs_fileoff_t start_fsb; + xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - trace_xfs_collapse_file_space(ip); + if (direction == SHIFT_LEFT) { + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { + /* + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. 
+	 */
+	next_fsb = NULLFSBLOCK;
+	stop_fsb = XFS_B_TO_FSB(mp, offset);
+	}

-	next_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);

-	error = xfs_free_file_space(ip, offset, len);
-	if (error)
-		return error;
-
 	/*
 	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
 	 * into the accessible region of the file.
@@ -1427,20 +1429,28 @@
 	/*
 	 * Writeback and invalidate cache for the remainder of the file as we're
-	 * about to shift down every extent from the collapse range to EOF. The
-	 * free of the collapse range above might have already done some of
-	 * this, but we shouldn't rely on it to do anything outside of the range
-	 * that was freed.
+	 * about to shift down every extent from offset to EOF.
 	 */
 	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-					     offset + len, -1);
+					     offset, -1);
 	if (error)
 		return error;
 	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-					(offset + len) >> PAGE_CACHE_SHIFT, -1);
+					offset >> PAGE_CACHE_SHIFT, -1);
 	if (error)
 		return error;

+	/*
+	 * The extent shifting code works on extent granularity. So, if
+	 * stop_fsb is not the starting block of an extent, we need to split
+	 * the extent at stop_fsb.
+	 */
+	if (direction == SHIFT_RIGHT) {
+		error = xfs_bmap_split_extent(ip, stop_fsb);
+		if (error)
+			return error;
+	}
+
 	while (!error && !done) {
 		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
 		/*
@@ -1464,7 +1474,7 @@
 		if (error)
 			goto out;

-		xfs_trans_ijoin(tp, ip, 0);
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

 		xfs_bmap_init(&free_list, &first_block);

@@ -1472,10 +1482,9 @@
 		 * We are using the write transaction in which max 2 bmbt
 		 * updates are allowed
 		 */
-		start_fsb = next_fsb;
-		error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
-				&done, &next_fsb, &first_block, &free_list,
-				XFS_BMAP_MAX_SHIFT_EXTENTS);
+		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
+				&done, stop_fsb, &first_block, &free_list,
+				direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
 		if (error)
 			goto out;

@@ -1484,18 +1493,70 @@
 			goto out;

 		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}

 	return error;

 out:
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }

 /*
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shifts extents for the given file.
+ * The first thing we do is free the data blocks in the specified range
+ * by calling xfs_free_file_space(). It also syncs dirty data and
+ * invalidates the page cache over the region on which the collapse
+ * range is working, then shifts extent records to the left to cover
+ * the hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	int			error;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	trace_xfs_collapse_file_space(ip);
+
+	error = xfs_free_file_space(ip, offset, len);
+	if (error)
+		return error;
+
+	return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
+}
+
+/*
+ * xfs_insert_file_space()
+ * This routine creates hole space by shifting extents for the given file.
+ * The first thing we do is sync dirty data and invalidate the page cache
+ * over the region on which the insert range is working, then split an
+ * extent into two at the given offset by calling xfs_bmap_split_extent,
+ * and shift all extent records lying between [offset, last allocated
+ * extent] to the right to make room for the hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ */
+int
+xfs_insert_file_space(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			len)
+{
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	trace_xfs_insert_file_space(ip);
+
+	return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
+}
+
+/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
@@ -1599,13 +1660,6 @@ xfs_swap_extent_flush(
 	/* Verify O_DIRECT for ftmp */
 	if (VFS_I(ip)->i_mapping->nrpages)
 		return -EINVAL;
-
-	/*
-	 * Don't try to swap extents on mmap()d files because we can't lock
-	 * out races against page faults safely.
-	 */
-	if (mapping_mapped(VFS_I(ip)->i_mapping))
-		return -EBUSY;
 	return 0;
 }

@@ -1633,13 +1687,14 @@ xfs_swap_extents(
 	}

 	/*
-	 * Lock up the inodes against other IO and truncate to begin with.
-	 * Then we can ensure the inodes are flushed and have no page cache
-	 * safely. Once we have done this we can take the ilocks and do the rest
-	 * of the checks.
+	 * Lock the inodes against other IO, page faults and truncate to
+	 * begin with. Then we can ensure the inodes are flushed and have no
+	 * page cache safely. Once we have done this we can take the ilocks and
+	 * do the rest of the checks.
 	 */
-	lock_flags = XFS_IOLOCK_EXCL;
+	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);

 	/* Verify that both files have the same format */
 	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1666,8 +1721,16 @@ xfs_swap_extents(
 		xfs_trans_cancel(tp, 0);
 		goto out_unlock;
 	}
+
+	/*
+	 * Lock and join the inodes to the transaction so that transaction
+	 * commit or cancel will unlock the inodes from this point onwards.
+	 */
 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 	lock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, tip, lock_flags);
+

 	/* Verify all data are being swapped */
 	if (sxp->sx_offset != 0 ||
@@ -1720,9 +1783,6 @@ xfs_swap_extents(
 		goto out_trans_cancel;
 	}

-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ijoin(tp, tip, lock_flags);
-
 	/*
 	 * Before we've swapped the forks, lets set the owners of the forks
 	 * appropriately. We have to do this as we are demand paging the btree
@@ -1856,5 +1916,5 @@ out_unlock:

 out_trans_cancel:
 	xfs_trans_cancel(tp, 0);
-	goto out_unlock;
+	goto out;
 }
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 736429a72a12..af97d9a1dfb4 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
 			xfs_off_t len);
 int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
 			    xfs_off_t len);
+int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
+			  xfs_off_t len);

 /* EOF block manipulation functions */
 bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 507d96a57ac7..092d652bc03d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -537,9 +537,9 @@ xfs_buf_item_push(

 	/* has a previous flush failed due to IO errors?
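
The direction handling in xfs_shift_file_space() boils down to choosing the first extent to move and the block at which to stop: a left shift (collapse) walks from offset+len up to EOF, while a right shift (insert) walks from the last extent back down to offset, with next_fsb = NULLFSBLOCK deferring the "find the last extent" step to xfs_bmap_shift_extents(), which runs under the ilock. A toy model of that boundary selection, with the byte-to-block conversion reduced to a shift and all names local to the sketch:

#include <stdio.h>

enum shift_direction { SHIFT_LEFT, SHIFT_RIGHT };

#define FSB_SHIFT   12          /* assume 4k blocks for the sketch */
#define B_TO_FSB(b) ((long long)(b) >> FSB_SHIFT)
#define NULLFSBLOCK (-1LL)

static void pick_range(enum shift_direction dir, long long offset,
                       long long len, long long isize,
                       long long *next_fsb, long long *stop_fsb)
{
    if (dir == SHIFT_LEFT) {
        *next_fsb = B_TO_FSB(offset + len); /* first extent to move */
        *stop_fsb = B_TO_FSB(isize);        /* walk up to EOF */
    } else {
        *next_fsb = NULLFSBLOCK;        /* callee finds the last extent */
        *stop_fsb = B_TO_FSB(offset);   /* walk back down to offset */
    }
}

int main(void)
{
    long long next, stop;

    pick_range(SHIFT_LEFT, 1 << 20, 1 << 16, 1 << 24, &next, &stop);
    printf("collapse: next=%lld stop=%lld\n", next, stop);
    pick_range(SHIFT_RIGHT, 1 << 20, 1 << 16, 1 << 24, &next, &stop);
    printf("insert:   next=%lld stop=%lld\n", next, stop);
    return 0;
}
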
*/ if ((bp->b_flags & XBF_WRITE_FAIL) && - ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { xfs_warn(bp->b_target->bt_mount, -"Detected failing async write on buffer block 0x%llx. Retrying async write.", +"Failing async write on buffer block 0x%llx. Retrying async write.", (long long)bp->b_bn); } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 799e5a2d334d..e85a9519a5ae 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -84,7 +84,7 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 3ee186ac1093..338e50bbfd1e 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -131,7 +131,7 @@ xfs_error_report( { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, - "Internal error %s at line %d of file %s. Caller %pF", + "Internal error %s at line %d of file %s. Caller %pS", tag, linenum, filename, ra); xfs_stack_trace(); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 279a76e52791..c0394ed126fc 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp); /* * Macros to set EFSCORRUPTED & return/branch. */ -#define XFS_WANT_CORRUPTED_GOTO(x,l) \ +#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ error = -EFSCORRUPTED; \ goto l; \ } \ } -#define XFS_WANT_CORRUPTED_RETURN(x) \ +#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ return -EFSCORRUPTED; \ } \ } diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index b97359ba2648..652cd3c5b58c 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -215,7 +215,7 @@ xfs_fs_get_parent( int error; struct xfs_inode *cip; - error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); + error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL); if (unlikely(error)) return ERR_PTR(error); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1f12ad0a8585..8121e75352ee 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -559,7 +559,7 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, true); if (error) return error; @@ -569,21 +569,42 @@ restart: * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. + * + * We need to serialise against EOF updates that occur in IO + * completions here. We want to make sure that nobody is changing the + * size while we do this check until we have placed an IO barrier (i.e. + * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. + * The spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value + * and hence be able to correctly determine if we need to run zeroing. 
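
The write-checks change above is an optimistic check-then-upgrade loop: sample the size under i_flags_lock while holding the shared iolock, and if EOF zeroing might be needed, upgrade to the exclusive iolock, drain in-flight AIO, and restart the checks because the size may have moved meanwhile. A toy single-threaded rendering of that pattern, with pthread primitives standing in for the kernel locks:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t flags_lock = PTHREAD_MUTEX_INITIALIZER;
static long isize = 4096;

static void write_checks(long pos, int excl)
{
restart:
    pthread_mutex_lock(&flags_lock);    /* sample size under spinlock */
    if (pos > isize) {                  /* may need EOF zeroing */
        pthread_mutex_unlock(&flags_lock);
        if (!excl) {
            pthread_rwlock_unlock(&iolock); /* upgrade shared -> excl */
            pthread_rwlock_wrlock(&iolock);
            excl = 1;
            /* kernel: inode_dio_wait() drains AIO completions here */
            goto restart;               /* redo all checks under excl */
        }
        printf("zero from %ld to %ld\n", isize, pos);
        return;
    }
    pthread_mutex_unlock(&flags_lock);
}

int main(void)
{
    pthread_rwlock_rdlock(&iolock);
    write_checks(8192, 0);
    pthread_rwlock_unlock(&iolock);
    return 0;
}
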
*/ + spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { bool zero = false; + spin_unlock(&ip->i_flags_lock); if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); iov_iter_reexpand(from, count); + + /* + * We now have an IO submission barrier in place, but + * AIO can do EOF updates during IO completion and hence + * we now need to wait for all of them to drain. Non-AIO + * DIO will have drained before we are given the + * XFS_IOLOCK_EXCL, and so for most cases this wait is a + * no-op. + */ + inode_dio_wait(inode); goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); if (error) return error; - } + } else + spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from @@ -645,6 +666,8 @@ xfs_file_dio_aio_write( int iolock; size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; + loff_t end; + struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -685,10 +708,11 @@ xfs_file_dio_aio_write( goto out; count = iov_iter_count(from); pos = iocb->ki_pos; + end = pos + count - 1; if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, pos + count - 1); + pos, end); if (ret) goto out; /* @@ -698,7 +722,7 @@ xfs_file_dio_aio_write( */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + count - 1) >> PAGE_CACHE_SHIFT); + end >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } @@ -715,8 +739,22 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, from, pos); + data = *from; + ret = mapping->a_ops->direct_IO(iocb, &data, pos); + + /* see generic_file_direct_write() for why this is necessary */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); + } + + if (ret > 0) { + pos += ret; + iov_iter_advance(from, ret); + iocb->ki_pos = pos; + } out: xfs_rw_iunlock(ip, iolock); @@ -822,6 +860,11 @@ xfs_file_write_iter( return ret; } +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + STATIC long xfs_file_fallocate( struct file *file, @@ -835,18 +878,21 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; + bool do_file_insert = 0; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~XFS_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -873,6 +919,27 @@ xfs_file_fallocate( error = xfs_collapse_file_space(ip, offset, len); if (error) goto out_unlock; + } else if (mode & FALLOC_FL_INSERT_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + new_size = i_size_read(inode) + len; + if (offset & blksize_mask || len & blksize_mask) { + error = -EINVAL; + goto out_unlock; + } + + /* check the new inode size does not wrap through zero */ + if (new_size > inode->i_sb->s_maxbytes) { + error = -EFBIG; 
+ goto out_unlock; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + do_file_insert = 1; } else { flags |= XFS_PREALLOC_SET; @@ -907,8 +974,19 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; error = xfs_setattr_size(ip, &iattr); + if (error) + goto out_unlock; } + /* + * Perform hole insertion now that the file size has been + * updated so that if we crash during the operation we don't + * leave shifted extents past EOF and hence losing access to + * the data that is contained within them. + */ + if (do_file_insert) + error = xfs_insert_file_space(ip, offset, len); + out_unlock: xfs_iunlock(ip, iolock); return error; @@ -997,20 +1075,6 @@ xfs_file_mmap( } /* - * mmap()d file has taken write protection fault and is being made - * writable. We can set the page state up correctly for a writable - * page, which means we can do correct delalloc accounting (ENOSPC - * checking!) and unwritten extent mapping. - */ -STATIC int -xfs_vm_page_mkwrite( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return block_page_mkwrite(vma, vmf, xfs_get_blocks); -} - -/* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). */ @@ -1385,6 +1449,55 @@ xfs_file_llseek( } } +/* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_sem (MM) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_fault(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = filemap_fault(vma, vmf); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. 
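
The new FALLOC_FL_INSERT_RANGE branch above performs three cheap argument checks before any extent work: block alignment of offset and length, no wrap of the new size past the filesystem maximum, and offset strictly inside the current file size. A standalone restatement of those checks with illustrative values:

#include <errno.h>
#include <stdio.h>

static int check_insert_range(long long offset, long long len,
                              long long isize, long long maxbytes,
                              unsigned blkbits)
{
    unsigned long long mask = (1ULL << blkbits) - 1;
    long long new_size = isize + len;

    if ((offset & mask) || (len & mask))
        return -EINVAL;     /* must be block aligned */
    if (new_size > maxbytes)
        return -EFBIG;      /* would exceed the fs maximum size */
    if (offset >= isize)
        return -EINVAL;     /* can only insert inside the file */
    return 0;
}

int main(void)
{
    printf("%d\n", check_insert_range(4096, 8192, 1 << 20,
                                      1LL << 40, 12));  /* 0 */
    printf("%d\n", check_insert_range(1 << 20, 4096, 1 << 20,
                                      1LL << 40, 12));  /* -EINVAL */
    return 0;
}
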
+ */ +STATIC int +xfs_filemap_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_page_mkwrite(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, @@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = { }; static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = filemap_fault, + .fault = xfs_filemap_fault, .map_pages = filemap_map_pages, - .page_mkwrite = xfs_vm_page_mkwrite, + .page_mkwrite = xfs_filemap_page_mkwrite, }; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a2e86e8a0fea..da82f1cb4b9b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -294,7 +294,7 @@ xfs_filestream_get_parent( if (!parent) goto out_dput; - dir = igrab(parent->d_inode); + dir = igrab(d_inode(parent)); dput(parent); out_dput: @@ -322,7 +322,7 @@ xfs_filestream_lookup_ag( pip = xfs_filestream_get_parent(ip); if (!pip) - goto out; + return NULLAGNUMBER; mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); if (mru) { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 74efe5b760dc..cb7e8a29dfb6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -637,12 +637,13 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + cnt->allocino = percpu_counter_read_positive(&mp->m_icount); + cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); + cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); + spin_lock(&mp->m_sb_lock); - cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; - cnt->freeino = mp->m_sb.sb_ifree; - cnt->allocino = mp->m_sb.sb_icount; spin_unlock(&mp->m_sb_lock); return 0; } @@ -692,14 +693,9 @@ xfs_reserve_blocks( * what to do. This means that the amount of free space can * change while we do this, so we need to retry if we end up * trying to reserve more space than is available. - * - * We also use the xfs_mod_incore_sb() interface so that we - * don't have to care about whether per cpu counter are - * enabled, disabled or even compiled in.... */ retry: spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, @@ -716,7 +712,8 @@ retry: } else { __int64_t free; - free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + free = percpu_counter_sum(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); if (!free) goto out; /* ENOSPC and fdblks_delta = 0 */ @@ -755,8 +752,7 @@ out: * the extra reserve blocks from the reserve..... */ int error; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - fdblks_delta, 0); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); if (error == -ENOSPC) goto retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9771b7ef62ed..76a9f2783282 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -439,11 +439,11 @@ again: *ipp = ip; /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) + * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. 
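
The counter changes visible in the xfs_fs_counts() hunk above (and in the ialloc and superblock hunks earlier) trade the old superblock fields for generic percpu counters: reads are cheap but approximate, and only percpu_counter_sum() folds in the per-CPU deltas, which is why xfs_log_sb() sums before writing the on-disk superblock. A toy model of that read/sum split; the real implementation also folds deltas into the shared count past a batch threshold, and the _read_positive() variant used above additionally clamps at zero:

#include <stdio.h>

#define NR_CPUS 4
struct percpu_counter { long count; long pcpu[NR_CPUS]; };

static void pc_add(struct percpu_counter *c, int cpu, long delta)
{
    c->pcpu[cpu] += delta;      /* fast path: no shared cacheline */
}

static long pc_read(const struct percpu_counter *c)
{
    return c->count;            /* approximate: per-CPU deltas not seen */
}

static long pc_sum(const struct percpu_counter *c)
{
    long sum = c->count;
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        sum += c->pcpu[cpu];    /* precise: fold all deltas */
    return sum;
}

int main(void)
{
    struct percpu_counter icount = { 100, {0} };

    pc_add(&icount, 0, 5);
    pc_add(&icount, 2, -1);
    printf("read=%ld sum=%ld\n", pc_read(&icount), pc_sum(&icount));
    return 0;
}
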
 	 */
 	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
-		xfs_setup_inode(ip);
+		xfs_setup_existing_inode(ip);
 	return 0;

 out_error_or_again:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6163767aa856..d6ebc85192b7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
 }

 /*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock. This routine
- * allows either or both of the locks to be obtained.
+ * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock
+ * and the i_lock. This routine allows various combinations of the locks to be
+ * obtained.
 *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
 *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked.  It can be:
- *		XFS_IOLOCK_SHARED,
- *		XFS_IOLOCK_EXCL,
- *		XFS_ILOCK_SHARED,
- *		XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ * Basic locking order:
+ *
+ * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ *
+ * mmap_sem locking order:
+ *
+ * i_iolock -> page lock -> mmap_sem
+ * mmap_sem -> i_mmap_lock -> page_lock
+ *
+ * The difference in mmap_sem locking order means that we cannot hold the
+ * i_mmap_lock over syscall-based read(2)/write(2) IO. These IO paths can
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * in get_user_pages() to map the user pages into the kernel address space for
+ * direct IO. Similarly the i_iolock cannot be taken inside a page fault
+ * because page faults already hold the mmap_sem.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_iolock and the i_mmap_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a
+ * race-free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
*/ void xfs_ilock( @@ -150,6 +160,8 @@ xfs_ilock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -159,6 +171,11 @@ xfs_ilock( else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) @@ -191,6 +208,8 @@ xfs_ilock_nowait( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -202,21 +221,35 @@ xfs_ilock_nowait( if (!mrtryaccess(&ip->i_iolock)) goto out; } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + if (lock_flags & XFS_ILOCK_EXCL) { if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } return 1; - out_undo_iolock: +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) mrunlock_excl(&ip->i_iolock); else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); - out: +out: return 0; } @@ -244,6 +277,8 @@ xfs_iunlock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -254,6 +289,11 @@ xfs_iunlock( else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) @@ -271,11 +311,14 @@ xfs_ilock_demote( xfs_inode_t *ip, uint lock_flags) { - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); if (lock_flags & 
XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); @@ -294,6 +337,12 @@ xfs_isilocked( return rwsem_is_locked(&ip->i_lock.mr_lock); } + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) return !!ip->i_iolock.mr_writer; @@ -314,14 +363,27 @@ int xfs_lock_delays; #endif /* - * Bump the subclass so xfs_lock_inodes() acquires each lock with - * a different value + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This shouldn't be called for page fault locking, but we also need to + * ensure we don't overrun the number of lockdep subclasses for the iolock or + * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. */ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + } + + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << + XFS_MMAPLOCK_SHIFT; + } + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; @@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass) } /* - * The following routine will lock n inodes in exclusive mode. - * We assume the caller calls us with the inodes in i_ino order. + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. * - * We need to detect deadlock where an inode that we lock - * is in the AIL and we start waiting for another inode that is locked - * by a thread in a long running transaction (such as truncate). This can - * result in deadlock since the long running trans might need to wait - * for the inode we just locked in order to push the tail and free space - * in the log. + * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. */ void xfs_lock_inodes( @@ -348,30 +409,27 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - ASSERT(ips && (inodes >= 2)); /* we need at least two */ + /* currently supports between 2 and 5 inodes */ + ASSERT(ips && inodes >= 2 && inodes <= 5); try_lock = 0; i = 0; - again: for (; i < inodes; i++) { ASSERT(ips[i]); - if (i && (ips[i] == ips[i-1])) /* Already locked */ + if (i && (ips[i] == ips[i - 1])) /* Already locked */ continue; /* - * If try_lock is not set yet, make sure all locked inodes - * are not in the AIL. - * If any are, set try_lock to be used later. + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. 
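The subclass packing in xfs_lock_inumorder() above is easy to check outside the kernel: each lock class owns a private bit field within lock_flags, and the new 4-bit mmaplock field sits between the iolock and ilock fields. A standalone, hedged demo follows; the shift and mask constants are copied from this patch, while XFS_LOCK_INUMORDER is assumed to be 4 (as in xfs_inode.h of this era), which is what limits the iolock/mmaplock subclasses to twelve: subclass values 0-11 plus the offset of 4 must fit in 4 bits.

```c
#include <stdio.h>

#define XFS_IOLOCK_SHIFT	16
#define XFS_MMAPLOCK_SHIFT	20
#define XFS_ILOCK_SHIFT		24
#define XFS_LOCK_INUMORDER	4	/* assumed value */

#define XFS_IOLOCK_DEP_MASK	0x000f0000
#define XFS_MMAPLOCK_DEP_MASK	0x00f00000
#define XFS_ILOCK_DEP_MASK	0xff000000

int
main(void)
{
	unsigned int	lock_mode = 0;
	int		subclass = 1;	/* second inode of a sorted set */

	/* pack the lockdep subclass into each class's private field */
	lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
	lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_MMAPLOCK_SHIFT;
	lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;

	/* unpack exactly as the XFS_*_DEP() macros do */
	printf("iolock dep   = %u\n",
	       (lock_mode & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT);
	printf("mmaplock dep = %u\n",
	       (lock_mode & XFS_MMAPLOCK_DEP_MASK) >> XFS_MMAPLOCK_SHIFT);
	printf("ilock dep    = %u\n",
	       (lock_mode & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT);
	return 0;
}
```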
*/ - if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) try_lock++; - } } } @@ -381,51 +439,42 @@ again: * we can't get any, we must release all we have * and try again. */ + if (!try_lock) { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; + } + + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; - if (try_lock) { - /* try_lock must be 0 if i is 0. */ + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { /* - * try_lock means we have an inode locked - * that is in the AIL. + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. */ - ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { - attempts++; - - /* - * Unlock all previous guys and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - for(j = i - 1; j >= 0; j--) { - - /* - * Check to see if we've already - * unlocked this one. - * Not the first one going back, - * and the inode ptr is the same. - */ - if ((j != (i - 1)) && ips[j] == - ips[j+1]) - continue; - - xfs_iunlock(ips[j], lock_mode); - } + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ #ifdef DEBUG - xfs_lock_delays++; + xfs_lock_delays++; #endif - } - i = 0; - try_lock = 0; - goto again; - } - } else { - xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } + i = 0; + try_lock = 0; + goto again; } #ifdef DEBUG @@ -440,10 +489,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock - * at a time - the iolock or the ilock, but not both at once. If - * we lock both at once, lockdep will report false positives saying - * we have violated locking orders. + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. 
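The restructured loop above is an instance of a generic deadlock-avoidance pattern: take locks in a global order, switch to trylock once a hazard (here, an inode in the AIL) is detected, and on any trylock failure drop everything, back off, and restart. A hedged userspace analogue using POSIX mutexes, simplified in that it always trylocks rather than only after an AIL hit:

```c
#include <pthread.h>
#include <unistd.h>

/* Lock an array of mutexes already sorted into a global order; the array
 * may contain adjacent duplicates, which are locked only once. */
static void
lock_all(pthread_mutex_t **locks, int count)
{
	int i, j;

again:
	for (i = 0; i < count; i++) {
		if (i && locks[i] == locks[i - 1])	/* already held */
			continue;
		if (pthread_mutex_trylock(locks[i]) == 0)
			continue;

		/* Contention: unlock everything we hold and retry. */
		for (j = i - 1; j >= 0; j--) {
			if (j != i - 1 && locks[j] == locks[j + 1])
				continue;
			pthread_mutex_unlock(locks[j]);
		}
		usleep(1000);	/* don't just spin the CPU */
		goto again;
	}
}
```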
*/ void xfs_lock_two_inodes( @@ -455,8 +504,12 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -818,7 +871,7 @@ xfs_ialloc( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); - /* now that we have an i_mode we can setup inode ops and unlock */ + /* now that we have an i_mode we can setup the inode structure */ xfs_setup_inode(ip); *ipp = ip; @@ -1235,12 +1288,14 @@ xfs_create( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1345,12 +1400,14 @@ xfs_create_tmpfile( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -2611,19 +2668,22 @@ xfs_remove( /* * Enter all inodes for a rename transaction into a sorted array. */ +#define __XFS_SORT_INODES 5 STATIC void xfs_sort_for_rename( - xfs_inode_t *dp1, /* in: old (source) directory inode */ - xfs_inode_t *dp2, /* in: new (target) directory inode */ - xfs_inode_t *ip1, /* in: inode of old entry */ - xfs_inode_t *ip2, /* in: inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ - int *num_inodes) /* out: number of inodes in array */ + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ { - xfs_inode_t *temp; int i, j; + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); + /* * i_tab contains a list of pointers to inodes. We initialize * the table here & we'll sort it. We will then use it to @@ -2631,25 +2691,24 @@ xfs_sort_for_rename( * * Note that the table may contain duplicates. e.g., dp1 == dp2. 
*/ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (ip2) { - *num_inodes = 4; - i_tab[3] = ip2; - } else { - *num_inodes = 3; - i_tab[3] = NULL; - } + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; /* * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) + * most 5 elements to sort, so this is adequate.) */ for (i = 0; i < *num_inodes; i++) { for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; + struct xfs_inode *temp = i_tab[j]; i_tab[j] = i_tab[j-1]; i_tab[j-1] = temp; } @@ -2657,6 +2716,31 @@ xfs_sort_for_rename( } } +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +} + /* * xfs_cross_rename() * @@ -2685,14 +2769,14 @@ xfs_cross_rename( ip2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* * If we're renaming one or more directories across different parents, @@ -2707,16 +2791,16 @@ xfs_cross_rename( dp1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip2 ".." reference to dp1 */ if (!S_ISDIR(ip1->i_d.di_mode)) { error = xfs_droplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; } /* @@ -2734,16 +2818,16 @@ xfs_cross_rename( dp2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip1 ".." reference to dp2 */ if (!S_ISDIR(ip2->i_d.di_mode)) { error = xfs_droplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; } /* @@ -2771,66 +2855,108 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); -out: + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } /* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction, + * recovery will free the inode and we won't leak it.
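The crash-safety argument in this comment is the same one O_TMPFILE gives userspace: the inode stays on the unlinked list, so a crash before the final link leaks nothing and recovery simply frees it. A hedged illustration (paths are placeholders; requires Linux 3.11+ and a filesystem that supports O_TMPFILE):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char	procpath[64];
	int	fd;

	/* anonymous inode: already on the filesystem's unlinked list */
	fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}

	/* a crash here cannot leak the inode - recovery frees it */

	/* make it visible; this is the userspace analogue of the
	 * bumplink + xfs_iunlink_remove() step in the rename path */
	snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, procpath, AT_FDCWD, "/tmp/published",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");

	close(fd);
	return 0;
}
```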
+ */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* Satisfy xfs_bumplink that this is a real tmpfile */ + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + +/* * xfs_rename */ int xfs_rename( - xfs_inode_t *src_dp, - struct xfs_name *src_name, - xfs_inode_t *src_ip, - xfs_inode_t *target_dp, - struct xfs_name *target_name, - xfs_inode_t *target_ip, - unsigned int flags) + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) { - xfs_trans_t *tp = NULL; - xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - xfs_inode_t *inodes[4]; - int spaceres; - int num_inodes; + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + bool new_parent = (src_dp != target_dp); + bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int cancel_flags = 0; + int spaceres; + int error; trace_xfs_rename(src_dp, target_dp, src_name, target_name); - new_parent = (src_dp != target_dp); - src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* setup target dirent info as whiteout */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - xfs_bmap_init(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); if (error == -ENOSPC) { spaceres = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); } - if (error) { - xfs_trans_cancel(tp, 0); - goto std_return; - } + if (error) + goto out_trans_cancel; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes */ error = xfs_qm_vop_rename_dqattach(inodes); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } + if (error) + goto out_trans_cancel; /* * Lock all the participating inodes. 
Depending upon whether @@ -2851,6 +2977,8 @@ xfs_rename( xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -2860,24 +2988,16 @@ xfs_rename( if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { error = -EXDEV; - goto error_return; + goto out_trans_cancel; } - /* - * Handle RENAME_EXCHANGE flags - */ - if (flags & RENAME_EXCHANGE) { - if (target_ip == NULL) { - error = -EINVAL; - goto error_return; - } - error = xfs_cross_rename(tp, src_dp, src_name, src_ip, - target_dp, target_name, target_ip, - &free_list, &first_block, spaceres); - if (error) - goto abort_return; - goto finish_rename; - } + xfs_bmap_init(&free_list, &first_block); + + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); /* * Set up the target. @@ -2890,7 +3010,7 @@ xfs_rename( if (!spaceres) { error = xfs_dir_canenter(tp, target_dp, target_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * If target does not exist and the rename crosses @@ -2901,9 +3021,9 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error == -ENOSPC) - goto error_return; + goto out_bmap_cancel; if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2911,7 +3031,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto abort_return; + goto out_trans_abort; } } else { /* target_ip != NULL */ /* @@ -2926,7 +3046,7 @@ xfs_rename( if (!(xfs_dir_isempty(target_ip)) || (target_ip->i_d.di_nlink > 2)) { error = -EEXIST; - goto error_return; + goto out_trans_cancel; } } @@ -2943,7 +3063,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2954,7 +3074,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; if (src_is_directory) { /* @@ -2962,7 +3082,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; } } /* target_ip != NULL */ @@ -2979,7 +3099,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto abort_return; + goto out_trans_abort; } /* @@ -3005,49 +3125,67 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto abort_return; + goto out_trans_abort; } - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. 
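From userspace, this whole path is driven by renameat2(2) with RENAME_WHITEOUT: the source name is atomically replaced by a 0:0 character-device whiteout while the victim appears at the destination. A hedged sketch using the raw syscall (glibc wrappers for renameat2 came later; the flag is chiefly consumed by overlayfs and typically requires CAP_MKNOD):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_WHITEOUT
#define RENAME_WHITEOUT	(1 << 2)	/* from <linux/fs.h> */
#endif

int
main(void)
{
	/* "old" moves to "new"; a whiteout char device is left at "old" */
	if (syscall(SYS_renameat2, AT_FDCWD, "old", AT_FDCWD, "new",
		    RENAME_WHITEOUT) < 0) {
		perror("renameat2(RENAME_WHITEOUT)");
		return 1;
	}
	return 0;
}
```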
+ */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - if (new_parent) - xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + goto out_trans_abort; -finish_rename: /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } + if (wip) { + ASSERT(wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_trans_abort; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_abort; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(wip)->i_state &= ~I_LINKABLE; } - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - abort_return: + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; + +out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_bmap_cancel: xfs_bmap_cancel(&free_list); +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - std_return: + if (wip) + IRELE(wip); return error; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a1cd55f3f351..8f22d20368d8 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,6 +56,7 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ + mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. 
*/ @@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHARED (1<<1) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) +#define XFS_MMAPLOCK_EXCL (1<<4) +#define XFS_MMAPLOCK_SHARED (1<<5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ - | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ + | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) #define XFS_LOCK_FLAGS \ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ - { XFS_ILOCK_SHARED, "ILOCK_SHARED" } + { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ + { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ + { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } /* @@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_SHIFT 20 + #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) -#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 #define XFS_ILOCK_DEP_MASK 0xff000000 -#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ + XFS_MMAPLOCK_DEP_MASK | \ + XFS_ILOCK_DEP_MASK) -#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) -#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ + >> XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ + >> XFS_MMAPLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ + >> XFS_ILOCK_SHIFT) /* * For multiple groups support: if S_ISGID bit is set in the parent @@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +/* from xfs_iops.c */ +/* + * When setting up a newly allocated inode, we need to call + * xfs_finish_inode_setup() once the inode is fully instantiated at + * the VFS level to prevent the rest of the world seeing the inode + * before we've completed instantiation. Otherwise we can do it + * the moment the inode lookup is complete. + */ +extern void xfs_setup_inode(struct xfs_inode *ip); +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); +} + +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) +{ + xfs_setup_inode(ip); + xfs_finish_inode_setup(ip); +} + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ac4feae45eb3..87f67c6b654c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -82,7 +82,7 @@ xfs_find_handle( error = user_lpath((const char __user *)hreq->path, &path); if (error) return error; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); } ip = XFS_I(inode); @@ -210,7 +210,7 @@ xfs_open_by_handle( dentry = xfs_handlereq_to_dentry(parfilp, hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - inode = dentry->d_inode; + inode = d_inode(dentry); /* Restrict xfs_open_by_handle to directories & regular files. 
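The two inline helpers above split instantiation into a "set up" phase and a "publish" phase. A hedged sketch of the create-side calling convention they enable (hypothetical function; it mirrors what xfs_generic_create() does later in this patch):

```c
/* Hypothetical consolidation of the create-side calling convention. */
STATIC int
xfs_example_create_publish(
	struct xfs_inode	*ip,
	struct dentry		*dentry)
{
	/* phase 1: inside the create transaction, once i_mode is known */
	xfs_setup_inode(ip);

	/* VFS work that must not race with lookups seeing a half-built
	 * inode - the inode is still I_NEW here */
	d_instantiate(dentry, VFS_I(ip));

	/* phase 2: clear XFS_INEW and wake I_NEW waiters */
	xfs_finish_inode_setup(ip);
	return 0;
}
```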
*/ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { @@ -303,7 +303,7 @@ xfs_readlink_by_handle( goto out_dput; } - error = xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(d_inode(dentry)), link); if (error) goto out_kfree; error = readlink_copy(hreq->ohandle, olen, link); @@ -376,7 +376,7 @@ xfs_fssetdm_by_handle( return PTR_ERR(dentry); } - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { error = -EPERM; goto out; } @@ -386,7 +386,7 @@ xfs_fssetdm_by_handle( goto out; } - error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: @@ -429,7 +429,7 @@ xfs_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -559,7 +559,7 @@ xfs_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: ops[i].am_error = xfs_attrmulti_attr_get( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_attrvalue, &ops[i].am_length, ops[i].am_flags); break; @@ -568,7 +568,7 @@ xfs_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); mnt_drop_write_file(parfilp); @@ -578,7 +578,7 @@ xfs_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_flags); mnt_drop_write_file(parfilp); break; @@ -631,7 +631,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (ioflags & XFS_IO_INVIS) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -639,10 +639,13 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index bfc7c7c8a0c8..b88bdc85dd3d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -375,7 +375,7 @@ xfs_compat_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -445,7 +445,7 @@ xfs_compat_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: ops[i].am_error = xfs_attrmulti_attr_get( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, compat_ptr(ops[i].am_attrvalue), &ops[i].am_length, ops[i].am_flags); break; @@ -454,7 +454,7 @@ xfs_compat_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); mnt_drop_write_file(parfilp); @@ -464,7 +464,7 @@ xfs_compat_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, 
ops[i].am_flags); mnt_drop_write_file(parfilp); break; @@ -504,7 +504,7 @@ xfs_compat_fssetdm_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { error = -EPERM; goto out; } @@ -514,7 +514,7 @@ xfs_compat_fssetdm_by_handle( goto out; } - error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ccb1dd0d509e..38e633bad8c2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -460,8 +460,7 @@ xfs_iomap_prealloc_size( alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; + freesp = percpu_counter_read_positive(&mp->m_fdblocks); if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { shift = 2; if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e53a90331422..f4cd7204e236 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -187,6 +187,8 @@ xfs_generic_create( else d_instantiate(dentry, inode); + xfs_finish_inode_setup(ip); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -195,6 +197,7 @@ xfs_generic_create( return error; out_cleanup_inode: + xfs_finish_inode_setup(ip); if (!tmpfile) xfs_cleanup_inode(dir, inode, dentry); iput(inode); @@ -301,7 +304,7 @@ xfs_vn_link( struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct xfs_name name; int error; @@ -326,7 +329,7 @@ xfs_vn_unlink( xfs_dentry_to_name(&name, dentry, 0); - error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); if (error) return error; @@ -367,9 +370,11 @@ xfs_vn_symlink( goto out_cleanup_inode; d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); return 0; out_cleanup_inode: + xfs_finish_inode_setup(cip); xfs_cleanup_inode(dir, inode, dentry); iput(inode); out: @@ -384,22 +389,22 @@ xfs_vn_rename( struct dentry *ndentry, unsigned int flags) { - struct inode *new_inode = ndentry->d_inode; + struct inode *new_inode = d_inode(ndentry); int omode = 0; struct xfs_name oname; struct xfs_name nname; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; /* if we are exchanging files, we need to set i_mode of both files */ if (flags & RENAME_EXCHANGE) - omode = ndentry->d_inode->i_mode; + omode = d_inode(ndentry)->i_mode; xfs_dentry_to_name(&oname, odentry, omode); - xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); + xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); - return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? 
XFS_I(new_inode) : NULL, flags); } @@ -421,7 +426,7 @@ xfs_vn_follow_link( if (!link) goto out_err; - error = xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(d_inode(dentry)), link); if (unlikely(error)) goto out_kfree; @@ -441,7 +446,7 @@ xfs_vn_getattr( struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -766,6 +771,7 @@ xfs_setattr_size( return error; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); @@ -829,55 +835,27 @@ xfs_setattr_size( inode_dio_wait(inode); /* - * Do all the page cache truncate work outside the transaction context - * as the "lock" order is page lock->log space reservation. i.e. - * locking pages inside the transaction can ABBA deadlock with - * writeback. We have to do the VFS inode size update before we truncate - * the pagecache, however, to avoid racing with page faults beyond the - * new EOF they are not serialised against truncate operations except by - * page locks and size updates. + * We've already locked out new page faults, so now we can safely remove + * pages from the page cache knowing they won't get refaulted until we + * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are + * complete. The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block size + * filesystems are correctly handled, too. * - * Hence we are in a situation where a truncate can fail with ENOMEM - * from xfs_trans_reserve(), but having already truncated the in-memory - * version of the file (i.e. made user visible changes). There's not - * much we can do about this, except to hope that the caller sees ENOMEM - * and retries the truncate operation. + * We have to do all the page cache truncate work outside the + * transaction context as the "lock" order is page lock->log space + * reservation as defined by extent allocation in the writeback path. + * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * having already truncated the in-memory version of the file (i.e. made + * user visible changes). There's not much we can do about this, except + * to hope that the caller sees ENOMEM and retries the truncate + * operation. */ error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); - /* - * The "we can't serialise against page faults" pain gets worse. - * - * If the file is mapped then we have to clean the page at the old EOF - * when extending the file. Extending the file can expose changes the - * underlying page mapping (e.g. from beyond EOF to a hole or - * unwritten), and so on the next attempt to write to that page we need - * to remap it for write. i.e. we need .page_mkwrite() to be called. - * Hence we need to clean the page to clean the pte and so a new write - * fault will be triggered appropriately. - * - * If we do it before we change the inode size, then we can race with a - * page fault that maps the page with exactly the same problem. If we do - * it after we change the file size, then a new page fault can come in - * and allocate space before we've run the rest of the truncate - * transaction.
That's kinda grotesque, but it's better than have data - * over a hole, and so that's the lesser evil that has been chosen here. - * - * The real solution, however, is to have some mechanism for locking out - * page faults while a truncate is in progress. - */ - if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) { - error = filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - round_down(oldsize, PAGE_CACHE_SIZE), - round_up(oldsize, PAGE_CACHE_SIZE) - 1); - if (error) - return error; - } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) @@ -968,16 +946,20 @@ xfs_vn_setattr( struct dentry *dentry, struct iattr *iattr) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error; if (iattr->ia_valid & ATTR_SIZE) { uint iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); - error = xfs_break_layouts(dentry->d_inode, &iolock); - if (!error) + error = xfs_break_layouts(d_inode(dentry), &iolock, true); + if (!error) { + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_setattr_size(ip, iattr); + } xfs_iunlock(ip, iolock); } else { error = xfs_setattr_nonsize(ip, iattr, 0); @@ -1228,16 +1210,12 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode, set up the operation vectors and - * unlock the inode. + * Initialize the Linux inode and set up the operation vectors. * - * When reading existing inodes from disk this is called directly - * from xfs_iget, when creating a new inode it is called from - * xfs_ialloc after setting up the inode. - * - * We are always called with an uninitialised linux inode here. - * We need to initialise the necessary fields and take a reference - * on it. + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_ialloc after setting up the + * inode. These callers have different criteria for clearing XFS_INEW, so leave + * it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1324,9 +1302,4 @@ xfs_setup_inode( inode_has_no_xattr(inode); cache_no_acl(inode); } - - xfs_iflags_clear(ip, XFS_INEW); - barrier(); - - unlock_new_inode(inode); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index ea7a98e9cb70..a0f84abb0d09 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setup_inode(struct xfs_inode *); - /* * Internal setattr interfaces. 
*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 82e314258f73..80429891dc9b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk( error = xfs_inobt_get_rec(cur, irec, &stat); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(stat == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1); /* Check if the record contains the inode in request */ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index c31d2c2eadc4..7c7842c85a08 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,15 +116,6 @@ typedef __uint64_t __psunsigned_t; #undef XFS_NATIVE_HOST #endif -/* - * Feature macros (disable/enable) - */ -#ifdef CONFIG_SMP -#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#else -#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#endif - #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a5a945fc3bdc..4f5784f85a5b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4463,10 +4463,10 @@ xlog_do_recover( xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); + xfs_reinit_percpu_counters(log->l_mp); + xfs_buf_relse(bp); - /* We've re-read the superblock so re-initialize per-cpu counters */ - xfs_icsb_reinit_counters(log->l_mp); xlog_recover_check_summary(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4fa80e63eea2..2ce7ee3b4ec1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,18 +43,6 @@ #include "xfs_sysfs.h" -#ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); -#else - -#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) -#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) -#endif - static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; @@ -347,8 +335,7 @@ reread: goto reread; } - /* Initialize per-cpu counters */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; @@ -1087,8 +1074,6 @@ xfs_log_sbcount(xfs_mount_t *mp) if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) return 0; - xfs_icsb_sync_counters(mp, 0); - /* * we don't need to do this if we are updating the superblock * counters on every modification. @@ -1099,253 +1084,136 @@ xfs_log_sbcount(xfs_mount_t *mp) return xfs_sync_sb(mp, true); } -/* - * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply - * a delta to a specified field in the in-core superblock. Simply - * switch on the field indicated and apply the delta to that field. - * Fields are not allowed to dip below zero, so if the delta would - * do this do not apply it and return EINVAL. - * - * The m_sb_lock must be held when this routine is called. 
- */ -STATIC int -xfs_mod_incore_sb_unlocked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) +int +xfs_mod_icount( + struct xfs_mount *mp, + int64_t delta) { - int scounter; /* short counter for 32 bit fields */ - long long lcounter; /* long counter for 64 bit fields */ - long long res_used, rem; - - /* - * With the in-core superblock spin lock held, switch - * on the indicated field. Apply the delta to the - * proper field. If the fields value would dip below - * 0, then do not apply the delta and return EINVAL. - */ - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = (long long)mp->m_sb.sb_icount; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_icount = lcounter; - return 0; - case XFS_SBS_IFREE: - lcounter = (long long)mp->m_sb.sb_ifree; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_ifree = lcounter; - return 0; - case XFS_SBS_FDBLOCKS: - lcounter = (long long) - mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); - - if (delta > 0) { /* Putting blocks back */ - if (res_used > delta) { - mp->m_resblks_avail += delta; - } else { - rem = delta - res_used; - mp->m_resblks_avail = mp->m_resblks; - lcounter += rem; - } - } else { /* Taking blocks away */ - lcounter += delta; - if (lcounter >= 0) { - mp->m_sb.sb_fdblocks = lcounter + - XFS_ALLOC_SET_ASIDE(mp); - return 0; - } - - /* - * We are out of blocks, use any available reserved - * blocks if were allowed to. - */ - if (!rsvd) - return -ENOSPC; - - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; - return 0; - } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! 
" - "Consider increasing reserve pool size.", - mp->m_fsname); - return -ENOSPC; - } - - mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - return 0; - case XFS_SBS_FREXTENTS: - lcounter = (long long)mp->m_sb.sb_frextents; - lcounter += delta; - if (lcounter < 0) { - return -ENOSPC; - } - mp->m_sb.sb_frextents = lcounter; - return 0; - case XFS_SBS_DBLOCKS: - lcounter = (long long)mp->m_sb.sb_dblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_dblocks = lcounter; - return 0; - case XFS_SBS_AGCOUNT: - scounter = mp->m_sb.sb_agcount; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_agcount = scounter; - return 0; - case XFS_SBS_IMAX_PCT: - scounter = mp->m_sb.sb_imax_pct; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_imax_pct = scounter; - return 0; - case XFS_SBS_REXTSIZE: - scounter = mp->m_sb.sb_rextsize; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextsize = scounter; - return 0; - case XFS_SBS_RBMBLOCKS: - scounter = mp->m_sb.sb_rbmblocks; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rbmblocks = scounter; - return 0; - case XFS_SBS_RBLOCKS: - lcounter = (long long)mp->m_sb.sb_rblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rblocks = lcounter; - return 0; - case XFS_SBS_REXTENTS: - lcounter = (long long)mp->m_sb.sb_rextents; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextents = lcounter; - return 0; - case XFS_SBS_REXTSLOG: - scounter = mp->m_sb.sb_rextslog; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextslog = scounter; - return 0; - default: + /* deltas are +/-64, hence the large batch size of 128. */ + __percpu_counter_add(&mp->m_icount, delta, 128); + if (percpu_counter_compare(&mp->m_icount, 0) < 0) { ASSERT(0); + percpu_counter_add(&mp->m_icount, -delta); return -EINVAL; } + return 0; } -/* - * xfs_mod_incore_sb() is used to change a field in the in-core - * superblock structure by the specified delta. This modification - * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() - * routine to do the work. - */ int -xfs_mod_incore_sb( +xfs_mod_ifree( struct xfs_mount *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) + int64_t delta) { - int status; - -#ifdef HAVE_PERCPU_SB - ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); -#endif - spin_lock(&mp->m_sb_lock); - status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - return status; + percpu_counter_add(&mp->m_ifree, delta); + if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_ifree, -delta); + return -EINVAL; + } + return 0; } -/* - * Change more than one field in the in-core superblock structure at a time. - * - * The fields and changes to those fields are specified in the array of - * xfs_mod_sb structures passed in. Either all of the specified deltas - * will be applied or none of them will. If any modified field dips below 0, - * then all modifications will be backed out and EINVAL will be returned. - * - * Note that this function may not be used for the superblock values that - * are tracked with the in-memory per-cpu counters - a direct call to - * xfs_icsb_modify_counters is required for these. 
- */ int -xfs_mod_incore_sb_batch( +xfs_mod_fdblocks( struct xfs_mount *mp, - xfs_mod_sb_t *msb, - uint nmsb, - int rsvd) + int64_t delta, + bool rsvd) { - xfs_mod_sb_t *msbp; - int error = 0; + int64_t lcounter; + long long res_used; + s32 batch; + + if (delta > 0) { + /* + * If the reserve pool is depleted, put blocks back into it + * first. Most of the time the pool is full. + */ + if (likely(mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(&mp->m_fdblocks, delta); + return 0; + } + + spin_lock(&mp->m_sb_lock); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(&mp->m_fdblocks, delta); + } + spin_unlock(&mp->m_sb_lock); + return 0; + } /* - * Loop through the array of mod structures and apply each individually. - * If any fail, then back out all those which have already been applied. - * Do all of this within the scope of the m_sb_lock so that all of the - * changes will be atomic. + * Taking blocks away, we need to be more accurate the closer we + * are to zero. + * + * batch size is set to a maximum of 1024 blocks - if we are + * allocating or freeing extents larger than this, then we aren't + * going to be hammering the counter lock so a lock per update + * is not a problem. + * + * If the counter has a value of less than 2 * max batch size, + * then make everything serialise as we are really close to + * ENOSPC. + */ +#define __BATCH 1024 + if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0) + batch = 1; + else + batch = __BATCH; +#undef __BATCH + + __percpu_counter_add(&mp->m_fdblocks, delta, batch); + if (percpu_counter_compare(&mp->m_fdblocks, + XFS_ALLOC_SET_ASIDE(mp)) >= 0) { + /* we had space! */ + return 0; + } + + /* + * lock up the sb for dipping into reserves before releasing the space + * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - for (msbp = msb; msbp < (msb + nmsb); msbp++) { - ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || - msbp->msb_field > XFS_SBS_FDBLOCKS); + percpu_counter_add(&mp->m_fdblocks, -delta); + if (!rsvd) + goto fdblocks_enospc; - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - msbp->msb_delta, rsvd); - if (error) - goto unwind; + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + spin_unlock(&mp->m_sb_lock); + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); +fdblocks_enospc: spin_unlock(&mp->m_sb_lock); - return 0; + return -ENOSPC; +} -unwind: - while (--msbp >= msb) { - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - -msbp->msb_delta, rsvd); - ASSERT(error == 0); - } +int +xfs_mod_frextents( + struct xfs_mount *mp, + int64_t delta) +{ + int64_t lcounter; + int ret = 0; + + spin_lock(&mp->m_sb_lock); + lcounter = mp->m_sb.sb_frextents + delta; + if (lcounter < 0) + ret = -ENOSPC; + else + mp->m_sb.sb_frextents = lcounter; spin_unlock(&mp->m_sb_lock); - return error; + return ret; } /* @@ -1407,573 +1275,3 @@ xfs_dev_is_read_only( } return 0; } - -#ifdef HAVE_PERCPU_SB -/* - * Per-cpu incore superblock counters - * - * Simple concept, difficult implementation - * - * Basically, replace the incore superblock counters with a distributed per cpu - * counter for contended fields (e.g. free block count).
- * - * Difficulties arise in that the incore sb is used for ENOSPC checking, and - * hence needs to be accurately read when we are running low on space. Hence - * there is a method to enable and disable the per-cpu counters based on how - * much "stuff" is available in them. - * - * Basically, a counter is enabled if there is enough free resource to justify - * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local - * ENOSPC), then we disable the counters to synchronise all callers and - * re-distribute the available resources. - * - * If, once we redistributed the available resources, we still get a failure, - * we disable the per-cpu counter and go through the slow path. - * - * The slow path is the current xfs_mod_incore_sb() function. This means that - * when we disable a per-cpu counter, we need to drain its resources back to - * the global superblock. We do this after disabling the counter to prevent - * more threads from queueing up on the counter. - * - * Essentially, this means that we still need a lock in the fast path to enable - * synchronisation between the global counters and the per-cpu counters. This - * is not a problem because the lock will be local to a CPU almost all the time - * and have little contention except when we get to ENOSPC conditions. - * - * Basically, this lock becomes a barrier that enables us to lock out the fast - * path while we do things like enabling and disabling counters and - * synchronising the counters. - * - * Locking rules: - * - * 1. m_sb_lock before picking up per-cpu locks - * 2. per-cpu locks always picked up via for_each_online_cpu() order - * 3. accurate counter sync requires m_sb_lock + per cpu locks - * 4. modifying per-cpu counters requires holding per-cpu lock - * 5. modifying global counters requires holding m_sb_lock - * 6. enabling or disabling a counter requires holding the m_sb_lock - * and _none_ of the per-cpu locks. - * - * Disabled counters are only ever re-enabled by a balance operation - * that results in more free resources per CPU than a given threshold. - * To ensure counters don't remain disabled, they are rebalanced when - * the global resource goes above a higher threshold (i.e. some hysteresis - * is present to prevent thrashing). - */ - -#ifdef CONFIG_HOTPLUG_CPU -/* - * hot-plug CPU notifier support. - * - * We need a notifier per filesystem as we need to be able to identify - * the filesystem to balance the counters out. This is achieved by - * having a notifier block embedded in the xfs_mount_t and doing pointer - * magic to get the mount pointer from the notifier block address. - */ -STATIC int -xfs_icsb_cpu_notify( - struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - xfs_icsb_cnts_t *cntp; - xfs_mount_t *mp; - - mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); - cntp = (xfs_icsb_cnts_t *) - per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - /* Easy Case - initialize the area and locks, and - * then rebalance when online does everything else for us. 
*/ - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - xfs_icsb_lock(mp); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* Disable all the counters, then fold the dead cpu's - * count into the total on the global superblock and - * re-enable the counters. */ - xfs_icsb_lock(mp); - spin_lock(&mp->m_sb_lock); - xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); - xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); - xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); - - mp->m_sb.sb_icount += cntp->icsb_icount; - mp->m_sb.sb_ifree += cntp->icsb_ifree; - mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks; - - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - - xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); - spin_unlock(&mp->m_sb_lock); - xfs_icsb_unlock(mp); - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - -int -xfs_icsb_init_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t); - if (mp->m_sb_cnts == NULL) - return -ENOMEM; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - } - - mutex_init(&mp->m_icsb_mutex); - - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - -#ifdef CONFIG_HOTPLUG_CPU - mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; - mp->m_icsb_notifier.priority = 0; - register_hotcpu_notifier(&mp->m_icsb_notifier); -#endif /* CONFIG_HOTPLUG_CPU */ - - return 0; -} - -void -xfs_icsb_reinit_counters( - xfs_mount_t *mp) -{ - xfs_icsb_lock(mp); - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); -} - -void -xfs_icsb_destroy_counters( - xfs_mount_t *mp) -{ - if (mp->m_sb_cnts) { - unregister_hotcpu_notifier(&mp->m_icsb_notifier); - free_percpu(mp->m_sb_cnts); - } - mutex_destroy(&mp->m_icsb_mutex); -} - -STATIC void -xfs_icsb_lock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { - ndelay(1000); - } -} - -STATIC void -xfs_icsb_unlock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags); -} - - -STATIC void -xfs_icsb_lock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_lock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_unlock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_unlock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_count( - xfs_mount_t *mp, - xfs_icsb_cnts_t *cnt, - int flags) -{ - xfs_icsb_cnts_t *cntp; - int i; - - memset(cnt, 0, sizeof(xfs_icsb_cnts_t)); - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_lock_all_counters(mp); - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - 
cnt->icsb_icount += cntp->icsb_icount; - cnt->icsb_ifree += cntp->icsb_ifree; - cnt->icsb_fdblocks += cntp->icsb_fdblocks; - } - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_unlock_all_counters(mp); -} - -STATIC int -xfs_icsb_counter_disabled( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - return test_bit(field, &mp->m_icsb_counters); -} - -STATIC void -xfs_icsb_disable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - xfs_icsb_cnts_t cnt; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - /* - * If we are already disabled, then there is nothing to do - * here. We check before locking all the counters to avoid - * the expensive lock operation when being called in the - * slow path and the counter is already disabled. This is - * safe because the only time we set or clear this state is under - * the m_icsb_mutex. - */ - if (xfs_icsb_counter_disabled(mp, field)) - return; - - xfs_icsb_lock_all_counters(mp); - if (!test_and_set_bit(field, &mp->m_icsb_counters)) { - /* drain back to superblock */ - - xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); - switch(field) { - case XFS_SBS_ICOUNT: - mp->m_sb.sb_icount = cnt.icsb_icount; - break; - case XFS_SBS_IFREE: - mp->m_sb.sb_ifree = cnt.icsb_ifree; - break; - case XFS_SBS_FDBLOCKS: - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - break; - default: - BUG(); - } - } - - xfs_icsb_unlock_all_counters(mp); -} - -STATIC void -xfs_icsb_enable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field, - uint64_t count, - uint64_t resid) -{ - xfs_icsb_cnts_t *cntp; - int i; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - xfs_icsb_lock_all_counters(mp); - for_each_online_cpu(i) { - cntp = per_cpu_ptr(mp->m_sb_cnts, i); - switch (field) { - case XFS_SBS_ICOUNT: - cntp->icsb_icount = count + resid; - break; - case XFS_SBS_IFREE: - cntp->icsb_ifree = count + resid; - break; - case XFS_SBS_FDBLOCKS: - cntp->icsb_fdblocks = count + resid; - break; - default: - BUG(); - break; - } - resid = 0; - } - clear_bit(field, &mp->m_icsb_counters); - xfs_icsb_unlock_all_counters(mp); -} - -void -xfs_icsb_sync_counters_locked( - xfs_mount_t *mp, - int flags) -{ - xfs_icsb_cnts_t cnt; - - xfs_icsb_count(mp, &cnt, flags); - - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) - mp->m_sb.sb_icount = cnt.icsb_icount; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) - mp->m_sb.sb_ifree = cnt.icsb_ifree; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; -} - -/* - * Accurate update of per-cpu counters to incore superblock - */ -void -xfs_icsb_sync_counters( - xfs_mount_t *mp, - int flags) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, flags); - spin_unlock(&mp->m_sb_lock); -} - -/* - * Balance and enable/disable counters as necessary. - * - * Thresholds for re-enabling counters are somewhat magic. inode counts are - * chosen to be the same number as single on disk allocation chunk per CPU, and - * free blocks is something far enough zero that we aren't going thrash when we - * get near ENOSPC. We also need to supply a minimum we require per cpu to - * prevent looping endlessly when xfs_alloc_space asks for more than will - * be distributed to a single CPU but each CPU has enough blocks to be - * reenabled. - * - * Note that we can be called when counters are already disabled. 
- * xfs_icsb_disable_counter() optimises the counter locking in this case to - * prevent locking every per-cpu counter needlessly. - */ - -#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 -#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) -STATIC void -xfs_icsb_balance_counter_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int min_per_cpu) -{ - uint64_t count, resid; - int weight = num_online_cpus(); - uint64_t min = (uint64_t)min_per_cpu; - - /* disable counter and sync counter */ - xfs_icsb_disable_counter(mp, field); - - /* update counters - first CPU gets residual*/ - switch (field) { - case XFS_SBS_ICOUNT: - count = mp->m_sb.sb_icount; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_IFREE: - count = mp->m_sb.sb_ifree; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_FDBLOCKS: - count = mp->m_sb.sb_fdblocks; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) - return; - break; - default: - BUG(); - count = resid = 0; /* quiet, gcc */ - break; - } - - xfs_icsb_enable_counter(mp, field, count, resid); -} - -STATIC void -xfs_icsb_balance_counter( - xfs_mount_t *mp, - xfs_sb_field_t fields, - int min_per_cpu) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); - spin_unlock(&mp->m_sb_lock); -} - -int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) -{ - xfs_icsb_cnts_t *icsbp; - long long lcounter; /* long counter for 64 bit fields */ - int ret = 0; - - might_sleep(); -again: - preempt_disable(); - icsbp = this_cpu_ptr(mp->m_sb_cnts); - - /* - * if the counter is disabled, go to slow path - */ - if (unlikely(xfs_icsb_counter_disabled(mp, field))) - goto slow_path; - xfs_icsb_lock_cntr(icsbp); - if (unlikely(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock_cntr(icsbp); - goto slow_path; - } - - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = icsbp->icsb_icount; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_icount = lcounter; - break; - - case XFS_SBS_IFREE: - lcounter = icsbp->icsb_ifree; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_ifree = lcounter; - break; - - case XFS_SBS_FDBLOCKS: - BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); - - lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - break; - default: - BUG(); - break; - } - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - return 0; - -slow_path: - preempt_enable(); - - /* - * serialise with a mutex so we don't burn lots of cpu on - * the superblock lock. We still need to hold the superblock - * lock, however, when we modify the global structures. - */ - xfs_icsb_lock(mp); - - /* - * Now running atomically. - * - * If the counter is enabled, someone has beaten us to rebalancing. - * Drop the lock and try again in the fast path.... - */ - if (!(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock(mp); - goto again; - } - - /* - * The counter is currently disabled. Because we are - * running atomically here, we know a rebalance cannot - * be in progress. Hence we can go straight to operating - * on the global superblock. 
We do not call xfs_mod_incore_sb() - * here even though we need to get the m_sb_lock. Doing so - * will cause us to re-enter this function and deadlock. - * Hence we get the m_sb_lock ourselves and then call - * xfs_mod_incore_sb_unlocked() as the unlocked path operates - * directly on the global counters. - */ - spin_lock(&mp->m_sb_lock); - ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - /* - * Now that we've modified the global superblock, we - * may be able to re-enable the distributed counters - * (e.g. lots of space just got freed). After that - * we are done. - */ - if (ret != -ENOSPC) - xfs_icsb_balance_counter(mp, field, 0); - xfs_icsb_unlock(mp); - return ret; - -balance_counter: - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - - /* - * We may have multiple threads here if multiple per-cpu - * counters run dry at the same time. This will mean we can - * do more balances than strictly necessary but it is not - * the common slowpath case. - */ - xfs_icsb_lock(mp); - - /* - * running atomically. - * - * This will leave the counter in the correct state for future - * accesses. After the rebalance, we simply try again and our retry - * will either succeed through the fast path or slow path without - * another balance operation being required. - */ - xfs_icsb_balance_counter(mp, field, delta); - xfs_icsb_unlock(mp); - goto again; -} - -#endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 0d8abd6364d9..8c995a2ccb6f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,8 +18,6 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ -#ifdef __KERNEL__ - struct xlog; struct xfs_inode; struct xfs_mru_cache; @@ -29,44 +27,6 @@ struct xfs_quotainfo; struct xfs_dir_ops; struct xfs_da_geometry; -#ifdef HAVE_PERCPU_SB - -/* - * Valid per-cpu incore superblock counters. Note that if you add new counters, - * you may need to define new counter disabled bit field descriptors as there - * are more possible fields in the superblock that can fit in a bitfield on a - * 32 bit platform. The XFS_SBS_* values for the current current counters just - * fit. 
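Everything being deleted here (the per-counter lock bits, the hotplug notifier, the balancing thresholds described above) exists only to keep hand-rolled per-CPU counters coherent; the rest of this series replaces all of it with the kernel's generic percpu_counter API, which handles CPU hotplug and accurate summation internally. A minimal sketch of that API, using invented demo_* names rather than any code from this patch:

```c
#include <linux/percpu_counter.h>

/* Illustrative only: the demo_* names are not part of the patch. */
static struct percpu_counter demo_fdblocks;

static int demo_counter_init(s64 initial)
{
	int error = percpu_counter_init(&demo_fdblocks, initial, GFP_KERNEL);

	if (error)
		return error;

	/* cheap, possibly stale value: the XFS_ICSB_LAZY_COUNT analogue */
	pr_debug("approx: %lld\n", (long long)percpu_counter_read(&demo_fdblocks));

	/* accurate value: folds in every per-CPU delta, like xfs_icsb_count() */
	pr_debug("exact:  %lld\n", (long long)percpu_counter_sum(&demo_fdblocks));
	return 0;
}

static void demo_counter_exit(void)
{
	percpu_counter_destroy(&demo_fdblocks);
}
```

percpu_counter_read() plays the role of the lazy count, percpu_counter_sum() the role of the locked fold back into the global value, which is why the disable/enable/balance machinery above can go away entirely.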
- */ -typedef struct xfs_icsb_cnts { - uint64_t icsb_fdblocks; - uint64_t icsb_ifree; - uint64_t icsb_icount; - unsigned long icsb_flags; -} xfs_icsb_cnts_t; - -#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ - -#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ - -extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_reinit_counters(struct xfs_mount *); -extern void xfs_icsb_destroy_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters(struct xfs_mount *, int); -extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); -extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, - int64_t, int); - -#else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_reinit_counters(mp) do { } while (0) -#define xfs_icsb_sync_counters(mp, flags) do { } while (0) -#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) -#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ - xfs_mod_incore_sb(mp, field, delta, rsvd) -#endif - /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { XFS_LOWSP_1_PCNT = 0, @@ -81,8 +41,13 @@ typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ struct xfs_ail *m_ail; /* fs active log item list */ - xfs_sb_t m_sb; /* copy of fs superblock */ + + struct xfs_sb m_sb; /* copy of fs superblock */ spinlock_t m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ int m_fsname_len; /* strlen of fs name */ @@ -152,12 +117,6 @@ typedef struct xfs_mount { const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ -#ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ - unsigned long m_icsb_counters; /* disabled per-cpu counters */ - struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ - struct mutex m_icsb_mutex; /* balancer sync lock */ -#endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks @@ -301,35 +260,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* - * Per-cpu superblock locking functions - */ -#ifdef HAVE_PERCPU_SB -static inline void -xfs_icsb_lock(xfs_mount_t *mp) -{ - mutex_lock(&mp->m_icsb_mutex); -} - -static inline void -xfs_icsb_unlock(xfs_mount_t *mp) -{ - mutex_unlock(&mp->m_icsb_mutex); -} -#else -#define xfs_icsb_lock(mp) -#define xfs_icsb_unlock(mp) -#endif - -/* - * This structure is for use by the xfs_mod_incore_sb_batch() routine. - * xfs_growfs can specify a few fields which are more than int limit - */ -typedef struct xfs_mod_sb { - xfs_sb_field_t msb_field; /* Field to modify, see below */ - int64_t msb_delta; /* Change to make to specified field */ -} xfs_mod_sb_t; - -/* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. 
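The struct xfs_mount hunk above swaps the HAVE_PERCPU_SB fields for three generic percpu_counters. The bodies of the replacement helpers declared further down (xfs_mod_icount() and friends) are not part of this hunk; the sketch below is a plausible shape only, assuming nothing beyond the percpu_counter primitives, and is not the actual fs/xfs/xfs_mount.c implementation:

```c
/*
 * Sketch: enforcing "never goes negative" on top of a percpu_counter.
 * demo_mod_icount() is an invented name; the real helper may differ
 * in batch sizes and other details.
 */
static int demo_mod_icount(struct xfs_mount *mp, int64_t delta)
{
	percpu_counter_add(&mp->m_icount, delta);
	if (percpu_counter_compare(&mp->m_icount, 0) < 0) {
		/* underflow: roll the change back and report it */
		percpu_counter_add(&mp->m_icount, -delta);
		return -EINVAL;
	}
	return 0;
}
```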
*/ @@ -383,11 +313,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); - extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, - uint, int); + +extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, + bool reserved); +extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); + extern int xfs_mount_log_sb(xfs_mount_t *); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); @@ -399,6 +332,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); -#endif /* __KERNEL__ */ - #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 30ecca3037e3..f8a674d7f092 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -437,7 +437,7 @@ xfs_mru_cache_insert( if (!mru || !mru->lists) return -EINVAL; - if (radix_tree_preload(GFP_KERNEL)) + if (radix_tree_preload(GFP_NOFS)) return -ENOMEM; INIT_LIST_HEAD(&elem->list_node); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 365dd57ea760..981a657eca39 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -31,7 +31,8 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock) + uint *iolock, + bool with_imutex) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -40,8 +41,12 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) + mutex_unlock(&inode->i_mutex); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; + if (with_imutex) + mutex_lock(&inode->i_mutex); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index b7fbfce660f6..8147ac108820 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock); +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); #else -static inline int xfs_break_layouts(struct inode *inode, uint *iolock) +static inline int +xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index fbbb9e62e274..5538468c7f63 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -719,6 +719,7 @@ xfs_qm_qino_alloc( xfs_trans_t *tp; int error; int committed; + bool need_alloc = true; *ip = NULL; /* @@ -747,6 +748,7 @@ xfs_qm_qino_alloc( return error; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; + need_alloc = false; } } @@ -758,7 +760,7 @@ xfs_qm_qino_alloc( return error; } - if (!*ip) { + if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { @@ -794,11 +796,14 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); 
xfs_alert(mp, "%s failed (error %d)!", __func__, error); - return error; } - return 0; + if (need_alloc) + xfs_finish_inode_setup(*ip); + return error; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8fcc4ccc5c79..858e1e62bbaa 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -109,8 +109,6 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ -#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ @@ -361,28 +359,10 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_GQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - xfs_warn(mp, - "delaylog is the default now, option is deprecated."); - } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - xfs_warn(mp, - "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, "ihashsize")) { - xfs_warn(mp, - "ihashsize no longer used, option is deprecated."); - } else if (!strcmp(this_char, "osyncisdsync")) { - xfs_warn(mp, - "osyncisdsync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "osyncisosync")) { - xfs_warn(mp, - "osyncisosync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "irixsgid")) { - xfs_warn(mp, - "irixsgid is now a sysctl(2) variable, option is deprecated."); } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -986,6 +966,8 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); + mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } @@ -1033,23 +1015,6 @@ xfs_free_fsname( kfree(mp->m_logname); } -STATIC void -xfs_fs_put_super( - struct super_block *sb) -{ - struct xfs_mount *mp = XFS_M(sb); - - xfs_filestream_unmount(mp); - xfs_unmountfs(mp); - - xfs_freesb(mp); - xfs_icsb_destroy_counters(mp); - xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - xfs_free_fsname(mp); - kfree(mp); -} - STATIC int xfs_fs_sync_fs( struct super_block *sb, @@ -1083,8 +1048,11 @@ xfs_fs_statfs( { struct xfs_mount *mp = XFS_M(dentry->d_sb); xfs_sb_t *sbp = &mp->m_sb; - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); __uint64_t fakeinos, id; + __uint64_t icount; + __uint64_t ifree; + __uint64_t fdblocks; xfs_extlen_t lsize; __int64_t ffree; @@ -1095,17 +1063,21 @@ xfs_fs_statfs( statp->f_fsid.val[0] = (u32)id; statp->f_fsid.val[1] = (u32)(id >> 32); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); spin_lock(&mp->m_sb_lock); statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ? 
sbp->sb_logblocks : 0; statp->f_blocks = sbp->sb_dblocks - lsize; - statp->f_bfree = statp->f_bavail = - sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + spin_unlock(&mp->m_sb_lock); + + statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp); + statp->f_bavail = statp->f_bfree; + fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = - MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, @@ -1117,10 +1089,9 @@ xfs_fs_statfs( sbp->sb_icount); /* make sure statp->f_ffree does not underflow */ - ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + ffree = statp->f_files - (icount - ifree); statp->f_ffree = max_t(__int64_t, ffree, 0); - spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == @@ -1256,6 +1227,12 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1401,6 +1378,51 @@ xfs_finish_flags( return 0; } +static int +xfs_init_percpu_counters( + struct xfs_mount *mp) +{ + int error; + + error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); + if (error) + return -ENOMEM; + + error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL); + if (error) + goto free_icount; + + error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); + if (error) + goto free_ifree; + + return 0; + +free_ifree: + percpu_counter_destroy(&mp->m_ifree); +free_icount: + percpu_counter_destroy(&mp->m_icount); + return -ENOMEM; +} + +void +xfs_reinit_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); + percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); + percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); +} + +static void +xfs_destroy_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_destroy(&mp->m_icount); + percpu_counter_destroy(&mp->m_ifree); + percpu_counter_destroy(&mp->m_fdblocks); +} + STATIC int xfs_fs_fill_super( struct super_block *sb, @@ -1449,7 +1471,7 @@ xfs_fs_fill_super( if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = xfs_init_percpu_counters(mp); if (error) goto out_destroy_workqueues; @@ -1507,7 +1529,7 @@ xfs_fs_fill_super( out_free_sb: xfs_freesb(mp); out_destroy_counters: - xfs_icsb_destroy_counters(mp); + xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); out_close_devices: @@ -1524,6 +1546,24 @@ out_destroy_workqueues: goto out_free_sb; } +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + STATIC struct dentry * xfs_fs_mount( struct file_system_type *fs_type, diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2b830c2f322e..499058fea303 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations; extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops 
xfs_quotactl_operations; +extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); + #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 25791df6f638..3df411eadb86 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -177,7 +177,7 @@ xfs_symlink( int pathlen; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -221,7 +221,7 @@ xfs_symlink( XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) - goto std_return; + return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; @@ -241,7 +241,7 @@ xfs_symlink( } if (error) { cancel_flags = 0; - goto error_return; + goto out_trans_cancel; } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); @@ -252,7 +252,7 @@ xfs_symlink( */ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { error = -EPERM; - goto error_return; + goto out_trans_cancel; } /* @@ -261,7 +261,7 @@ xfs_symlink( error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); if (error) - goto error_return; + goto out_trans_cancel; /* * Check for ability to enter directory entry, if no space reserved. @@ -269,7 +269,7 @@ xfs_symlink( if (!resblks) { error = xfs_dir_canenter(tp, dp, link_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * Initialize the bmap freelist prior to calling either @@ -282,15 +282,14 @@ xfs_symlink( */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto error_return; - goto error1; - } + if (error) + goto out_trans_cancel; /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the * error path. 
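The label renames in this xfs_symlink() rework (error1/error2 becoming out_trans_cancel, out_bmap_cancel and out_release_inode) follow the usual kernel unwind idiom: each failure jumps to the label that undoes everything set up so far, in reverse order of setup. A self-contained sketch of that shape, with demo_* stubs standing in for the XFS calls:

```c
/* Illustrative stubs; stand-ins for the XFS calls in the hunk above. */
static int demo_reserve(void)       { return 0; }
static int demo_map_blocks(void)    { return 0; }
static int demo_commit(void)        { return 0; }
static void demo_bmap_cancel(void)  { }
static void demo_trans_cancel(void) { }

/* The error-unwind shape the renamed out_* labels implement. */
static int demo_unwind_pattern(void)
{
	int error;

	error = demo_reserve();
	if (error)
		goto out_trans_cancel;

	error = demo_map_blocks();
	if (error)
		goto out_bmap_cancel;

	return demo_commit();

out_bmap_cancel:
	demo_bmap_cancel();	/* undo in reverse order of setup */
out_trans_cancel:
	demo_trans_cancel();
	return error;
}
```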
*/ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); @@ -330,7 +329,7 @@ xfs_symlink( XFS_BMAPI_METADATA, &first_block, resblks, mval, &nmaps, &free_list); if (error) - goto error2; + goto out_bmap_cancel; if (resblks) resblks -= fs_blocks; @@ -348,7 +347,7 @@ xfs_symlink( BTOBB(byte_cnt), 0); if (!bp) { error = -ENOMEM; - goto error2; + goto out_bmap_cancel; } bp->b_ops = &xfs_symlink_buf_ops; @@ -378,7 +377,7 @@ xfs_symlink( error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, &first_block, &free_list, resblks); if (error) - goto error2; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -392,10 +391,13 @@ xfs_symlink( } error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -403,20 +405,28 @@ xfs_symlink( *ipp = ip; return 0; - error2: - IRELE(ip); - error1: +out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 51372e34d988..615781bf4ee5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %pf", + TP_printk("dev %d:%d agno %u refcount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror, 
__entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d error %d flags %s caller %pf", + "lock %d error %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class, __entry->lock_flags = lock_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), @@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); +DEFINE_INODE_EVENT(xfs_insert_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); @@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, @@ -1217,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1333,7 +1342,7 @@ TRACE_EVENT(xfs_bunmap, __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" - "flags %s caller %pf", + "flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1466,7 +1475,7 @@ TRACE_EVENT(xfs_agf, ), TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " - "freeblks %u longest %u caller %pf", + "freeblks %u longest %u caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index eb90cd59a0ec..220ef2c906b2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -173,7 +173,7 @@ xfs_trans_reserve( uint rtextents) { int error = 0; - int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); @@ -184,8 +184,7 @@ xfs_trans_reserve( * fail if the count would go below zero. 
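From a caller's point of view, the new xfs_mod_fdblocks() declared earlier keeps the old contract spelled out in the comment above: a negative delta consumes free blocks and fails rather than letting the counter go below zero. A hedged sketch of that usage; only the xfs_mod_fdblocks() signature comes from this patch, the demo function and its failure path are invented:

```c
static int demo_do_work(void) { return 0; }	/* placeholder */

static int demo_use_blocks(struct xfs_mount *mp, int64_t blocks, bool rsvd)
{
	int error;

	/* negative delta consumes free space; fails instead of going < 0 */
	error = xfs_mod_fdblocks(mp, -blocks, rsvd);
	if (error)
		return error;	/* typically -ENOSPC */

	error = demo_do_work();
	if (error)
		/* undo: a positive delta returns the blocks */
		xfs_mod_fdblocks(mp, blocks, rsvd);

	return error;
}
```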
*/ if (blocks > 0) { - error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); return -ENOSPC; @@ -236,8 +235,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, - -((int64_t)rtextents), rsvd); + error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -268,8 +266,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - (int64_t)blocks, rsvd); + xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -488,6 +485,54 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } +STATIC int +xfs_sb_mod8( + uint8_t *field, + int8_t delta) +{ + int8_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod32( + uint32_t *field, + int32_t delta) +{ + int32_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod64( + uint64_t *field, + int64_t delta) +{ + int64_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + /* * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations * and apply superblock counter changes to the in-core superblock. The * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT * applied to the in-core superblock. The idea is that that has already been * done. * - * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). - * However, we have to ensure that we only modify each superblock field only - * once because the application of the delta values may not be atomic. That can - * lead to ENOSPC races occurring if we have two separate modifcations of the - * free space counter to put back the entire reservation and then take away - * what we used. - * * If we are not logging superblock counters, then the inode allocated/free and * used block counts are not updated in the on disk superblock.
In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we @@ -509,21 +547,15 @@ xfs_trans_apply_sb_deltas( */ void xfs_trans_unreserve_and_mod_sb( - xfs_trans_t *tp) + struct xfs_trans *tp) { - xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ - xfs_mod_sb_t *msbp; - xfs_mount_t *mp = tp->t_mountp; - /* REFERENCED */ - int error; - int rsvd; - int64_t blkdelta = 0; - int64_t rtxdelta = 0; - int64_t idelta = 0; - int64_t ifreedelta = 0; - - msbp = msb; - rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; + int error; /* calculate deltas */ if (tp->t_blk_res > 0) @@ -547,97 +579,115 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - blkdelta, rsvd); + error = xfs_mod_fdblocks(mp, blkdelta, rsvd); if (error) goto out; } if (idelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, - idelta, rsvd); + error = xfs_mod_icount(mp, idelta); if (error) goto out_undo_fdblocks; } if (ifreedelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, - ifreedelta, rsvd); + error = xfs_mod_ifree(mp, ifreedelta); if (error) goto out_undo_icount; } + if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + return; + /* apply remaining deltas */ - if (rtxdelta != 0) { - msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = rtxdelta; - msbp++; + spin_lock(&mp->m_sb_lock); + if (rtxdelta) { + error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); + if (error) + goto out_undo_ifree; } - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { - if (tp->t_dblocks_delta != 0) { - msbp->msb_field = XFS_SBS_DBLOCKS; - msbp->msb_delta = tp->t_dblocks_delta; - msbp++; - } - if (tp->t_agcount_delta != 0) { - msbp->msb_field = XFS_SBS_AGCOUNT; - msbp->msb_delta = tp->t_agcount_delta; - msbp++; - } - if (tp->t_imaxpct_delta != 0) { - msbp->msb_field = XFS_SBS_IMAX_PCT; - msbp->msb_delta = tp->t_imaxpct_delta; - msbp++; - } - if (tp->t_rextsize_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSIZE; - msbp->msb_delta = tp->t_rextsize_delta; - msbp++; - } - if (tp->t_rbmblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBMBLOCKS; - msbp->msb_delta = tp->t_rbmblocks_delta; - msbp++; - } - if (tp->t_rblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBLOCKS; - msbp->msb_delta = tp->t_rblocks_delta; - msbp++; - } - if (tp->t_rextents_delta != 0) { - msbp->msb_field = XFS_SBS_REXTENTS; - msbp->msb_delta = tp->t_rextents_delta; - msbp++; - } - if (tp->t_rextslog_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSLOG; - msbp->msb_delta = tp->t_rextslog_delta; - msbp++; - } + if (tp->t_dblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); + if (error) + goto out_undo_frextents; } - - /* - * If we need to change anything, do it. 
- */ - if (msbp > msb) { - error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, - (uint)(msbp - msb), rsvd); + if (tp->t_agcount_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); if (error) - goto out_undo_ifreecount; + goto out_undo_dblocks; } - + if (tp->t_imaxpct_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); + if (error) + goto out_undo_agcount; + } + if (tp->t_rextsize_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, + tp->t_rextsize_delta); + if (error) + goto out_undo_imaxpct; + } + if (tp->t_rbmblocks_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, + tp->t_rbmblocks_delta); + if (error) + goto out_undo_rextsize; + } + if (tp->t_rblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); + if (error) + goto out_undo_rbmblocks; + } + if (tp->t_rextents_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rextents, + tp->t_rextents_delta); + if (error) + goto out_undo_rblocks; + } + if (tp->t_rextslog_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, + tp->t_rextslog_delta); + if (error) + goto out_undo_rextents; + } + spin_unlock(&mp->m_sb_lock); return; -out_undo_ifreecount: +out_undo_rextents: + if (tp->t_rextents_delta) + xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); +out_undo_rblocks: + if (tp->t_rblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); +out_undo_rbmblocks: + if (tp->t_rbmblocks_delta) + xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); +out_undo_rextsize: + if (tp->t_rextsize_delta) + xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); +out_undo_imaxpct: + if (tp->t_imaxpct_delta) + xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); +out_undo_agcount: + if (tp->t_agcount_delta) + xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); +out_undo_dblocks: + if (tp->t_dblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); +out_undo_frextents: + if (rtxdelta) + xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); +out_undo_ifree: + spin_unlock(&mp->m_sb_lock); if (ifreedelta) - xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); + xfs_mod_ifree(mp, -ifreedelta); out_undo_icount: if (idelta) - xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); + xfs_mod_icount(mp, -idelta); out_undo_fdblocks: if (blkdelta) - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); + xfs_mod_fdblocks(mp, -blkdelta, rsvd); out: ASSERT(error == 0); return; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 69f6e475de97..c036815183cb 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -35,7 +35,7 @@ static int xfs_xattr_get(struct dentry *dentry, const char *name, void *value, size_t size, int xflags) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error, asize = size; if (strcmp(name, "") == 0) @@ -57,7 +57,7 @@ static int xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); if (strcmp(name, "") == 0) return -EINVAL; @@ -197,7 +197,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; /* diff --git a/include/acpi/acpi_io.h b/include/acpi/acpi_io.h
index 444671e9c65d..dd86c5fc102d 100644 --- a/include/acpi/acpi_io.h +++ b/include/acpi/acpi_io.h @@ -3,11 +3,15 @@ #include <linux/io.h> +#include <asm/acpi.h> + +#ifndef acpi_os_ioremap static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) { return ioremap_cache(phys, size); } +#endif void __iomem *__init_refok acpi_os_map_iomem(acpi_physical_address phys, acpi_size size); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index b95dc32a6e6b..4188a4d3b597 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -196,7 +196,7 @@ struct acpi_processor_flags { struct acpi_processor { acpi_handle handle; u32 acpi_id; - u32 phys_id; /* CPU hardware ID such as APIC ID for x86 */ + phys_cpuid_t phys_id; /* CPU hardware ID such as APIC ID for x86 */ u32 id; /* CPU logical ID allocated by OS */ u32 pblk; int performance_platform_limit; @@ -310,8 +310,8 @@ static inline int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) #endif /* CONFIG_CPU_FREQ */ /* in processor_core.c */ -int acpi_get_phys_id(acpi_handle, int type, u32 acpi_id); -int acpi_map_cpuid(int phys_id, u32 acpi_id); +phys_cpuid_t acpi_get_phys_id(acpi_handle, int type, u32 acpi_id); +int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id); int acpi_get_cpuid(acpi_handle, int type, u32 acpi_id); /* in processor_pdc.c */ diff --git a/include/dt-bindings/dma/jz4780-dma.h b/include/dt-bindings/dma/jz4780-dma.h new file mode 100644 index 000000000000..df017fdfb44e --- /dev/null +++ b/include/dt-bindings/dma/jz4780-dma.h @@ -0,0 +1,49 @@ +#ifndef __DT_BINDINGS_DMA_JZ4780_DMA_H__ +#define __DT_BINDINGS_DMA_JZ4780_DMA_H__ + +/* + * Request type numbers for the JZ4780 DMA controller (written to the DRTn + * register for the channel). 
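The new #ifndef guard in the acpi_io.h hunk above turns acpi_os_ioremap() into an overridable default: an architecture that needs something other than a cached mapping supplies its own inline in <asm/acpi.h> and defines the macro so the generic fallback compiles out. A sketch of such an override; the ioremap() body is an illustrative choice, not taken from this patch:

```c
/* In an architecture's asm/acpi.h (hypothetical example). */
static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
					    acpi_size size)
{
	/* this arch wants plain (uncached) device mappings */
	return ioremap(phys, size);
}
/* defining the name suppresses the generic cached default */
#define acpi_os_ioremap acpi_os_ioremap
```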
+ */ +#define JZ4780_DMA_I2S1_TX 0x4 +#define JZ4780_DMA_I2S1_RX 0x5 +#define JZ4780_DMA_I2S0_TX 0x6 +#define JZ4780_DMA_I2S0_RX 0x7 +#define JZ4780_DMA_AUTO 0x8 +#define JZ4780_DMA_SADC_RX 0x9 +#define JZ4780_DMA_UART4_TX 0xc +#define JZ4780_DMA_UART4_RX 0xd +#define JZ4780_DMA_UART3_TX 0xe +#define JZ4780_DMA_UART3_RX 0xf +#define JZ4780_DMA_UART2_TX 0x10 +#define JZ4780_DMA_UART2_RX 0x11 +#define JZ4780_DMA_UART1_TX 0x12 +#define JZ4780_DMA_UART1_RX 0x13 +#define JZ4780_DMA_UART0_TX 0x14 +#define JZ4780_DMA_UART0_RX 0x15 +#define JZ4780_DMA_SSI0_TX 0x16 +#define JZ4780_DMA_SSI0_RX 0x17 +#define JZ4780_DMA_SSI1_TX 0x18 +#define JZ4780_DMA_SSI1_RX 0x19 +#define JZ4780_DMA_MSC0_TX 0x1a +#define JZ4780_DMA_MSC0_RX 0x1b +#define JZ4780_DMA_MSC1_TX 0x1c +#define JZ4780_DMA_MSC1_RX 0x1d +#define JZ4780_DMA_MSC2_TX 0x1e +#define JZ4780_DMA_MSC2_RX 0x1f +#define JZ4780_DMA_PCM0_TX 0x20 +#define JZ4780_DMA_PCM0_RX 0x21 +#define JZ4780_DMA_SMB0_TX 0x24 +#define JZ4780_DMA_SMB0_RX 0x25 +#define JZ4780_DMA_SMB1_TX 0x26 +#define JZ4780_DMA_SMB1_RX 0x27 +#define JZ4780_DMA_SMB2_TX 0x28 +#define JZ4780_DMA_SMB2_RX 0x29 +#define JZ4780_DMA_SMB3_TX 0x2a +#define JZ4780_DMA_SMB3_RX 0x2b +#define JZ4780_DMA_SMB4_TX 0x2c +#define JZ4780_DMA_SMB4_RX 0x2d +#define JZ4780_DMA_DES_TX 0x2e +#define JZ4780_DMA_DES_RX 0x2f + +#endif /* __DT_BINDINGS_DMA_JZ4780_DMA_H__ */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index dd12127f171c..e4da5e35e29c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -79,6 +79,7 @@ enum acpi_irq_model_id { ACPI_IRQ_MODEL_IOAPIC, ACPI_IRQ_MODEL_IOSAPIC, ACPI_IRQ_MODEL_PLATFORM, + ACPI_IRQ_MODEL_GIC, ACPI_IRQ_MODEL_COUNT }; @@ -152,9 +153,14 @@ void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void); +#ifndef PHYS_CPUID_INVALID +typedef u32 phys_cpuid_t; +#define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) +#endif + #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ -int acpi_map_cpu(acpi_handle handle, int physid, int *pcpu); +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu); int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ diff --git a/include/linux/acpi_irq.h b/include/linux/acpi_irq.h new file mode 100644 index 000000000000..f10c87265855 --- /dev/null +++ b/include/linux/acpi_irq.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_ACPI_IRQ_H +#define _LINUX_ACPI_IRQ_H + +#include <linux/irq.h> + +#ifndef acpi_irq_init +static inline void acpi_irq_init(void) { } +#endif + +#endif /* _LINUX_ACPI_IRQ_H */ diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 179b38ffd351..388574ea38ed 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -60,12 +60,15 @@ struct dma_chan_ref { * dependency chain * @ASYNC_TX_FENCE: specify that the next operation in the dependency * chain uses this operation's result as an input + * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the + * input data. Required for rmw case. 
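The new ASYNC_TX_PQ_XOR_DST flag (documented in the async_tx.h hunk above) lets a RAID6 read-modify-write XOR freshly computed parity into the existing P/Q pages instead of overwriting them. A sketch of requesting such an update through the existing async_gen_syndrome() entry point; the demo_* wrapper and its parameter handling are assumptions, not code from this patch:

```c
#include <linux/async_tx.h>

/*
 * Sketch: request an RMW-style P/Q update. blocks[], disks, len and
 * scribble are assumed to be set up as for any async_gen_syndrome()
 * call; the new flag is the only point of interest here.
 */
static struct dma_async_tx_descriptor *
demo_rmw_syndrome(struct page **blocks, int disks, size_t len,
		  addr_conv_t *scribble)
{
	struct async_submit_ctl submit;

	init_async_submit(&submit, ASYNC_TX_FENCE | ASYNC_TX_PQ_XOR_DST,
			  NULL, NULL, NULL, scribble);
	/* with PQ_XOR_DST, P/Q in blocks[] are inputs as well as outputs */
	return async_gen_syndrome(blocks, 0, disks, len, &submit);
}
```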
*/ enum async_tx_flags { ASYNC_TX_XOR_ZERO_DST = (1 << 0), ASYNC_TX_XOR_DROP_DST = (1 << 1), ASYNC_TX_ACK = (1 << 2), ASYNC_TX_FENCE = (1 << 3), + ASYNC_TX_PQ_XOR_DST = (1 << 4), }; /** diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 71e05bbf8ceb..4763ad64e832 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -50,6 +50,19 @@ #define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40) #define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41) #define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */ +#define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42) +#define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43) +#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44) +#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45) +#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46) +#define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */ +#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */ +#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */ +#define CEPH_FEATURE_MDS_QUOTA (1ULL<<47) +#define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */ +#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) +// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY +#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -93,7 +106,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_EXPORT_PEER | \ CEPH_FEATURE_OSDMAP_ENC | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ - CEPH_FEATURE_OSD_PRIMARY_AFFINITY) + CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ + CEPH_FEATURE_CRUSH_V4) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 31eb03d0c766..d7d072a25c27 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -323,6 +323,7 @@ enum { CEPH_MDS_OP_MKSNAP = 0x01400, CEPH_MDS_OP_RMSNAP = 0x01401, CEPH_MDS_OP_LSSNAP = 0x00402, + CEPH_MDS_OP_RENAMESNAP = 0x01403, }; extern const char *ceph_mds_op_name(int op); diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h index 1df086d7882d..29cf897cc5cd 100644 --- a/include/linux/ceph/debugfs.h +++ b/include/linux/ceph/debugfs.h @@ -7,13 +7,7 @@ #define CEPH_DEFINE_SHOW_FUNC(name) \ static int name##_open(struct inode *inode, struct file *file) \ { \ - struct seq_file *sf; \ - int ret; \ - \ - ret = single_open(file, name, NULL); \ - sf = file->private_data; \ - sf->private = inode->i_private; \ - return ret; \ + return single_open(file, name, inode->i_private); \ } \ \ static const struct file_operations name##_fops = { \ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 16fff9608848..30f92cefaa72 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -135,6 +135,7 @@ struct ceph_client { struct dentry *debugfs_dir; struct dentry *debugfs_monmap; struct dentry *debugfs_osdmap; + struct dentry *debugfs_options; #endif }; @@ -191,6 +192,7 @@ extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, int (*parse_extra_token)(char *c, void *private), void *private); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); 
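The CEPH_DEFINE_SHOW_FUNC simplification a few hunks above works because single_open() already stashes its third argument in seq_file->private, so the open callback no longer needs to copy inode->i_private by hand. A self-contained sketch of the resulting pattern, mirroring what the macro expands to:

```c
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *p)
{
	/* i_private, handed to single_open(), lands in m->private */
	seq_printf(m, "state: %p\n", m->private);
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, inode->i_private);
}

static const struct file_operations demo_fops = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
```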
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 561ea896c657..e55c08bc3a96 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -175,13 +175,12 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) __u8 version; if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { - pr_warning("incomplete pg encoding"); - + pr_warn("incomplete pg encoding\n"); return -EINVAL; } version = ceph_decode_8(p); if (version > 1) { - pr_warning("do not understand pg encoding %d > 1", + pr_warn("do not understand pg encoding %d > 1\n", (int)version); return -EINVAL; } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 135509821c39..d27d0152271f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -253,4 +253,10 @@ extern void clocksource_of_init(void); static inline void clocksource_of_init(void) {} #endif +#ifdef CONFIG_ACPI +void acpi_generic_timer_init(void); +#else +static inline void acpi_generic_timer_init(void) { } +#endif + #endif /* _LINUX_CLOCKSOURCE_H */ diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 4fad5f8ee01d..48a1a7d100f1 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -96,13 +96,15 @@ struct crush_rule { * uniform O(1) poor poor * list O(n) optimal poor * tree O(log n) good good - * straw O(n) optimal optimal + * straw O(n) better better + * straw2 O(n) optimal optimal */ enum { CRUSH_BUCKET_UNIFORM = 1, CRUSH_BUCKET_LIST = 2, CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 + CRUSH_BUCKET_STRAW = 4, + CRUSH_BUCKET_STRAW2 = 5, }; extern const char *crush_bucket_alg_name(int alg); @@ -149,6 +151,11 @@ struct crush_bucket_straw { __u32 *straws; /* 16-bit fixed point */ }; +struct crush_bucket_straw2 { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ +}; + /* @@ -189,6 +196,7 @@ extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); extern void crush_destroy_bucket_list(struct crush_bucket_list *b); extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); +extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b); extern void crush_destroy_bucket(struct crush_bucket *b); extern void crush_destroy_rule(struct crush_rule *r); extern void crush_destroy(struct crush_map *map); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 694e1fe1c4b4..2f0b431b73e0 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -163,6 +163,33 @@ struct dma_buf_attachment { }; /** + * struct dma_buf_export_info - holds information needed to export a dma_buf + * @exp_name: name of the exporting module - useful for debugging. + * @ops: Attach allocator-defined dma buf ops to the new buffer + * @size: Size of the buffer + * @flags: mode flags for the file + * @resv: reservation-object, NULL to allocate default one + * @priv: Attach private data of allocator to this buffer + * + * This structure holds the information required to export the buffer. Used + * with dma_buf_export() only. 
+ */ +struct dma_buf_export_info { + const char *exp_name; + const struct dma_buf_ops *ops; + size_t size; + int flags; + struct reservation_object *resv; + void *priv; +}; + +/** + * helper macro for exporters; zeros and fills in most common values + */ +#define DEFINE_DMA_BUF_EXPORT_INFO(a) \ + struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME } + +/** * get_dma_buf - convenience wrapper for get_file. * @dmabuf: [in] pointer to dma_buf * @@ -181,12 +208,7 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *dmabuf_attach); -struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops, - size_t size, int flags, const char *, - struct reservation_object *); - -#define dma_buf_export(priv, ops, size, flags, resv) \ - dma_buf_export_named(priv, ops, size, flags, KBUILD_MODNAME, resv) +struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info); int dma_buf_fd(struct dma_buf *dmabuf, int flags); struct dma_buf *dma_buf_get(int fd); diff --git a/include/linux/amba/xilinx_dma.h b/include/linux/dma/xilinx_dma.h index 34b98f276ed0..34b98f276ed0 100644 --- a/include/linux/amba/xilinx_dma.h +++ b/include/linux/dma/xilinx_dma.h diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b6997a0cb528..ad419757241f 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -11,10 +11,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * The full GNU General Public License is included in this distribution in the * file called COPYING. 
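With this change, dma_buf_export_named() and the old five-argument macro give way to a single struct-based call, and DEFINE_DMA_BUF_EXPORT_INFO() pre-fills exp_name with KBUILD_MODNAME so exporters only set what they need. A sketch of the new exporter flow; my_dmabuf_ops and my_buffer are placeholders for a real exporter's state, and error handling is omitted:

```c
#include <linux/dma-buf.h>

static struct dma_buf *demo_export(const struct dma_buf_ops *my_dmabuf_ops,
				   void *my_buffer, size_t size)
{
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);	/* exp_name preset */

	exp_info.ops = my_dmabuf_ops;
	exp_info.size = size;
	exp_info.flags = O_RDWR;
	exp_info.priv = my_buffer;	/* resv left NULL: default object */

	return dma_buf_export(&exp_info);	/* ERR_PTR() on failure */
}
```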
*/ @@ -574,7 +570,6 @@ struct dma_tx_state { * @copy_align: alignment shift for memcpy operations * @xor_align: alignment shift for xor operations * @pq_align: alignment shift for pq operations - * @fill_align: alignment shift for memset operations * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @src_addr_widths: bit mask of src addr widths the device supports @@ -625,7 +620,6 @@ struct dma_device { u8 copy_align; u8 xor_align; u8 pq_align; - u8 fill_align; #define DMA_HAS_PQ_CONTINUE (1 << 15) int dev_id; @@ -826,12 +820,6 @@ static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1, return dmaengine_check_align(dev->pq_align, off1, off2, len); } -static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1, - size_t off2, size_t len) -{ - return dmaengine_check_align(dev->fill_align, off1, off2, len); -} - static inline void dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue) { @@ -1098,7 +1086,6 @@ void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); -struct dma_chan *net_dma_find_channel(void); #define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) #define dma_request_slave_channel_compat(mask, x, y, dev, name) \ __dma_request_slave_channel_compat(&(mask), x, y, dev, name) @@ -1116,27 +1103,4 @@ static inline struct dma_chan return __dma_request_channel(mask, fn, fn_param); } - -/* --- Helper iov-locking functions --- */ - -struct dma_page_list { - char __user *base_address; - int nr_pages; - struct page **pages; -}; - -struct dma_pinned_list { - int nr_iovecs; - struct dma_page_list page_list[0]; -}; - -struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len); -void dma_unpin_iovec_pages(struct dma_pinned_list* pinned_list); - -dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len); -dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, struct page *page, - unsigned int offset, size_t len); - #endif /* DMAENGINE_H */ diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 31591686ac2d..996111000a8c 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -21,4 +21,10 @@ struct space_resv { #define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) #define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) +#define FALLOC_FL_SUPPORTED_MASK (FALLOC_FL_KEEP_SIZE | \ + FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + #endif /* _FALLOC_H_ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c7496f263860..35ec87e490b1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1820,7 +1820,7 @@ struct super_operations { #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) #define __I_DIO_WAKEUP 9 -#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) +#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define __I_DIRTY_TIME_EXPIRED 12 @@ -2644,6 +2644,9 @@ enum { /* filesystem can handle aio writes beyond i_size */ DIO_ASYNC_EXTEND = 0x04, + + /* inode/fs/bdev does not need truncate protection */ + DIO_SKIP_DIO_COUNT = 0x08, }; void dio_end_io(struct bio *bio, int error); @@ -2666,7 
+2669,31 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, #endif void inode_dio_wait(struct inode *inode); -void inode_dio_done(struct inode *inode); + +/* + * inode_dio_begin - signal start of a direct I/O request + * @inode: inode the direct I/O happens on + * + * This is called once we start processing a direct I/O request, and is + * used to account in-flight direct I/O so that inode_dio_wait() blocks + * until all outstanding requests have completed. + */ +static inline void inode_dio_begin(struct inode *inode) +{ + atomic_inc(&inode->i_dio_count); +} + +/* + * inode_dio_end - signal finish of a direct I/O request + * @inode: inode the direct I/O happens on + * + * This is called once we've finished processing a direct I/O request, + * and is used to wake up callers waiting for direct I/O to be quiesced. + */ +static inline void inode_dio_end(struct inode *inode) +{ + if (atomic_dec_and_test(&inode->i_dio_count)) + wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); +} extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index a65208a8fe18..796ef9645827 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -115,10 +115,19 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) * Extended Capability Register */ -#define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1) +#define ecap_pss(e) ((e >> 35) & 0x1f) +#define ecap_eafs(e) ((e >> 34) & 0x1) +#define ecap_nwfs(e) ((e >> 33) & 0x1) +#define ecap_srs(e) ((e >> 31) & 0x1) +#define ecap_ers(e) ((e >> 30) & 0x1) +#define ecap_prs(e) ((e >> 29) & 0x1) +#define ecap_pasid(e) ((e >> 28) & 0x1) +#define ecap_dis(e) ((e >> 27) & 0x1) +#define ecap_nest(e) ((e >> 26) & 0x1) +#define ecap_mts(e) ((e >> 25) & 0x1) +#define ecap_ecs(e) ((e >> 24) & 0x1) #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) -#define ecap_max_iotlb_offset(e) \ - (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16) +#define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16) #define ecap_coherent(e) ((e) & 0x1) #define ecap_qis(e) ((e) & 0x2) #define ecap_pass_through(e) ((e >> 6) & 0x1) @@ -180,6 +189,9 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define DMA_GSTS_IRES (((u32)1) << 25) #define DMA_GSTS_CFIS (((u32)1) << 23) +/* DMA_RTADDR_REG */ +#define DMA_RTADDR_RTT (((u64)1) << 11) + /* CCMD_REG */ #define DMA_CCMD_ICC (((u64)1) << 63) #define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) diff --git a/include/linux/irqchip/arm-gic-acpi.h b/include/linux/irqchip/arm-gic-acpi.h new file mode 100644 index 000000000000..de3419ed3937 --- /dev/null +++ b/include/linux/irqchip/arm-gic-acpi.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2014, Linaro Ltd. + * Author: Tomasz Nowicki <tomasz.nowicki@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef ARM_GIC_ACPI_H_ +#define ARM_GIC_ACPI_H_ + +#ifdef CONFIG_ACPI + +/* + * Hard-coded here because the memory size can not be obtained from the + * MADT (the FDT can provide it); that is not a problem, since these sizes + * can be inferred from the GIC spec.
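The open-coded inode_dio_begin()/inode_dio_end() pair above replaces inode_dio_done() and makes the i_dio_count contract explicit: direct I/O paths bracket each request, and inode_dio_wait() sleeps until the count drains to zero. A sketch of that contract, with demo_* placeholders standing in for a filesystem's real paths:

```c
static void demo_direct_io(struct inode *inode)
{
	inode_dio_begin(inode);		/* count this request */
	/* ... submit and complete the direct I/O ... */
	inode_dio_end(inode);		/* may wake inode_dio_wait() */
}

static void demo_truncate(struct inode *inode)
{
	/* blocks until i_dio_count drops to zero */
	inode_dio_wait(inode);
	/* ... safe to shrink the file now ... */
}
```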
+ */ +#define ACPI_GICV2_DIST_MEM_SIZE (SZ_4K) +#define ACPI_GIC_CPU_IF_MEM_SIZE (SZ_8K) + +struct acpi_table_header; + +int gic_v2_acpi_init(struct acpi_table_header *table); +void acpi_gic_init(void); +#else +static inline void acpi_gic_init(void) { } +#endif + +#endif /* ARM_GIC_ACPI_H_ */ diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 9962c6bb1311..6db19f35f7c5 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -61,8 +61,8 @@ struct lguest_data { u32 tsc_khz; /* Fields initialized by the Guest at boot: */ - /* Instruction range to suppress interrupts even if enabled */ - unsigned long noirq_start, noirq_end; + /* Instruction to suppress interrupts even if enabled */ + unsigned long noirq_iret; /* Address above which page tables are all identical. */ unsigned long kernel_address; /* The vector to try to use for system calls (0x40 or 0x80). */ diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 0e166b92f5b4..324a34683971 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -16,6 +16,7 @@ #ifndef __LINUX_MFD_CROS_EC_H #define __LINUX_MFD_CROS_EC_H +#include <linux/cdev.h> #include <linux/notifier.h> #include <linux/mfd/cros_ec_commands.h> #include <linux/mutex.h> @@ -38,20 +39,20 @@ enum { /* * @version: Command version number (often 0) * @command: Command to send (EC_CMD_...) - * @outdata: Outgoing data to EC * @outsize: Outgoing length in bytes - * @indata: Where to put the incoming data from EC * @insize: Max number of bytes to accept from EC * @result: EC's response to the command (separate from communication failure) + * @outdata: Outgoing data to EC + * @indata: Where to put the incoming data from EC */ struct cros_ec_command { uint32_t version; uint32_t command; - uint8_t *outdata; uint32_t outsize; - uint8_t *indata; uint32_t insize; uint32_t result; + uint8_t outdata[EC_PROTO2_MAX_PARAM_SIZE]; + uint8_t indata[EC_PROTO2_MAX_PARAM_SIZE]; }; /** @@ -59,9 +60,17 @@ struct cros_ec_command { * * @ec_name: name of EC device (e.g. 'chromeos-ec') * @phys_name: name of physical comms layer (e.g. 'i2c-4') - * @dev: Device pointer + * @dev: Device pointer for physical comms device + * @vdev: Device pointer for virtual comms device + * @cdev: Character device structure for virtual comms device * @was_wake_device: true if this device was set to wake the system from * sleep at the last suspend + * @cmd_readmem: direct read of the EC memory-mapped region, if supported + * @offset is within EC_LPC_ADDR_MEMMAP region. + * @bytes: number of bytes to read. zero means "read a string" (including + * the trailing '\0'). At most only EC_MEMMAP_SIZE bytes can be read. + * Caller must ensure that the buffer is large enough for the result when + * reading a string. 
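A sketch of how a consumer might use the cmd_readmem hook documented above, assuming the transport provides it and follows the usual negative-errno-on-failure convention; EC_MEMMAP_SWITCHES is one of the standard offsets from cros_ec_commands.h, and handle_switches() is a hypothetical caller-side helper:

u8 switches;

if (ec_dev->cmd_readmem &&
    ec_dev->cmd_readmem(ec_dev, EC_MEMMAP_SWITCHES, 1, &switches) >= 0)
	handle_switches(switches);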
* * @priv: Private data * @irq: Interrupt to use @@ -90,8 +99,12 @@ struct cros_ec_device { const char *ec_name; const char *phys_name; struct device *dev; + struct device *vdev; + struct cdev cdev; bool was_wake_device; struct class *cros_class; + int (*cmd_readmem)(struct cros_ec_device *ec, unsigned int offset, + unsigned int bytes, void *dest); /* These are used to implement the platform-specific interface */ void *priv; diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 605812820e48..24b86d538e88 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -111,6 +111,8 @@ struct dma_chan; * data for the MMC controller */ struct tmio_mmc_data { + void *chan_priv_tx; + void *chan_priv_rx; unsigned int hclk; unsigned long capabilities; unsigned long capabilities2; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index f9ce34bec45b..83e80ab94500 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1345,6 +1345,10 @@ int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, + int port); +__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port); +void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port); int mlx4_flow_attach(struct mlx4_dev *dev, struct mlx4_net_trans_rule *rule, u64 *reg_id); int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b086070c3a5..0755b9fd03a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -499,7 +499,7 @@ static inline int page_count(struct page *page) static inline bool __compound_tail_refcounted(struct page *page) { - return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); + return !PageSlab(page) && !PageHeadHuge(page); } /* diff --git a/include/linux/mmc/sh_mobile_sdhi.h b/include/linux/mmc/sh_mobile_sdhi.h index da77e5e2041d..95d6f0314a7d 100644 --- a/include/linux/mmc/sh_mobile_sdhi.h +++ b/include/linux/mmc/sh_mobile_sdhi.h @@ -7,14 +7,4 @@ #define SH_MOBILE_SDHI_IRQ_SDCARD "sdcard" #define SH_MOBILE_SDHI_IRQ_SDIO "sdio" -struct sh_mobile_sdhi_info { - int dma_slave_tx; - int dma_slave_rx; - unsigned long tmio_flags; - unsigned long tmio_caps; - unsigned long tmio_caps2; - u32 tmio_ocr_mask; /* available MMC voltages */ - unsigned int cd_gpio; -}; - #endif /* LINUX_MMC_SH_MOBILE_SDHI_H */ diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index 5f487d776411..29975c73a953 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -77,7 +77,7 @@ /* ensure we never evaluate anything shorter than an unsigned long * to zero, and ensure we'll never miss the end of a comparison (bjd) */ -#define map_calc_words(map) ((map_bankwidth(map) + (sizeof(unsigned long)-1))/ sizeof(unsigned long)) +#define map_calc_words(map) ((map_bankwidth(map) + (sizeof(unsigned long)-1)) / sizeof(unsigned long)) #ifdef CONFIG_MTD_MAP_BANK_WIDTH_8 # ifdef map_bankwidth @@ -181,7 +181,7 @@ static inline int map_bankwidth_supported(int w) } } -#define MAX_MAP_LONGS ( ((MAX_MAP_BANKWIDTH*8) + BITS_PER_LONG - 1) / BITS_PER_LONG ) +#define MAX_MAP_LONGS (((MAX_MAP_BANKWIDTH * 8) + BITS_PER_LONG - 1) / BITS_PER_LONG) typedef union { unsigned long x[MAX_MAP_LONGS]; @@ -264,20 +264,22 @@ void unregister_mtd_chip_driver(struct mtd_chip_driver *); struct mtd_info *do_map_probe(const char *name,
struct map_info *map); void map_destroy(struct mtd_info *mtd); -#define ENABLE_VPP(map) do { if(map->set_vpp) map->set_vpp(map, 1); } while(0) -#define DISABLE_VPP(map) do { if(map->set_vpp) map->set_vpp(map, 0); } while(0) +#define ENABLE_VPP(map) do { if (map->set_vpp) map->set_vpp(map, 1); } while (0) +#define DISABLE_VPP(map) do { if (map->set_vpp) map->set_vpp(map, 0); } while (0) #define INVALIDATE_CACHED_RANGE(map, from, size) \ - do { if(map->inval_cache) map->inval_cache(map, from, size); } while(0) + do { if (map->inval_cache) map->inval_cache(map, from, size); } while (0) static inline int map_word_equal(struct map_info *map, map_word val1, map_word val2) { int i; - for (i=0; i<map_words(map); i++) { + + for (i = 0; i < map_words(map); i++) { if (val1.x[i] != val2.x[i]) return 0; } + return 1; } @@ -286,9 +288,9 @@ static inline map_word map_word_and(struct map_info *map, map_word val1, map_wor map_word r; int i; - for (i=0; i<map_words(map); i++) { + for (i = 0; i < map_words(map); i++) r.x[i] = val1.x[i] & val2.x[i]; - } + return r; } @@ -297,9 +299,9 @@ static inline map_word map_word_clr(struct map_info *map, map_word val1, map_wor map_word r; int i; - for (i=0; i<map_words(map); i++) { + for (i = 0; i < map_words(map); i++) r.x[i] = val1.x[i] & ~val2.x[i]; - } + return r; } @@ -308,22 +310,33 @@ static inline map_word map_word_or(struct map_info *map, map_word val1, map_word map_word r; int i; - for (i=0; i<map_words(map); i++) { + for (i = 0; i < map_words(map); i++) r.x[i] = val1.x[i] | val2.x[i]; - } + return r; } -#define map_word_andequal(m, a, b, z) map_word_equal(m, z, map_word_and(m, a, b)) +static inline int map_word_andequal(struct map_info *map, map_word val1, map_word val2, map_word val3) +{ + int i; + + for (i = 0; i < map_words(map); i++) { + if ((val1.x[i] & val2.x[i]) != val3.x[i]) + return 0; + } + + return 1; +} static inline int map_word_bitsset(struct map_info *map, map_word val1, map_word val2) { int i; - for (i=0; i<map_words(map); i++) { + for (i = 0; i < map_words(map); i++) { if (val1.x[i] & val2.x[i]) return 1; } + return 0; } @@ -355,14 +368,16 @@ static inline map_word map_word_load_partial(struct map_info *map, map_word orig if (map_bankwidth_is_large(map)) { char *dest = (char *)&orig; + memcpy(dest+start, buf, len); } else { - for (i=start; i < start+len; i++) { + for (i = start; i < start+len; i++) { int bitpos; + #ifdef __LITTLE_ENDIAN - bitpos = i*8; + bitpos = i * 8; #else /* __BIG_ENDIAN */ - bitpos = (map_bankwidth(map)-1-i)*8; + bitpos = (map_bankwidth(map) - 1 - i) * 8; #endif orig.x[0] &= ~(0xff << bitpos); orig.x[0] |= (unsigned long)buf[i-start] << bitpos; @@ -384,9 +399,10 @@ static inline map_word map_word_ff(struct map_info *map) if (map_bankwidth(map) < MAP_FF_LIMIT) { int bw = 8 * map_bankwidth(map); + r.x[0] = (1UL << bw) - 1; } else { - for (i=0; i<map_words(map); i++) + for (i = 0; i < map_words(map); i++) r.x[i] = ~0UL; } return r; @@ -407,7 +423,7 @@ static inline map_word inline_map_read(struct map_info *map, unsigned long ofs) r.x[0] = __raw_readq(map->virt + ofs); #endif else if (map_bankwidth_is_large(map)) - memcpy_fromio(r.x, map->virt+ofs, map->bankwidth); + memcpy_fromio(r.x, map->virt + ofs, map->bankwidth); else BUG(); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 4720b86ee73d..e5409524bb0a 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -155,6 +155,8 @@ enum spi_nor_option_flags { * @write: [DRIVER-SPECIFIC] write data to the SPI NOR * @erase: 
[DRIVER-SPECIFIC] erase a sector of the SPI NOR * at the offset @offs + * @flash_lock: [FLASH-SPECIFIC] lock a region of the SPI NOR + * @flash_unlock: [FLASH-SPECIFIC] unlock a region of the SPI NOR * @priv: the private data */ struct spi_nor { @@ -189,6 +191,9 @@ size_t len, size_t *retlen, const u_char *write_buf); int (*erase)(struct spi_nor *nor, loff_t offs); + int (*flash_lock)(struct spi_nor *nor, loff_t ofs, uint64_t len); + int (*flash_unlock)(struct spi_nor *nor, loff_t ofs, uint64_t len); + void *priv; }; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index ed43cb74b11d..32201c269890 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -16,6 +16,13 @@ #include <linux/uidgid.h> #include <uapi/linux/nfs4.h> +enum nfs4_acl_whotype { + NFS4_ACL_WHO_NAMED = 0, + NFS4_ACL_WHO_OWNER, + NFS4_ACL_WHO_GROUP, + NFS4_ACL_WHO_EVERYONE, +}; + struct nfs4_ace { uint32_t type; uint32_t flag; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 410abd172feb..b95f914ce083 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -511,6 +511,7 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned * Try to write back everything synchronously (but check the * return value!) */ +extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4cb3eaa89cf7..93ab6071bbe9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -255,11 +255,13 @@ struct nfs4_layoutget { struct nfs4_getdeviceinfo_args { struct nfs4_sequence_args seq_args; struct pnfs_device *pdev; + __u32 notify_types; }; struct nfs4_getdeviceinfo_res { struct nfs4_sequence_res seq_res; struct pnfs_device *pdev; + __u32 notification; }; struct nfs4_layoutcommit_args { @@ -1271,11 +1273,15 @@ struct nfs42_falloc_args { nfs4_stateid falloc_stateid; u64 falloc_offset; u64 falloc_length; + const u32 *falloc_bitmask; }; struct nfs42_falloc_res { struct nfs4_sequence_res seq_res; unsigned int status; + + struct nfs_fattr *falloc_fattr; + const struct nfs_server *falloc_server; }; struct nfs42_seek_args { diff --git a/include/linux/of.h b/include/linux/of.h index 5f124f685e07..ddeaae6d2083 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -305,6 +305,7 @@ extern int of_property_read_string_helper(struct device_node *np, extern int of_device_is_compatible(const struct device_node *device, const char *); extern bool of_device_is_available(const struct device_node *device); +extern bool of_device_is_big_endian(const struct device_node *device); extern const void *of_get_property(const struct device_node *node, const char *name, int *lenp); @@ -467,6 +468,11 @@ static inline bool of_device_is_available(const struct device_node *device) return false; } +static inline bool of_device_is_big_endian(const struct device_node *device) +{ + return false; +} + static inline struct property *of_find_property(const struct device_node *np, const char *name, int *lenp) diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 0ff360d5b3b3..587ee507965d 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -33,6 +33,8 @@ extern void *of_fdt_get_property(const void *blob, extern int of_fdt_is_compatible(const void *blob, unsigned long node, const char *compat); +extern bool of_fdt_is_big_endian(const void *blob, +
unsigned long node); extern int of_fdt_match(const void *blob, unsigned long node, const char *const *compat); extern void of_fdt_unflatten_tree(unsigned long *blob, diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index bfec136a6d1e..d884929a7747 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -37,8 +37,6 @@ extern int of_irq_parse_one(struct device_node *device, int index, extern unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data); extern int of_irq_to_resource(struct device_node *dev, int index, struct resource *r); -extern int of_irq_to_resource_table(struct device_node *dev, - struct resource *res, int nr_irqs); extern void of_irq_init(const struct of_device_id *matches); @@ -46,6 +44,8 @@ extern void of_irq_init(const struct of_device_id *matches); extern int of_irq_count(struct device_node *dev); extern int of_irq_get(struct device_node *dev, int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); +extern int of_irq_to_resource_table(struct device_node *dev, + struct resource *res, int nr_irqs); #else static inline int of_irq_count(struct device_node *dev) { @@ -59,6 +59,11 @@ static inline int of_irq_get_byname(struct device_node *dev, const char *name) { return 0; } +static inline int of_irq_to_resource_table(struct device_node *dev, + struct resource *res, int nr_irqs) +{ + return 0; +} #endif #if defined(CONFIG_OF) diff --git a/include/linux/platform_data/dma-imx-sdma.h b/include/linux/platform_data/dma-imx-sdma.h index eabac4e2fc99..2d08816720f6 100644 --- a/include/linux/platform_data/dma-imx-sdma.h +++ b/include/linux/platform_data/dma-imx-sdma.h @@ -48,6 +48,9 @@ struct sdma_script_start_addrs { s32 ssish_2_mcu_addr; s32 hdmi_dma_addr; /* End of v2 array */ + s32 zcanfd_2_mcu_addr; + s32 zqspi_2_mcu_addr; + /* End of v3 array */ }; /** diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 73069cb6c54a..a7a06d1dcf9c 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -72,6 +72,7 @@ extern const char raid6_empty_zero_page[PAGE_SIZE]; /* Routine choices */ struct raid6_calls { void (*gen_syndrome)(int, size_t, void **); + void (*xor_syndrome)(int, int, int, size_t, void **); int (*valid)(void); /* Returns 1 if this routine set is usable */ const char *name; /* Name of this routine set */ int prefer; /* Has special performance attribute */ diff --git a/include/linux/shdma-base.h b/include/linux/shdma-base.h index abdf1f229dc3..dd0ba502ccb3 100644 --- a/include/linux/shdma-base.h +++ b/include/linux/shdma-base.h @@ -69,6 +69,7 @@ struct shdma_chan { int id; /* Raw id of this channel */ int irq; /* Channel IRQ */ int slave_id; /* Client ID for slave DMA */ + int real_slave_id; /* argument passed to filter function */ int hw_req; /* DMA request line for slave DMA - same * as MID/RID, used with DT */ enum shdma_pm_state pm_state; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 06793b598f44..66e374d62f64 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -773,6 +773,7 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); +struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index aadc6a04e1ac..807371357160 100644 --- 
a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -142,12 +142,18 @@ typedef __be32 rpc_fraghdr; (RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4)) /* - * RFC1833/RFC3530 rpcbind (v3+) well-known netid's. + * Well-known netids. See: + * + * http://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml */ #define RPCBIND_NETID_UDP "udp" #define RPCBIND_NETID_TCP "tcp" +#define RPCBIND_NETID_RDMA "rdma" +#define RPCBIND_NETID_SCTP "sctp" #define RPCBIND_NETID_UDP6 "udp6" #define RPCBIND_NETID_TCP6 "tcp6" +#define RPCBIND_NETID_RDMA6 "rdma6" +#define RPCBIND_NETID_SCTP6 "sctp6" #define RPCBIND_NETID_LOCAL "local" /* diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 64a0a0a97b23..c984c85981ea 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -41,11 +41,6 @@ #define _LINUX_SUNRPC_XPRTRDMA_H /* - * rpcbind (v3+) RDMA netid. - */ -#define RPCBIND_NETID_RDMA "rdma" - -/* * Constants. Max RPC/NFS header is big enough to account for * additional marshaling buffers passed down by Linux client. * diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 28f0e65b9a11..8f4d4bfa6d46 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -108,8 +108,6 @@ struct virtio_device { void *priv; }; -bool virtio_device_is_legacy_only(struct virtio_device_id id); - static inline struct virtio_device *dev_to_virtio(struct device *_dev) { return container_of(_dev, struct virtio_device, dev); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index ca3ed78e5ec7..1e306f727edc 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -298,13 +298,6 @@ static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val) } \ } while(0) -static inline u8 virtio_cread8(struct virtio_device *vdev, unsigned int offset) -{ - u8 ret; - vdev->config->get(vdev, offset, &ret, sizeof(ret)); - return ret; -} - /* Read @count fields, @bytes each. */ static inline void __virtio_cread_many(struct virtio_device *vdev, unsigned int offset, @@ -326,7 +319,6 @@ static inline void __virtio_cread_many(struct virtio_device *vdev, } while (gen != old); } - static inline void virtio_cread_bytes(struct virtio_device *vdev, unsigned int offset, void *buf, size_t len) @@ -334,6 +326,13 @@ static inline void virtio_cread_bytes(struct virtio_device *vdev, __virtio_cread_many(vdev, offset, buf, len, 1); } +static inline u8 virtio_cread8(struct virtio_device *vdev, unsigned int offset) +{ + u8 ret; + vdev->config->get(vdev, offset, &ret, sizeof(ret)); + return ret; +} + static inline void virtio_cwrite8(struct virtio_device *vdev, unsigned int offset, u8 val) { @@ -374,7 +373,6 @@ static inline u64 virtio_cread64(struct virtio_device *vdev, unsigned int offset) { u64 ret; - vdev->config->get(vdev, offset, &ret, sizeof(ret)); __virtio_cread_many(vdev, offset, &ret, 1, sizeof(ret)); return virtio64_to_cpu(vdev, (__force __virtio64)ret); } diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 67e06fe18c03..8e50888a6d59 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -21,19 +21,20 @@ * actually quite cheap. 
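To see what the rmb variant below is ordering, consider a rough sketch of a driver polling the used ring (field names follow struct vring from the uapi header; index wrapping and the rest of the vring API are elided, and vr/last_used/weak_barriers are assumed locals):

__u16 used_idx = vr->used->idx;		/* device publishes completions here */
virtio_rmb(weak_barriers);		/* read idx before the entry it covers */
struct vring_used_elem *elem = &vr->used->ring[last_used % vr->num];

The switch to dma_rmb()/dma_wmb() under weak_barriers works because the "device" is a hypervisor accessing the same coherent memory, so the lighter DMA-ordering barriers suffice.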
*/ -#ifdef CONFIG_SMP static inline void virtio_mb(bool weak_barriers) { +#ifdef CONFIG_SMP if (weak_barriers) smp_mb(); else +#endif mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) - smp_rmb(); + dma_rmb(); else rmb(); } @@ -41,26 +42,10 @@ static inline void virtio_rmb(bool weak_barriers) static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) - smp_wmb(); + dma_wmb(); else wmb(); } -#else -static inline void virtio_mb(bool weak_barriers) -{ - mb(); -} - -static inline void virtio_rmb(bool weak_barriers) -{ - rmb(); -} - -static inline void virtio_wmb(bool weak_barriers) -{ - wmb(); -} -#endif struct virtio_device; struct virtqueue; diff --git a/include/net/bonding.h b/include/net/bonding.h index fda6feeb6c1f..78ed135e9dea 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -30,13 +30,6 @@ #include <net/bond_alb.h> #include <net/bond_options.h> -#define DRV_VERSION "3.7.1" -#define DRV_RELDATE "April 27, 2011" -#define DRV_NAME "bonding" -#define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" - -#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n" - #define BOND_MAX_ARP_TARGETS 16 #define BOND_DEFAULT_MIIMON 100 diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h index d3583d3ee193..54e7af301888 100644 --- a/include/target/iscsi/iscsi_target_core.h +++ b/include/target/iscsi/iscsi_target_core.h @@ -20,6 +20,8 @@ #define ISCSIT_MIN_TAGS 16 #define ISCSIT_EXTRA_TAGS 8 #define ISCSIT_TCP_BACKLOG 256 +#define ISCSI_RX_THREAD_NAME "iscsi_trx" +#define ISCSI_TX_THREAD_NAME "iscsi_ttx" /* struct iscsi_node_attrib sanity values */ #define NA_DATAOUT_TIMEOUT 3 @@ -60,6 +62,7 @@ #define TA_CACHE_CORE_NPS 0 /* T10 protection information disabled by default */ #define TA_DEFAULT_T10_PI 0 +#define TA_DEFAULT_FABRIC_PROT_TYPE 0 #define ISCSI_IOV_DATA_BUFFER 5 @@ -600,8 +603,11 @@ struct iscsi_conn { struct iscsi_tpg_np *tpg_np; /* Pointer to parent session */ struct iscsi_session *sess; - /* Pointer to thread_set in use for this conn's threads */ - struct iscsi_thread_set *thread_set; + int bitmap_id; + int rx_thread_active; + struct task_struct *rx_thread; + int tx_thread_active; + struct task_struct *tx_thread; /* list_head for session connection list */ struct list_head conn_list; } ____cacheline_aligned; @@ -767,6 +773,7 @@ struct iscsi_tpg_attrib { u32 demo_mode_discovery; u32 default_erl; u8 t10_pi; + u32 fabric_prot_type; struct iscsi_portal_group *tpg; }; @@ -871,10 +878,10 @@ struct iscsit_global { /* Unique identifier used for the authentication daemon */ u32 auth_id; u32 inactive_ts; - /* Thread Set bitmap count */ - int ts_bitmap_count; +#define ISCSIT_BITMAP_BITS 262144 /* Thread Set bitmap pointer */ unsigned long *ts_bitmap; + spinlock_t ts_bitmap_lock; /* Used for iSCSI discovery session authentication */ struct iscsi_node_acl discovery_acl; struct iscsi_portal_group *discovery_tpg; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 672150b6aaf5..480e9f82dfea 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -165,10 +165,8 @@ enum se_cmd_flags_table { SCF_SEND_DELAYED_TAS = 0x00004000, SCF_ALUA_NON_OPTIMIZED = 0x00008000, SCF_PASSTHROUGH_SG_TO_MEM_NOALLOC = 0x00020000, - SCF_ACK_KREF = 0x00040000, SCF_COMPARE_AND_WRITE = 0x00080000, SCF_COMPARE_AND_WRITE_POST = 0x00100000, - SCF_CMD_XCOPY_PASSTHROUGH = 0x00200000, }; /* struct se_dev_entry->lun_flags and struct 
se_lun->lun_access */ @@ -520,11 +518,11 @@ struct se_cmd { struct list_head se_cmd_list; struct completion cmd_wait_comp; struct kref cmd_kref; - struct target_core_fabric_ops *se_tfo; + const struct target_core_fabric_ops *se_tfo; sense_reason_t (*execute_cmd)(struct se_cmd *); sense_reason_t (*execute_rw)(struct se_cmd *, struct scatterlist *, u32, enum dma_data_direction); - sense_reason_t (*transport_complete_callback)(struct se_cmd *); + sense_reason_t (*transport_complete_callback)(struct se_cmd *, bool); unsigned char *t_task_cdb; unsigned char __t_task_cdb[TCM_MAX_COMMAND_SIZE]; @@ -591,6 +589,7 @@ struct se_node_acl { bool acl_stop:1; u32 queue_depth; u32 acl_index; + enum target_prot_type saved_prot_type; #define MAX_ACL_TAG_SIZE 64 char acl_tag[MAX_ACL_TAG_SIZE]; /* Used for PR SPEC_I_PT=1 and REGISTER_AND_MOVE */ @@ -616,6 +615,7 @@ struct se_session { unsigned sess_tearing_down:1; u64 sess_bin_isid; enum target_prot_op sup_prot_ops; + enum target_prot_type sess_prot_type; struct se_node_acl *se_node_acl; struct se_portal_group *se_tpg; void *fabric_sess_ptr; @@ -890,7 +890,7 @@ struct se_portal_group { /* List of TCM sessions associated with this TPG */ struct list_head tpg_sess_list; /* Pointer to $FABRIC_MOD dependent code */ - struct target_core_fabric_ops *se_tpg_tfo; + const struct target_core_fabric_ops *se_tpg_tfo; struct se_wwn *se_tpg_wwn; struct config_group tpg_group; struct config_group *tpg_default_groups[7]; diff --git a/include/target/target_core_configfs.h b/include/target/target_core_configfs.h index e0801386e4dc..25bb04c4209e 100644 --- a/include/target/target_core_configfs.h +++ b/include/target/target_core_configfs.h @@ -5,12 +5,6 @@ #define TARGET_CORE_NAME_MAX_LEN 64 #define TARGET_FABRIC_NAME_SIZE 32 -extern struct target_fabric_configfs *target_fabric_configfs_init( - struct module *, const char *); -extern void target_fabric_configfs_free(struct target_fabric_configfs *); -extern int target_fabric_configfs_register(struct target_fabric_configfs *); -extern void target_fabric_configfs_deregister(struct target_fabric_configfs *); - struct target_fabric_configfs_template { struct config_item_type tfc_discovery_cit; struct config_item_type tfc_wwn_cit; diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index 22a4e98eec80..17c7f5ac7ea0 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -2,6 +2,8 @@ #define TARGET_CORE_FABRIC_H struct target_core_fabric_ops { + struct module *module; + const char *name; struct configfs_subsystem *tf_subsys; char *(*get_fabric_name)(void); u8 (*get_fabric_proto_ident)(struct se_portal_group *); @@ -27,6 +29,14 @@ struct target_core_fabric_ops { * inquiry response */ int (*tpg_check_demo_mode_login_only)(struct se_portal_group *); + /* + * Optionally used as a configfs tunable to determine when + * target-core should signal the PROTECT=1 feature bit for + * backends that don't support T10-PI, so that either fabric + * HW offload or target-core emulation performs the associated + * WRITE_STRIP and READ_INSERT operations.
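The callback declared just below this comment is typically a one-liner that reports the fabric's configfs tunable; a sketch, where struct example_tpg and its fabric_prot_type field are hypothetical stand-ins for a fabric module's private TPG wrapper:

static int example_tpg_check_prot_fabric_only(struct se_portal_group *se_tpg)
{
	struct example_tpg *tpg =
		container_of(se_tpg, struct example_tpg, se_tpg);

	return tpg->fabric_prot_type;
}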
+ */ + int (*tpg_check_prot_fabric_only)(struct se_portal_group *); struct se_node_acl *(*tpg_alloc_fabric_acl)( struct se_portal_group *); void (*tpg_release_fabric_acl)(struct se_portal_group *, @@ -82,8 +92,23 @@ struct target_core_fabric_ops { struct se_node_acl *(*fabric_make_nodeacl)(struct se_portal_group *, struct config_group *, const char *); void (*fabric_drop_nodeacl)(struct se_node_acl *); + + struct configfs_attribute **tfc_discovery_attrs; + struct configfs_attribute **tfc_wwn_attrs; + struct configfs_attribute **tfc_tpg_base_attrs; + struct configfs_attribute **tfc_tpg_np_base_attrs; + struct configfs_attribute **tfc_tpg_attrib_attrs; + struct configfs_attribute **tfc_tpg_auth_attrs; + struct configfs_attribute **tfc_tpg_param_attrs; + struct configfs_attribute **tfc_tpg_nacl_base_attrs; + struct configfs_attribute **tfc_tpg_nacl_attrib_attrs; + struct configfs_attribute **tfc_tpg_nacl_auth_attrs; + struct configfs_attribute **tfc_tpg_nacl_param_attrs; }; +int target_register_template(const struct target_core_fabric_ops *fo); +void target_unregister_template(const struct target_core_fabric_ops *fo); + struct se_session *transport_init_session(enum target_prot_op); int transport_alloc_session_tags(struct se_session *, unsigned int, unsigned int); @@ -95,13 +120,15 @@ void transport_register_session(struct se_portal_group *, struct se_node_acl *, struct se_session *, void *); void target_get_session(struct se_session *); void target_put_session(struct se_session *); +ssize_t target_show_dynamic_sessions(struct se_portal_group *, char *); void transport_free_session(struct se_session *); void target_put_nacl(struct se_node_acl *); void transport_deregister_session_configfs(struct se_session *); void transport_deregister_session(struct se_session *); -void transport_init_se_cmd(struct se_cmd *, struct target_core_fabric_ops *, +void transport_init_se_cmd(struct se_cmd *, + const struct target_core_fabric_ops *, struct se_session *, u32, int, int, unsigned char *); sense_reason_t transport_lookup_cmd_lun(struct se_cmd *, u32); sense_reason_t target_setup_cmd_from_cdb(struct se_cmd *, unsigned char *); @@ -153,8 +180,8 @@ int core_tpg_set_initiator_node_queue_depth(struct se_portal_group *, unsigned char *, u32, int); int core_tpg_set_initiator_node_tag(struct se_portal_group *, struct se_node_acl *, const char *); -int core_tpg_register(struct target_core_fabric_ops *, struct se_wwn *, - struct se_portal_group *, void *, int); +int core_tpg_register(const struct target_core_fabric_ops *, + struct se_wwn *, struct se_portal_group *, void *, int); int core_tpg_deregister(struct se_portal_group *); /* SAS helpers */ diff --git a/include/target/target_core_fabric_configfs.h b/include/target/target_core_fabric_configfs.h index b32a14905cfa..7a0649c09e79 100644 --- a/include/target/target_core_fabric_configfs.h +++ b/include/target/target_core_fabric_configfs.h @@ -90,6 +90,11 @@ static struct target_fabric_tpg_attribute _fabric##_tpg_##_name = \ _fabric##_tpg_store_##_name); +#define TF_TPG_BASE_ATTR_RO(_fabric, _name) \ +static struct target_fabric_tpg_attribute _fabric##_tpg_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + _fabric##_tpg_show_##_name); + CONFIGFS_EATTR_STRUCT(target_fabric_wwn, target_fabric_configfs); #define TF_WWN_ATTR(_fabric, _name, _mode) \ static struct target_fabric_wwn_attribute _fabric##_wwn_##_name = \ diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 572e6503394a..7f79cf459591 100644 --- a/include/trace/events/btrfs.h +++ 
b/include/trace/events/btrfs.h @@ -407,10 +407,10 @@ TRACE_EVENT(btrfs_sync_file, TP_fast_assign( struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); __entry->ino = inode->i_ino; - __entry->parent = dentry->d_parent->d_inode->i_ino; + __entry->parent = d_inode(dentry->d_parent)->i_ino; __entry->datasync = datasync; __entry->root_objectid = BTRFS_I(inode)->root->root_key.objectid; diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h index 7f20707849bb..fc733d28117a 100644 --- a/include/trace/events/ext3.h +++ b/include/trace/events/ext3.h @@ -439,10 +439,10 @@ TRACE_EVENT(ext3_sync_file_enter, TP_fast_assign( struct dentry *dentry = file->f_path.dentry; - __entry->dev = dentry->d_inode->i_sb->s_dev; - __entry->ino = dentry->d_inode->i_ino; + __entry->dev = d_inode(dentry)->i_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; __entry->datasync = datasync; - __entry->parent = dentry->d_parent->d_inode->i_ino; + __entry->parent = d_inode(dentry->d_parent)->i_ino; ), TP_printk("dev %d,%d ino %lu parent %ld datasync %d ", @@ -710,9 +710,9 @@ TRACE_EVENT(ext3_unlink_enter, TP_fast_assign( __entry->parent = parent->i_ino; - __entry->ino = dentry->d_inode->i_ino; - __entry->size = dentry->d_inode->i_size; - __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; + __entry->size = d_inode(dentry)->i_size; + __entry->dev = d_inode(dentry)->i_sb->s_dev; ), TP_printk("dev %d,%d ino %lu size %lld parent %ld", @@ -734,8 +734,8 @@ TRACE_EVENT(ext3_unlink_exit, ), TP_fast_assign( - __entry->ino = dentry->d_inode->i_ino; - __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; + __entry->dev = d_inode(dentry)->i_sb->s_dev; __entry->ret = ret; ), diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 47fca36ee426..08ec3dd27630 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -872,10 +872,10 @@ TRACE_EVENT(ext4_sync_file_enter, TP_fast_assign( struct dentry *dentry = file->f_path.dentry; - __entry->dev = dentry->d_inode->i_sb->s_dev; - __entry->ino = dentry->d_inode->i_ino; + __entry->dev = d_inode(dentry)->i_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; __entry->datasync = datasync; - __entry->parent = dentry->d_parent->d_inode->i_ino; + __entry->parent = d_inode(dentry->d_parent)->i_ino; ), TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", @@ -1453,10 +1453,10 @@ TRACE_EVENT(ext4_unlink_enter, ), TP_fast_assign( - __entry->dev = dentry->d_inode->i_sb->s_dev; - __entry->ino = dentry->d_inode->i_ino; + __entry->dev = d_inode(dentry)->i_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; __entry->parent = parent->i_ino; - __entry->size = dentry->d_inode->i_size; + __entry->size = d_inode(dentry)->i_size; ), TP_printk("dev %d,%d ino %lu size %lld parent %lu", @@ -1477,8 +1477,8 @@ TRACE_EVENT(ext4_unlink_exit, ), TP_fast_assign( - __entry->dev = dentry->d_inode->i_sb->s_dev; - __entry->ino = dentry->d_inode->i_ino; + __entry->dev = d_inode(dentry)->i_sb->s_dev; + __entry->ino = d_inode(dentry)->i_ino; __entry->ret = ret; ), diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 640954b9ecf9..1a0006a76b00 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -431,6 +431,7 @@ header-y += virtio_blk.h header-y += virtio_config.h header-y += virtio_console.h header-y += virtio_ids.h +header-y += virtio_input.h header-y += virtio_net.h header-y += virtio_pci.h 
header-y += virtio_ring.h diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h index d1197ae3723c..3e445a760f14 100644 --- a/include/uapi/linux/falloc.h +++ b/include/uapi/linux/falloc.h @@ -41,4 +41,21 @@ */ #define FALLOC_FL_ZERO_RANGE 0x10 +/* + * FALLOC_FL_INSERT_RANGE is used to insert space within the file without + * overwriting any existing data. The contents of the file beyond offset are + * shifted right by len bytes to create a hole. As such, this + * operation will increase the size of the file by len bytes. + * + * Different filesystems may implement different limitations on the granularity + * of the operation. Most will limit operations to filesystem block size + * boundaries, but this boundary may be larger or smaller depending on + * the filesystem and/or the configuration of the filesystem or file. + * + * Attempting to insert space using this flag at or beyond the end of + * the file is considered an illegal operation - just use ftruncate(2) or + * fallocate(2) with mode 0 for that type of operation. + */ +#define FALLOC_FL_INSERT_RANGE 0x20 + #endif /* _UAPI_FALLOC_H_ */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f574d7be7631..4b60056776d1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -813,6 +813,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MIPS_MSA 112 #define KVM_CAP_S390_INJECT_IRQ 113 #define KVM_CAP_S390_IRQ_STATE 114 +#define KVM_CAP_PPC_HWRNG 115 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 35f5f4c6c260..adc0aff83fbb 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -162,13 +162,6 @@ */ #define NFS4_MAX_BACK_CHANNEL_OPS 2 -enum nfs4_acl_whotype { - NFS4_ACL_WHO_NAMED = 0, - NFS4_ACL_WHO_OWNER, - NFS4_ACL_WHO_GROUP, - NFS4_ACL_WHO_EVERYONE, -}; - #endif /* _UAPI_LINUX_NFS4_H */ /* diff --git a/include/uapi/linux/nfs_idmap.h b/include/uapi/linux/nfs_idmap.h index 8d4b1c7b24d4..038e36c96669 100644 --- a/include/uapi/linux/nfs_idmap.h +++ b/include/uapi/linux/nfs_idmap.h @@ -1,5 +1,5 @@ /* - * include/linux/nfs_idmap.h + * include/uapi/linux/nfs_idmap.h * * UID and GID to name mapping for clients. * diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h index 0bf130a1c58d..28ec6c9c421a 100644 --- a/include/uapi/linux/nfsd/debug.h +++ b/include/uapi/linux/nfsd/debug.h @@ -12,14 +12,6 @@ #include <linux/sunrpc/debug.h> /* - * Enable debugging for nfsd. - * Requires RPC_DEBUG. - */ -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define NFSD_DEBUG 1 -#endif - -/* * knfsd debug flags */ #define NFSDDBG_SOCK 0x0001 diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index d3bd6ffec041..0df7bd5d2fb1 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -21,6 +21,9 @@ /* * Export flags. + * + * Please update the expflags[] array in fs/nfsd/export.c when adding + * a new flag.
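Returning to the FALLOC_FL_INSERT_RANGE flag introduced above: from userspace it is reached through plain fallocate(2). A minimal sketch (error handling elided; offset and length must respect the granularity limits described in the comment):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* Shift everything from 'off' onward right by 'len' bytes, growing the file. */
static int insert_range(int fd, off_t off, off_t len)
{
	return fallocate(fd, FALLOC_FL_INSERT_RANGE, off, len);
}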
*/ #define NFSEXP_READONLY 0x0001 #define NFSEXP_INSECURE_PORT 0x0002 diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 49f4210d4394..2ae6131e69a5 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -78,6 +78,12 @@ #define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ +#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster + * For clustered environments only. + */ +#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed + * For clustered environments only. + */ #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. * read requests will only be sent here in @@ -101,6 +107,7 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_CLEAN 0 #define MD_SB_ERRORS 1 +#define MD_SB_CLUSTERED 5 /* MD is clustered */ #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ /* diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 74e7c60c4716..1cb8aa6850b5 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -62,6 +62,7 @@ #define STOP_ARRAY _IO (MD_MAJOR, 0x32) #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) /* 63 partitions with the alternate major number (mdp) */ #define MdpMinorShift 6 diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index b483d1909d3e..b67f99d3c520 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -6,7 +6,7 @@ #include <linux/types.h> #include <linux/uio.h> -#define TCMU_VERSION "1.0" +#define TCMU_VERSION "2.0" /* * Ring Design @@ -39,9 +39,13 @@ * should process the next packet the same way, and so on. */ -#define TCMU_MAILBOX_VERSION 1 +#define TCMU_MAILBOX_VERSION 2 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */ +/* See https://gcc.gnu.org/onlinedocs/cpp/Stringification.html */ +#define xstr(s) str(s) +#define str(s) #s + struct tcmu_mailbox { __u16 version; __u16 flags; @@ -64,31 +68,36 @@ enum tcmu_opcode { * Only a few opcodes, and length is 8-byte aligned, so use low bits for opcode.
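Concretely: because entry lengths are multiples of 8, bits 0-2 of len_op are free to carry the opcode, which is what the accessors below implement. A short usage sketch against the new __u32-based signatures (TCMU_OP_CMD is one of the enum tcmu_opcode values):

__u32 len_op = 0;

tcmu_hdr_set_op(&len_op, TCMU_OP_CMD);	/* opcode in the low 3 bits */
tcmu_hdr_set_len(&len_op, 64);		/* 8-byte-aligned length above them */
/* now tcmu_hdr_get_op(len_op) == TCMU_OP_CMD, tcmu_hdr_get_len(len_op) == 64 */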
*/ struct tcmu_cmd_entry_hdr { - __u32 len_op; + __u32 len_op; + __u16 cmd_id; + __u8 kflags; +#define TCMU_UFLAG_UNKNOWN_OP 0x1 + __u8 uflags; + } __packed; #define TCMU_OP_MASK 0x7 -static inline enum tcmu_opcode tcmu_hdr_get_op(struct tcmu_cmd_entry_hdr *hdr) +static inline enum tcmu_opcode tcmu_hdr_get_op(__u32 len_op) { - return hdr->len_op & TCMU_OP_MASK; + return len_op & TCMU_OP_MASK; } -static inline void tcmu_hdr_set_op(struct tcmu_cmd_entry_hdr *hdr, enum tcmu_opcode op) +static inline void tcmu_hdr_set_op(__u32 *len_op, enum tcmu_opcode op) { - hdr->len_op &= ~TCMU_OP_MASK; - hdr->len_op |= (op & TCMU_OP_MASK); + *len_op &= ~TCMU_OP_MASK; + *len_op |= (op & TCMU_OP_MASK); } -static inline __u32 tcmu_hdr_get_len(struct tcmu_cmd_entry_hdr *hdr) +static inline __u32 tcmu_hdr_get_len(__u32 len_op) { - return hdr->len_op & ~TCMU_OP_MASK; + return len_op & ~TCMU_OP_MASK; } -static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) +static inline void tcmu_hdr_set_len(__u32 *len_op, __u32 len) { - hdr->len_op &= TCMU_OP_MASK; - hdr->len_op |= len; + *len_op &= TCMU_OP_MASK; + *len_op |= len; } /* Currently the same as SCSI_SENSE_BUFFERSIZE */ @@ -97,13 +106,14 @@ static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) struct tcmu_cmd_entry { struct tcmu_cmd_entry_hdr hdr; - uint16_t cmd_id; - uint16_t __pad1; - union { struct { + uint32_t iov_cnt; + uint32_t iov_bidi_cnt; + uint32_t iov_dif_cnt; uint64_t cdb_off; - uint64_t iov_cnt; + uint64_t __pad1; + uint64_t __pad2; struct iovec iov[0]; } req; struct { diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 4b0488f20b2e..984169a819ee 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -25,6 +25,7 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +#include <linux/types.h> #include <linux/virtio_ids.h> #include <linux/virtio_config.h> @@ -38,9 +39,9 @@ struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ - __le32 num_pages; + __u32 num_pages; /* Number of pages we've actually got in balloon. */ - __le32 actual; + __u32 actual; }; #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ @@ -51,9 +52,32 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ #define VIRTIO_BALLOON_S_NR 6 +/* + * Memory statistics structure. + * Driver fills an array of these structures and passes to device. + * + * NOTE: fields are laid out in a way that would make the compiler add padding + * between and after fields, so we have to use compiler-specific attributes to + * pack it, to disable this padding. This also often causes the compiler to + * generate suboptimal code. + * + * We maintain this statistics structure format for backwards compatibility, + * but don't follow this example in new code. + * + * If implementing a similar structure, do something like the below instead: + * struct virtio_balloon_stat { + * __virtio16 tag; + * __u8 reserved[6]; + * __virtio64 val; + * }; + * + * In other words, add explicit reserved fields to align field and + * structure boundaries at field size, avoiding compiler padding + * without the packed attribute.
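To make the padding note above concrete: packed, the structure is 10 bytes with val at offset 2, while the recommended shape reaches natural alignment through an explicit reserved field. A small standalone illustration using plain stdint types (struct names are ours, not the header's):

#include <stdint.h>
#include <stddef.h>

struct stat_packed {			/* mirrors virtio_balloon_stat */
	uint16_t tag;
	uint64_t val;
} __attribute__((packed));

struct stat_padded {			/* the recommended replacement shape */
	uint16_t tag;
	uint8_t reserved[6];
	uint64_t val;
};

_Static_assert(sizeof(struct stat_packed) == 10, "packed: no padding at all");
_Static_assert(offsetof(struct stat_padded, val) == 8, "val naturally aligned");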
+ */ struct virtio_balloon_stat { - __u16 tag; - __u64 val; + __virtio16 tag; + __virtio64 val; } __attribute__((packed)); #endif /* _LINUX_VIRTIO_BALLOON_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 284fc3a05f7b..5f60aa4be50a 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -39,5 +39,6 @@ #define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_INPUT 18 /* virtio input */ #endif /* _LINUX_VIRTIO_IDS_H */ diff --git a/include/uapi/linux/virtio_input.h b/include/uapi/linux/virtio_input.h new file mode 100644 index 000000000000..a7fe5c8fb135 --- /dev/null +++ b/include/uapi/linux/virtio_input.h @@ -0,0 +1,76 @@ +#ifndef _LINUX_VIRTIO_INPUT_H +#define _LINUX_VIRTIO_INPUT_H +/* This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
*/ + +#include <linux/types.h> + +enum virtio_input_config_select { + VIRTIO_INPUT_CFG_UNSET = 0x00, + VIRTIO_INPUT_CFG_ID_NAME = 0x01, + VIRTIO_INPUT_CFG_ID_SERIAL = 0x02, + VIRTIO_INPUT_CFG_ID_DEVIDS = 0x03, + VIRTIO_INPUT_CFG_PROP_BITS = 0x10, + VIRTIO_INPUT_CFG_EV_BITS = 0x11, + VIRTIO_INPUT_CFG_ABS_INFO = 0x12, +}; + +struct virtio_input_absinfo { + __u32 min; + __u32 max; + __u32 fuzz; + __u32 flat; + __u32 res; +}; + +struct virtio_input_devids { + __u16 bustype; + __u16 vendor; + __u16 product; + __u16 version; +}; + +struct virtio_input_config { + __u8 select; + __u8 subsel; + __u8 size; + __u8 reserved[5]; + union { + char string[128]; + __u8 bitmap[128]; + struct virtio_input_absinfo abs; + struct virtio_input_devids ids; + } u; +}; + +struct virtio_input_event { + __le16 type; + __le16 code; + __le32 value; +}; + +#endif /* _LINUX_VIRTIO_INPUT_H */ diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 46145a5277fe..a45be6bdcf5b 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -864,7 +864,7 @@ struct snd_ctl_elem_id { snd_ctl_elem_iface_t iface; /* interface identifier */ unsigned int device; /* device/client number */ unsigned int subdevice; /* subdevice (substream) number */ - unsigned char name[44]; /* ASCII name of item */ + unsigned char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; /* ASCII name of item */ unsigned int index; /* index of item */ }; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7635a1cf99f3..3aaea7ffd077 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -466,7 +466,7 @@ out_unlock: static int mqueue_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME; dir->i_size -= DIRENT_SIZE; @@ -770,7 +770,7 @@ static struct file *do_open(struct path *path, int oflag) if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return ERR_PTR(-EINVAL); acc = oflag2acc[oflag & O_ACCMODE]; - if (inode_permission(path->dentry->d_inode, acc)) + if (inode_permission(d_inode(path->dentry), acc)) return ERR_PTR(-EACCES); return dentry_open(path, oflag, current_cred()); } @@ -802,7 +802,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, ro = mnt_want_write(mnt); /* we'll drop it in any case */ error = 0; - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); path.dentry = lookup_one_len(name->name, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); @@ -811,7 +811,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, path.mnt = mntget(mnt); if (oflag & O_CREAT) { - if (path.dentry->d_inode) { /* entry already exists */ + if (d_really_is_positive(path.dentry)) { /* entry already exists */ audit_inode(name, path.dentry, 0); if (oflag & O_EXCL) { error = -EEXIST; @@ -824,12 +824,12 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, goto out; } audit_inode_parent_hidden(name, root); - filp = do_create(ipc_ns, root->d_inode, + filp = do_create(ipc_ns, d_inode(root), &path, oflag, mode, u_attr ? 
&attr : NULL); } } else { - if (!path.dentry->d_inode) { + if (d_really_is_negative(path.dentry)) { error = -ENOENT; goto out; } @@ -848,7 +848,7 @@ out_putfd: put_unused_fd(fd); fd = error; } - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); if (!ro) mnt_drop_write(mnt); out_putname: @@ -873,7 +873,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = mnt_want_write(mnt); if (err) goto out_name; - mutex_lock_nested(&mnt->mnt_root->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name->name, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { @@ -881,17 +881,17 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) goto out_unlock; } - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) { err = -ENOENT; } else { ihold(inode); - err = vfs_unlink(dentry->d_parent->d_inode, dentry, NULL); + err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL); } dput(dentry); out_unlock: - mutex_unlock(&mnt->mnt_root->d_inode->i_mutex); + mutex_unlock(&d_inode(mnt->mnt_root)->i_mutex); if (inode) iput(inode); mnt_drop_write(mnt); diff --git a/ipc/shm.c b/ipc/shm.c index d280a74af2ef..6d767071c367 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1132,7 +1132,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, path = shp->shm_file->f_path; path_get(&path); shp->shm_nattch++; - size = i_size_read(path.dentry->d_inode); + size = i_size_read(d_inode(path.dentry)); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); diff --git a/kernel/audit.c b/kernel/audit.c index 72ab759a0b43..1c13e4267de6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -43,6 +43,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/file.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> @@ -107,6 +108,7 @@ static u32 audit_rate_limit; * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) +static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_overflow = 0; @@ -338,13 +340,13 @@ static int audit_set_backlog_limit(u32 limit) static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", - &audit_backlog_wait_time, timeout); + &audit_backlog_wait_time_master, timeout); } static int audit_set_enabled(u32 state) { int rc; - if (state < AUDIT_OFF || state > AUDIT_LOCKED) + if (state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state); @@ -663,7 +665,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_MAKE_EQUIV: /* Only support auditd and auditctl in initial pid namespace * for now. 
*/ - if ((task_active_pid_ns(current) != &init_pid_ns)) + if (task_active_pid_ns(current) != &init_pid_ns) return -EPERM; if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) @@ -834,7 +836,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_skb_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time; + s.backlog_wait_time = audit_backlog_wait_time_master; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -877,8 +879,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) { if (sizeof(s) > (size_t)nlh->nlmsg_len) return -EINVAL; - if (s.backlog_wait_time < 0 || - s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) + if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME) return -EINVAL; err = audit_set_backlog_wait_time(s.backlog_wait_time); if (err < 0) @@ -1385,7 +1386,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; + if (!reserve) + audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { @@ -1759,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } else audit_log_format(ab, " name=(null)"); - if (n->ino != (unsigned long)-1) { + if (n->ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu" " dev=%02x:%02x mode=%#ho" " ouid=%u ogid=%u rdev=%02x:%02x", @@ -1771,7 +1773,6 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - } if (n->osid != 0) { char *ctx = NULL; u32 len; @@ -1838,11 +1839,29 @@ error_path: } EXPORT_SYMBOL(audit_log_task_context); +void audit_log_d_path_exe(struct audit_buffer *ab, + struct mm_struct *mm) +{ + struct file *exe_file; + + if (!mm) + goto out_null; + + exe_file = get_mm_exe_file(mm); + if (!exe_file) + goto out_null; + + audit_log_d_path(ab, " exe=", &exe_file->f_path); + fput(exe_file); + return; +out_null: + audit_log_format(ab, " exe=(null)"); +} + void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; char comm[sizeof(tsk->comm)]; - struct mm_struct *mm = tsk->mm; char *tty; if (!ab) @@ -1878,13 +1897,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - if (mm) { - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } else - audit_log_format(ab, " exe=(null)"); + audit_log_d_path_exe(ab, tsk->mm); audit_log_task_context(ab); } EXPORT_SYMBOL(audit_log_task_info); @@ -1915,7 +1928,7 @@ void audit_log_link_denied(const char *operation, struct path *link) /* Generate AUDIT_PATH record with object. 
*/ name->type = AUDIT_TYPE_NORMAL; - audit_copy_inode(name, link->dentry, link->dentry->d_inode); + audit_copy_inode(name, link->dentry, d_backing_inode(link->dentry)); audit_log_name(current->audit_context, name, link, 0, NULL); out: kfree(name); diff --git a/kernel/audit.h b/kernel/audit.h index 1caa0d345d90..d641f9bb3ed0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -257,6 +257,9 @@ extern struct list_head audit_filter_list[]; extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); +extern void audit_log_d_path_exe(struct audit_buffer *ab, + struct mm_struct *mm); + /* audit watch functions */ #ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2e0c97427b33..b0f9877273fc 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -37,6 +37,7 @@ struct audit_chunk { static LIST_HEAD(tree_list); static LIST_HEAD(prune_list); +static struct task_struct *prune_thread; /* * One struct chunk is attached to each inode of interest. @@ -576,7 +577,7 @@ int audit_remove_tree_rule(struct audit_krule *rule) static int compare_root(struct vfsmount *mnt, void *arg) { - return mnt->mnt_root->d_inode == arg; + return d_backing_inode(mnt->mnt_root) == arg; } void audit_trim_trees(void) @@ -648,7 +649,58 @@ void audit_put_tree(struct audit_tree *tree) static int tag_mount(struct vfsmount *mnt, void *arg) { - return tag_chunk(mnt->mnt_root->d_inode, arg); + return tag_chunk(d_backing_inode(mnt->mnt_root), arg); +} + +/* + * That gets run when evict_chunk() ends up needing to kill audit_tree. + * Runs from a separate thread. + */ +static int prune_tree_thread(void *unused) +{ + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&prune_list)) + schedule(); + __set_current_state(TASK_RUNNING); + + mutex_lock(&audit_cmd_mutex); + mutex_lock(&audit_filter_mutex); + + while (!list_empty(&prune_list)) { + struct audit_tree *victim; + + victim = list_entry(prune_list.next, + struct audit_tree, list); + list_del_init(&victim->list); + + mutex_unlock(&audit_filter_mutex); + + prune_one(victim); + + mutex_lock(&audit_filter_mutex); + } + + mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); + } + return 0; +} + +static int audit_launch_prune(void) +{ + if (prune_thread) + return 0; + prune_thread = kthread_create(prune_tree_thread, NULL, + "audit_prune_tree"); + if (IS_ERR(prune_thread)) { + pr_err("cannot start thread audit_prune_tree"); + prune_thread = NULL; + return -ENOMEM; + } else { + wake_up_process(prune_thread); + return 0; + } } /* called with audit_filter_mutex */ @@ -674,6 +726,12 @@ int audit_add_tree_rule(struct audit_krule *rule) /* do not set rule->tree yet */ mutex_unlock(&audit_filter_mutex); + if (unlikely(!prune_thread)) { + err = audit_launch_prune(); + if (err) + goto Err; + } + err = kern_path(tree->pathname, 0, &path); if (err) goto Err; @@ -811,36 +869,10 @@ int audit_tag_tree(char *old, char *new) return failed; } -/* - * That gets run when evict_chunk() ends up needing to kill audit_tree. - * Runs from a separate thread. 
@@ -811,36 +869,10 @@ int audit_tag_tree(char *old, char *new)
 	return failed;
 }
 
-/*
- * That gets run when evict_chunk() ends up needing to kill audit_tree.
- * Runs from a separate thread.
- */
-static int prune_tree_thread(void *unused)
-{
-	mutex_lock(&audit_cmd_mutex);
-	mutex_lock(&audit_filter_mutex);
-
-	while (!list_empty(&prune_list)) {
-		struct audit_tree *victim;
-
-		victim = list_entry(prune_list.next, struct audit_tree, list);
-		list_del_init(&victim->list);
-
-		mutex_unlock(&audit_filter_mutex);
-
-		prune_one(victim);
-
-		mutex_lock(&audit_filter_mutex);
-	}
-
-	mutex_unlock(&audit_filter_mutex);
-	mutex_unlock(&audit_cmd_mutex);
-	return 0;
-}
-
 static void audit_schedule_prune(void)
 {
-	kthread_run(prune_tree_thread, NULL, "audit_prune_tree");
+	wake_up_process(prune_thread);
 }
 
 /*
@@ -907,9 +939,9 @@ static void evict_chunk(struct audit_chunk *chunk)
 		for (n = 0; n < chunk->count; n++)
 			list_del_init(&chunk->owners[n].list);
 	spin_unlock(&hash_lock);
+	mutex_unlock(&audit_filter_mutex);
 	if (need_prune)
 		audit_schedule_prune();
-	mutex_unlock(&audit_filter_mutex);
 }
 
 static int audit_tree_handle_event(struct fsnotify_group *group,
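The audit_tree.c change above replaces a kthread_run() per prune request with one long-lived worker woken by wake_up_process(). The set_current_state()/condition-check/schedule() sequence is the standard idiom that closes the missed-wakeup race. A generic sketch of that idiom (my_worker and my_list are illustrative names, not from the patch):

    static int my_worker(void *unused)
    {
    	for (;;) {
    		/* Mark ourselves sleeping *before* testing the condition:
    		 * a waker that queues work and then calls
    		 * wake_up_process() after this point simply turns the
    		 * following schedule() into a no-op sleep. */
    		set_current_state(TASK_INTERRUPTIBLE);
    		if (list_empty(&my_list))
    			schedule();
    		__set_current_state(TASK_RUNNING);
    
    		/* ... drain my_list under the appropriate locks ... */
    	}
    	return 0;
    }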
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index ad9c1682f616..6e30024d9aac 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -146,7 +146,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
 /* Initialize a parent watch entry. */
 static struct audit_parent *audit_init_parent(struct path *path)
 {
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = d_backing_inode(path->dentry);
 	struct audit_parent *parent;
 	int ret;
 
@@ -361,11 +361,11 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 	struct dentry *d = kern_path_locked(watch->path, parent);
 	if (IS_ERR(d))
 		return PTR_ERR(d);
-	mutex_unlock(&parent->dentry->d_inode->i_mutex);
-	if (d->d_inode) {
+	mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex);
+	if (d_is_positive(d)) {
 		/* update watch filter fields */
-		watch->dev = d->d_inode->i_sb->s_dev;
-		watch->ino = d->d_inode->i_ino;
+		watch->dev = d_backing_inode(d)->i_sb->s_dev;
+		watch->ino = d_backing_inode(d)->i_ino;
 	}
 	dput(d);
 	return 0;
@@ -426,7 +426,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 		return ret;
 
 	/* either find an old parent or attach a new one */
-	parent = audit_find_parent(parent_path.dentry->d_inode);
+	parent = audit_find_parent(d_backing_inode(parent_path.dentry));
 	if (!parent) {
 		parent = audit_init_parent(&parent_path);
 		if (IS_ERR(parent)) {
@@ -482,7 +482,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
 
 	switch (data_type) {
 	case (FSNOTIFY_EVENT_PATH):
-		inode = ((struct path *)data)->dentry->d_inode;
+		inode = d_backing_inode(((struct path *)data)->dentry);
 		break;
 	case (FSNOTIFY_EVENT_INODE):
 		inode = (struct inode *)data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index dc4ae70a7413..9fb9d1cb83ce 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1629,7 +1629,7 @@ retry:
 	rcu_read_lock();
 	seq = read_seqbegin(&rename_lock);
 	for(;;) {
-		struct inode *inode = d->d_inode;
+		struct inode *inode = d_backing_inode(d);
 		if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
 			struct audit_chunk *chunk;
 			chunk = audit_tree_lookup(inode);
@@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
 		   unsigned int flags)
 {
 	struct audit_context *context = current->audit_context;
-	const struct inode *inode = dentry->d_inode;
+	const struct inode *inode = d_backing_inode(dentry);
 	struct audit_names *n;
 	bool parent = flags & AUDIT_INODE_PARENT;
 
@@ -1853,7 +1853,7 @@ void __audit_inode_child(const struct inode *parent,
 			 const unsigned char type)
 {
 	struct audit_context *context = current->audit_context;
-	const struct inode *inode = dentry->d_inode;
+	const struct inode *inode = d_backing_inode(dentry);
 	const char *dname = dentry->d_name.name;
 	struct audit_names *n, *found_parent = NULL, *found_child = NULL;
 
@@ -2361,7 +2361,6 @@ static void audit_log_task(struct audit_buffer *ab)
 	kuid_t auid, uid;
 	kgid_t gid;
 	unsigned int sessionid;
-	struct mm_struct *mm = current->mm;
 	char comm[sizeof(current->comm)];
 
 	auid = audit_get_loginuid(current);
@@ -2376,13 +2375,7 @@ static void audit_log_task(struct audit_buffer *ab)
 	audit_log_task_context(ab);
 	audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
 	audit_log_untrustedstring(ab, get_task_comm(comm, current));
-	if (mm) {
-		down_read(&mm->mmap_sem);
-		if (mm->exe_file)
-			audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
-		up_read(&mm->mmap_sem);
-	} else
-		audit_log_format(ab, " exe=(null)");
+	audit_log_d_path_exe(ab, current->mm);
 }
 
 /**
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4139a0f8b558..54f0e7fcd0e2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -357,8 +357,8 @@ select_insn:
 	ALU64_MOD_X:
 		if (unlikely(SRC == 0))
 			return 0;
-		tmp = DST;
-		DST = do_div(tmp, SRC);
+		div64_u64_rem(DST, SRC, &tmp);
+		DST = tmp;
 		CONT;
 	ALU_MOD_X:
 		if (unlikely(SRC == 0))
@@ -367,8 +367,8 @@ select_insn:
 		DST = do_div(tmp, (u32) SRC);
 		CONT;
 	ALU64_MOD_K:
-		tmp = DST;
-		DST = do_div(tmp, IMM);
+		div64_u64_rem(DST, IMM, &tmp);
+		DST = tmp;
 		CONT;
 	ALU_MOD_K:
 		tmp = (u32) DST;
@@ -377,7 +377,7 @@ select_insn:
 	ALU64_DIV_X:
 		if (unlikely(SRC == 0))
 			return 0;
-		do_div(DST, SRC);
+		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
 		if (unlikely(SRC == 0))
@@ -387,7 +387,7 @@ select_insn:
 		DST = (u32) tmp;
 		CONT;
 	ALU64_DIV_K:
-		do_div(DST, IMM);
+		DST = div64_u64(DST, IMM);
 		CONT;
 	ALU_DIV_K:
 		tmp = (u32) DST;
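The point of the bpf/core.c hunks above: do_div() implements 64-by-32-bit division — the divisor is silently truncated to u32 and the macro's return value is the remainder — so it was wrong for BPF_ALU64, where SRC and IMM are full 64-bit values. div64_u64() and div64_u64_rem() divide 64 by 64. A compile-time sketch of the difference, assuming <linux/math64.h> (illustrative, not from the patch):

    	u64 n = 1000, d = 0x100000001ULL, q, r;
    	u32 r32;
    
    	q = n;
    	r32 = do_div(q, 7);		/* q = n / 7, r32 = n % 7; the divisor
    					 * is u32, so d above would have been
    					 * silently truncated to 1 */
    	q = div64_u64(n, d);		/* full 64/64 quotient */
    	q = div64_u64_rem(n, d, &r);	/* quotient plus 64-bit remainder */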
diff --git a/kernel/module.c b/kernel/module.c
index 650b038ae520..42a1d2afb217 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -387,9 +387,9 @@ static bool check_symbol(const struct symsearch *syms,
 		pr_warn("Symbol %s is marked as UNUSED, however this module is "
 			"using it.\n", fsa->name);
 		pr_warn("This symbol will go away in the future.\n");
-		pr_warn("Please evalute if this is the right api to use and if "
-			"it really is, submit a report the linux kernel "
-			"mailinglist together with submitting your code for "
+		pr_warn("Please evaluate if this is the right api to use and "
+			"if it really is, submit a report to the linux kernel "
+			"mailing list together with submitting your code for "
 			"inclusion.\n");
 	}
 #endif
@@ -2511,7 +2511,8 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
 		return err;
 
 	/* Suck in entire file: we'll want most of it. */
-	info->hdr = vmalloc(info->len);
+	info->hdr = __vmalloc(info->len,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
 	if (!info->hdr)
 		return -ENOMEM;
 
diff --git a/kernel/params.c b/kernel/params.c
index 728e05b167de..a22d6a759b1a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -173,9 +173,9 @@ static char *next_arg(char *args, char **param, char **val)
 			if (args[i-1] == '"')
 				args[i-1] = '\0';
 		}
-		if (quoted && args[i-1] == '"')
-			args[i-1] = '\0';
 	}
+	if (quoted && args[i-1] == '"')
+		args[i-1] = '\0';
 
 	if (args[i]) {
 		args[i] = '\0';
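In next_arg() above, the closing-quote strip was nested inside the branch taken only when another argument follows, so a quoted value at the very end of the command line kept its trailing '"'; hoisting it out of the block applies the rule on both exit paths. The rule itself, as a self-contained user-space sketch (not the kernel function):

    #include <string.h>
    
    /* Strip one surrounding pair of double quotes, in place. */
    static void strip_quotes(char *val)
    {
    	size_t n = strlen(val);
    
    	if (n >= 2 && val[0] == '"' && val[n - 1] == '"') {
    		val[n - 1] = '\0';
    		memmove(val, val + 1, n - 1);	/* drop the opening quote */
    	}
    }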
diff --git a/kernel/relay.c b/kernel/relay.c
index 5a56d3c8dc03..e9dbaeb8fd65 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,7 +407,7 @@ static inline void relay_set_buf_dentry(struct rchan_buf *buf,
 					struct dentry *dentry)
 {
 	buf->dentry = dentry;
-	buf->dentry->d_inode->i_size = buf->early_bytes;
+	d_inode(buf->dentry)->i_size = buf->early_bytes;
 }
 
 static struct dentry *relay_create_buf_file(struct rchan *chan,
@@ -733,7 +733,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		buf->padding[old_subbuf] = buf->prev_padding;
 		buf->subbufs_produced++;
 		if (buf->dentry)
-			buf->dentry->d_inode->i_size +=
+			d_inode(buf->dentry)->i_size +=
 				buf->chan->subbuf_size -
 				buf->padding[old_subbuf];
 		else
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 91eecaaa43e0..05330494a0df 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6079,7 +6079,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
 	struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
 
 	if (ret) /* See tracing_get_cpu() */
-		ret->d_inode->i_cdev = (void *)(cpu + 1);
+		d_inode(ret)->i_cdev = (void *)(cpu + 1);
 	return ret;
 }
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7da1dfeb322e..c4de47fc5cca 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -494,8 +494,8 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
 	if (dir) {
 		spin_lock(&dir->d_lock);	/* probably unneeded */
 		list_for_each_entry(child, &dir->d_subdirs, d_child) {
-			if (child->d_inode)	/* probably unneeded */
-				child->d_inode->i_private = NULL;
+			if (d_really_is_positive(child))	/* probably unneeded */
+				d_inode(child)->i_private = NULL;
 		}
 		spin_unlock(&dir->d_lock);
 
@@ -565,6 +565,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
 static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
 {
 	char *event = NULL, *sub = NULL, *match;
+	int ret;
 
 	/*
 	 * The buf format can be <subsystem>:<event-name>
@@ -590,7 +591,13 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
 			event = NULL;
 	}
 
-	return __ftrace_set_clr_event(tr, match, sub, event, set);
+	ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+
+	/* Put back the colon to allow this to be called again */
+	if (buf)
+		*(buf - 1) = ':';
+
+	return ret;
 }
 
 /**
@@ -1753,6 +1760,8 @@ static void update_event_printk(struct ftrace_event_call *call,
 				ptr++;
 				/* Check for alpha chars like ULL */
 			} while (isalnum(*ptr));
+			if (!*ptr)
+				break;
 			/*
 			 * A number must have some kind of delimiter after
 			 * it, and we can ignore that too.
@@ -1779,12 +1788,16 @@ static void update_event_printk(struct ftrace_event_call *call,
 			do {
 				ptr++;
 			} while (isalnum(*ptr) || *ptr == '_');
+			if (!*ptr)
+				break;
 			/*
 			 * If what comes after this variable is a '.' or
 			 * '->' then we can continue to ignore that string.
 			 */
 			if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) {
 				ptr += *ptr == '.' ? 1 : 2;
+				if (!*ptr)
+					break;
 				goto skip_more;
 			}
 			/*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9cfea4c6d314..a51e79688455 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1308,15 +1308,19 @@ void graph_trace_open(struct trace_iterator *iter)
 {
 	/* pid and depth on the last trace processed */
 	struct fgraph_data *data;
+	gfp_t gfpflags;
 	int cpu;
 
 	iter->private = NULL;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	/* We can be called in atomic context via ftrace_dump() */
+	gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
+
+	data = kzalloc(sizeof(*data), gfpflags);
 	if (!data)
 		goto out_err;
 
-	data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
+	data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags);
 	if (!data->cpu_data)
 		goto out_err_free;
 
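A small idiom from the ftrace_set_clr_event() hunk above is worth spelling out: strsep()-style splitting is destructive, so if the caller may parse the same buffer again, the separator has to be put back after use. User-space sketch of the same pattern (handle() is an illustrative callback, not from the file):

    #include <string.h>
    
    static int handle(const char *sys, const char *event);	/* illustrative */
    
    static int set_clr(char *buf)
    {
    	char *event = strchr(buf, ':');
    	int ret;
    
    	if (event)
    		*event++ = '\0';	/* split "<system>:<event>" in place */
    	ret = handle(buf, event);
    	if (event)
    		*(event - 1) = ':';	/* restore, so buf stays reusable */
    	return ret;
    }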
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index d60fe62ec4fa..6dd022c7b5bc 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -443,7 +443,7 @@ static int create_trace_uprobe(int argc, char **argv)
 	if (ret)
 		goto fail_address_parse;
 
-	inode = igrab(path.dentry->d_inode);
+	inode = igrab(d_inode(path.dentry));
 	path_put(&path);
 
 	if (!inode || !S_ISREG(inode->i_mode)) {
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index dbef2314901e..975c6e0434bd 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -131,11 +131,12 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
 static inline const struct raid6_calls *raid6_choose_gen(
 	void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
 {
-	unsigned long perf, bestperf, j0, j1;
+	unsigned long perf, bestgenperf, bestxorperf, j0, j1;
+	int start = (disks>>1)-1, stop = disks-3;	/* work on the second half of the disks */
 	const struct raid6_calls *const *algo;
 	const struct raid6_calls *best;
 
-	for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+	for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
 		if (!best || (*algo)->prefer >= best->prefer) {
 			if ((*algo)->valid && !(*algo)->valid())
 				continue;
@@ -153,19 +154,45 @@ static inline const struct raid6_calls *raid6_choose_gen(
 			}
 			preempt_enable();
 
-			if (perf > bestperf) {
-				bestperf = perf;
+			if (perf > bestgenperf) {
+				bestgenperf = perf;
 				best = *algo;
 			}
-			pr_info("raid6: %-8s %5ld MB/s\n", (*algo)->name,
+			pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name,
 			       (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
+
+			if (!(*algo)->xor_syndrome)
+				continue;
+
+			perf = 0;
+
+			preempt_disable();
+			j0 = jiffies;
+			while ((j1 = jiffies) == j0)
+				cpu_relax();
+			while (time_before(jiffies,
+					    j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+				(*algo)->xor_syndrome(disks, start, stop,
+						      PAGE_SIZE, *dptrs);
+				perf++;
+			}
+			preempt_enable();
+
+			if (best == *algo)
+				bestxorperf = perf;
+
+			pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name,
+				(perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1));
 		}
 	}
 
 	if (best) {
-		pr_info("raid6: using algorithm %s (%ld MB/s)\n",
+		pr_info("raid6: using algorithm %s gen() %ld MB/s\n",
 		       best->name,
-		       (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
+		       (bestgenperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
+		if (best->xor_syndrome)
+			pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
+			       (bestxorperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1));
 		raid6_call = *best;
 	} else
 		pr_err("raid6: Yikes!  No algorithm found!\n");
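How to read the gen()/xor() throughput lines above: perf counts calls completed in a (1 << RAID6_TIME_JIFFIES_LG2)-jiffy window, and each gen_syndrome() call walks 2^16 bytes of data (the dptrs array is sized (65536/PAGE_SIZE)+2). A sketch of the arithmetic behind the reported number:

    /* (perf calls * 2^16 bytes) / ((2^lg2)/HZ seconds) / 2^20 bytes-per-MB
     *	== (perf * HZ) >> (20 - 16 + lg2)
     * The xor() figure shifts by one more bit (xtra = 1) because the rmw
     * pass is timed over only the second half of the disks, i.e. roughly
     * half the data per call. */
    static unsigned long mbs(unsigned long perf, int lg2, int xtra)
    {
    	return (perf * HZ) >> (20 - 16 + lg2 + xtra);
    }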
diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc
index 7cc12b532e95..bec27fce7501 100644
--- a/lib/raid6/altivec.uc
+++ b/lib/raid6/altivec.uc
@@ -119,6 +119,7 @@ int raid6_have_altivec(void)
 
 const struct raid6_calls raid6_altivec$# = {
 	raid6_altivec$#_gen_syndrome,
+	NULL,			/* XOR not yet implemented */
 	raid6_have_altivec,
 	"altivecx$#",
 	0
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c
index bc3b1dd436eb..76734004358d 100644
--- a/lib/raid6/avx2.c
+++ b/lib/raid6/avx2.c
@@ -89,6 +89,7 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
 
 const struct raid6_calls raid6_avx2x1 = {
 	raid6_avx21_gen_syndrome,
+	NULL,			/* XOR not yet implemented */
 	raid6_have_avx2,
 	"avx2x1",
 	1			/* Has cache hints */
@@ -150,6 +151,7 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
 
 const struct raid6_calls raid6_avx2x2 = {
 	raid6_avx22_gen_syndrome,
+	NULL,			/* XOR not yet implemented */
 	raid6_have_avx2,
 	"avx2x2",
 	1			/* Has cache hints */
@@ -242,6 +244,7 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
 
 const struct raid6_calls raid6_avx2x4 = {
 	raid6_avx24_gen_syndrome,
+	NULL,			/* XOR not yet implemented */
 	raid6_have_avx2,
 	"avx2x4",
 	1			/* Has cache hints */
diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc
index 5b50f8dfc5d2..558aeac9342a 100644
--- a/lib/raid6/int.uc
+++ b/lib/raid6/int.uc
@@ -107,9 +107,48 @@ static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	}
 }
 
+static void raid6_int$#_xor_syndrome(int disks, int start, int stop,
+				     size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+		/* P/Q data pages */
+		wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+		for ( z = z0-1 ; z >= start ; z-- ) {
+			wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			wp$$ ^= wd$$;
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+			w2$$ &= NBYTES(0x1d);
+			w1$$ ^= w2$$;
+			wq$$ = w1$$ ^ wd$$;
+		}
+		/* P/Q left side optimization */
+		for ( z = start-1 ; z >= 0 ; z-- ) {
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+			w2$$ &= NBYTES(0x1d);
+			wq$$ = w1$$ ^ w2$$;
+		}
+		*(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		*(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+	}
+
+}
+
 const struct raid6_calls raid6_intx$# = {
 	raid6_int$#_gen_syndrome,
-	NULL,		/* always valid */
+	raid6_int$#_xor_syndrome,
+	NULL,		/* always valid */
 	"int" NSTRING "x$#",
 	0
 };
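The SHLBYTE()/MASK()/NBYTES(0x1d) sequence in raid6_int$#_xor_syndrome() above is a SIMD-within-a-register multiply by the generator g = 2 in GF(2^8). Q is the sum of g^z * D_z over all data disks, so walking z downward and multiplying the accumulator by 2 once per step lands each block at the right power of g; the start-1..0 loop keeps multiplying (without loading data) to account for the disks below the dirty range. Scalar equivalent for one byte, as a sketch (RAID-6 reduction polynomial x^8+x^4+x^3+x^2+1, i.e. 0x1d after the shift):

    #include <stdint.h>
    
    static inline uint8_t gf2_mul2(uint8_t v)
    {
    	/* shift left; if the top bit fell out, fold in the polynomial */
    	return (uint8_t)(v << 1) ^ ((v & 0x80) ? 0x1d : 0);
    }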
volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); + } + const struct raid6_calls raid6_sse2x2 = { raid6_sse22_gen_syndrome, + raid6_sse22_xor_syndrome, raid6_have_sse2, "sse2x2", 1 /* Has cache hints */ @@ -248,8 +366,117 @@ static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } + static void raid6_sse24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) + { + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 64 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32])); + asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32])); + asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + asm volatile("pxor %xmm12,%xmm10"); + asm volatile("pxor %xmm14,%xmm11"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm 
volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+32])); + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32])); + asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); + } + + const struct raid6_calls raid6_sse2x4 = { raid6_sse24_gen_syndrome, + raid6_sse24_xor_syndrome, raid6_have_sse2, "sse2x4", 1 /* Has cache hints */ diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 5a485b7a7d3c..3bebbabdb510 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -28,11 +28,11 @@ char *dataptrs[NDISKS]; char data[NDISKS][PAGE_SIZE]; char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; -static void makedata(void) +static void makedata(int start, int stop) { int i, j; - for (i = 0; i < NDISKS; i++) { + for (i = start; i <= stop; i++) { for (j = 0; j < PAGE_SIZE; j++) data[i][j] = rand(); @@ -91,34 +91,55 @@ int main(int argc, char *argv[]) { const struct raid6_calls *const *algo; const struct raid6_recov_calls *const *ra; - int i, j; + int i, j, p1, p2; int err = 0; - makedata(); + makedata(0, NDISKS-1); for (ra = raid6_recov_algos; *ra; ra++) { if ((*ra)->valid && !(*ra)->valid()) continue; + raid6_2data_recov = (*ra)->data2; raid6_datap_recov = (*ra)->datap; printf("using recovery %s\n", (*ra)->name); for (algo = raid6_algos; *algo; algo++) { - if (!(*algo)->valid || (*algo)->valid()) { - raid6_call = **algo; + if ((*algo)->valid && !(*algo)->valid()) + continue; + + raid6_call = **algo; + + /* Nuke syndromes */ + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + + /* Generate assumed good syndrome 
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 5a485b7a7d3c..3bebbabdb510 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -28,11 +28,11 @@ char *dataptrs[NDISKS];
 char data[NDISKS][PAGE_SIZE];
 char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
 
-static void makedata(void)
+static void makedata(int start, int stop)
 {
 	int i, j;
 
-	for (i = 0; i < NDISKS; i++) {
+	for (i = start; i <= stop; i++) {
 		for (j = 0; j < PAGE_SIZE; j++)
 			data[i][j] = rand();
 
@@ -91,34 +91,55 @@ int main(int argc, char *argv[])
 {
 	const struct raid6_calls *const *algo;
 	const struct raid6_recov_calls *const *ra;
-	int i, j;
+	int i, j, p1, p2;
 	int err = 0;
 
-	makedata();
+	makedata(0, NDISKS-1);
 
 	for (ra = raid6_recov_algos; *ra; ra++) {
 		if ((*ra)->valid && !(*ra)->valid())
 			continue;
+
 		raid6_2data_recov = (*ra)->data2;
 		raid6_datap_recov = (*ra)->datap;
 
 		printf("using recovery %s\n", (*ra)->name);
 
 		for (algo = raid6_algos; *algo; algo++) {
-			if (!(*algo)->valid || (*algo)->valid()) {
-				raid6_call = **algo;
+			if ((*algo)->valid && !(*algo)->valid())
+				continue;
+
+			raid6_call = **algo;
+
+			/* Nuke syndromes */
+			memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+
+			/* Generate assumed good syndrome */
+			raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
+						(void **)&dataptrs);
+
+			for (i = 0; i < NDISKS-1; i++)
+				for (j = i+1; j < NDISKS; j++)
+					err += test_disks(i, j);
+
+			if (!raid6_call.xor_syndrome)
+				continue;
+
+			for (p1 = 0; p1 < NDISKS-2; p1++)
+				for (p2 = p1; p2 < NDISKS-2; p2++) {
 
-				/* Nuke syndromes */
-				memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
+					/* Simulate rmw run */
+					raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE,
+								(void **)&dataptrs);
+					makedata(p1, p2);
+					raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE,
+								(void **)&dataptrs);
 
-				/* Generate assumed good syndrome */
-				raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
-							(void **)&dataptrs);
+					for (i = 0; i < NDISKS-1; i++)
+						for (j = i+1; j < NDISKS; j++)
+							err += test_disks(i, j);
+				}
 
-				for (i = 0; i < NDISKS-1; i++)
-					for (j = i+1; j < NDISKS; j++)
-						err += test_disks(i, j);
-			}
 		}
 		printf("\n");
 	}
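The doubled xor_syndrome() call above exercises the read-modify-write contract: the first call xors the old contributions of disks p1..p2 out of P/Q, makedata() rewrites those disks, and the second call xors the new contributions back in; the result must match a full gen_syndrome(), which the nested test_disks() loops then verify. Per byte column and disk z this amounts to the following (user-space sketch; gf_mul() is a local helper written here for illustration, not part of the file):

    #include <stdint.h>
    
    static uint8_t gf_mul(uint8_t a, uint8_t b)	/* GF(2^8), poly 0x11d */
    {
    	uint8_t r = 0;
    
    	while (b) {
    		if (b & 1)
    			r ^= a;
    		a = (uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0);
    		b >>= 1;
    	}
    	return r;
    }
    
    /* One rmw step: P/Q absorb only the delta of the rewritten block. */
    static void rmw_column(uint8_t *p, uint8_t *q, uint8_t *d,
    		       uint8_t newval, uint8_t g_pow_z)
    {
    	uint8_t delta = *d ^ newval;
    
    	*p ^= delta;			/* P: plain parity delta */
    	*q ^= gf_mul(g_pow_z, delta);	/* Q: delta weighted by g^z */
    	*d = newval;
    }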
diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc
index e7c29459cbcd..2dd291a11264 100644
--- a/lib/raid6/tilegx.uc
+++ b/lib/raid6/tilegx.uc
@@ -80,6 +80,7 @@ void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
 
 const struct raid6_calls raid6_tilegx$# = {
 	raid6_tilegx$#_gen_syndrome,
+	NULL,			/* XOR not yet implemented */
 	NULL,
 	"tilegx$#",
 	0
diff --git a/mm/shmem.c b/mm/shmem.c
index 1ea2400b5245..de981370fbc5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
 static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_inode(dentry);
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int error;
 
@@ -2274,7 +2274,7 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
  */
 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode = old_dentry->d_inode;
+	struct inode *inode = d_inode(old_dentry);
 	int ret;
 
 	/*
@@ -2298,7 +2298,7 @@ out:
 
 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_inode(dentry);
 
 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
 		shmem_free_inode(inode->i_sb);
@@ -2315,7 +2315,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!simple_empty(dentry))
 		return -ENOTEMPTY;
 
-	drop_nlink(dentry->d_inode);
+	drop_nlink(d_inode(dentry));
 	drop_nlink(dir);
 	return shmem_unlink(dir, dentry);
 }
@@ -2336,8 +2336,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
 	}
 	old_dir->i_ctime = old_dir->i_mtime =
 	new_dir->i_ctime = new_dir->i_mtime =
-	old_dentry->d_inode->i_ctime =
-	new_dentry->d_inode->i_ctime = CURRENT_TIME;
+	d_inode(old_dentry)->i_ctime =
+	d_inode(new_dentry)->i_ctime = CURRENT_TIME;
 
 	return 0;
 }
@@ -2376,7 +2376,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
  */
 static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
 {
-	struct inode *inode = old_dentry->d_inode;
+	struct inode *inode = d_inode(old_dentry);
 	int they_are_dirs = S_ISDIR(inode->i_mode);
 
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
@@ -2396,10 +2396,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
 		return error;
 	}
 
-	if (new_dentry->d_inode) {
+	if (d_really_is_positive(new_dentry)) {
 		(void) shmem_unlink(new_dir, new_dentry);
 		if (they_are_dirs) {
-			drop_nlink(new_dentry->d_inode);
+			drop_nlink(d_inode(new_dentry));
 			drop_nlink(old_dir);
 		}
 	} else if (they_are_dirs) {
@@ -2476,14 +2476,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 
 static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
 {
-	nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
+	nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink);
 	return NULL;
 }
 
 static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct page *page = NULL;
-	int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
+	int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
 	nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
 	if (page)
 		unlock_page(page);
@@ -2574,7 +2574,7 @@ static int shmem_xattr_validate(const char *name)
 static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
 			      void *buffer, size_t size)
 {
-	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
 	int err;
 
 	/*
@@ -2595,7 +2595,7 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
 static int shmem_setxattr(struct dentry *dentry, const char *name,
 			  const void *value, size_t size, int flags)
 {
-	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
 	int err;
 
 	/*
@@ -2615,7 +2615,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
 
 static int shmem_removexattr(struct dentry *dentry, const char *name)
 {
-	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
 	int err;
 
 	/*
@@ -2635,7 +2635,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
 
 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-	struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
 	return simple_xattr_list(&info->xattrs, buffer, size);
 }
 #endif /* CONFIG_TMPFS_XATTR */
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index ec565508e904..79e8f71aef5b 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -490,6 +490,43 @@ out:
 }
 EXPORT_SYMBOL(ceph_parse_options);
 
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
+{
+	struct ceph_options *opt = client->options;
+	size_t pos = m->count;
+
+	if (opt->name)
+		seq_printf(m, "name=%s,", opt->name);
+	if (opt->key)
+		seq_puts(m, "secret=<hidden>,");
+
+	if (opt->flags & CEPH_OPT_FSID)
+		seq_printf(m, "fsid=%pU,", &opt->fsid);
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, "noshare,");
+	if (opt->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, "nocrc,");
+	if (opt->flags & CEPH_OPT_NOMSGAUTH)
+		seq_puts(m, "nocephx_require_signatures,");
+	if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+		seq_puts(m, "notcp_nodelay,");
+
+	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, "osdkeepalivetimeout=%d,",
+			   opt->osd_keepalive_timeout);
+
+	/* drop redundant comma */
+	if (m->count != pos)
+		m->count--;
+
+	return 0;
+}
+EXPORT_SYMBOL(ceph_print_client_options);
+
 u64 ceph_client_id(struct ceph_client *client)
 {
 	return client->monc.auth->global_id;
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
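ceph_print_client_options() above uses a small seq_file idiom: every option is printed unconditionally with a trailing comma, and if anything was printed at all (m->count moved), the last comma is erased by rewinding the count by one byte. This works because seq_file output is still just an in-memory buffer at that point, and it avoids tracking "is this the first option?". Reduced to its core (a sketch with illustrative option names):

    static void emit_opts(struct seq_file *m)
    {
    	size_t pos = m->count;		/* buffer position on entry */
    
    	seq_puts(m, "foo,");		/* illustrative options */
    	seq_puts(m, "bar,");
    
    	if (m->count != pos)		/* printed at least one? */
    		m->count--;		/* drop the redundant ',' */
    }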
index 16bc199d9a62..9d84ce4ea0df 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -17,6 +17,7 @@ const char *crush_bucket_alg_name(int alg)
 	case CRUSH_BUCKET_LIST: return "list";
 	case CRUSH_BUCKET_TREE: return "tree";
 	case CRUSH_BUCKET_STRAW: return "straw";
+	case CRUSH_BUCKET_STRAW2: return "straw2";
 	default: return "unknown";
 	}
 }
@@ -40,6 +41,8 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 		return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
 	case CRUSH_BUCKET_STRAW:
 		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	case CRUSH_BUCKET_STRAW2:
+		return ((struct crush_bucket_straw2 *)b)->item_weights[p];
 	}
 	return 0;
 }
@@ -77,6 +80,14 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 	kfree(b);
 }
 
+void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
+{
+	kfree(b->item_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
 void crush_destroy_bucket(struct crush_bucket *b)
 {
 	switch (b->alg) {
@@ -92,6 +103,9 @@ void crush_destroy_bucket(struct crush_bucket *b)
 	case CRUSH_BUCKET_STRAW:
 		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
 		break;
+	case CRUSH_BUCKET_STRAW2:
+		crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
+		break;
 	}
 }
 
diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h
new file mode 100644
index 000000000000..6192c7fc958c
--- /dev/null
+++ b/net/ceph/crush/crush_ln_table.h
@@ -0,0 +1,166 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+
+#ifndef CEPH_CRUSH_LN_H
+#define CEPH_CRUSH_LN_H
+
+
+// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+
+static int64_t __RH_LH_tbl[128*2+2] = {
+  0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
+  0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
+  0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
+  0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll,
+  0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll,
+  0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll,
+  0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll,
+  0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell,
+  0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll,
+  0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll,
+  0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll,
+  0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll,
+  0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll,
+  0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll,
+  0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all,
+  0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll,
+  0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all,
+  0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell,
+  0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll,
+  0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll,
+  0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll,
+  0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll,
+  0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll,
+  0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll,
+  0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll,
+  0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll,
+  0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell,
+  0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll,
+  0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll,
+  0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll,
+  0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll,
+  0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll,
+  0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll,
+  0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll,
+  0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll,
+  0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll,
+  0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll,
+  0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll,
+  0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll,
+  0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll,
+  0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll,
+  0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll,
+  0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll,
+  0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll,
+  0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll,
+  0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll,
+  0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll,
+  0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll,
+  0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll,
+  0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll,
+  0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll,
+  0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll,
+  0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll,
+  0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell,
+  0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell,
+  0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll,
+  0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell,
+  0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll,
+  0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll,
+  0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll,
+  0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll,
+  0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll,
+  0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
+  0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
+  0x0000800000000000ll, 0x0000ffff00000000ll,
+  };
+
+
+  // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
+static int64_t __LL_tbl[256] = {
+  0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
+  0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
+  0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
+  0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull,
+  0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull,
+  0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull,
+  0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull,
+  0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull,
+  0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull,
+  0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull,
+  0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull,
+  0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull,
+  0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull,
+  0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull,
+  0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull,
+  0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull,
+  0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull,
+  0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull,
+  0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull,
+  0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull,
+  0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull,
+  0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull,
+  0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull,
+  0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull,
+  0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull,
+  0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull,
+  0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull,
+  0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull,
+  0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull,
+  0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull,
+  0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull,
+  0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull,
+  0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull,
+  0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull,
+  0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull,
+  0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull,
+  0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull,
+  0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull,
+  0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull,
+  0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull,
+  0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull,
+  0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull,
+  0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull,
+  0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull,
+  0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull,
+  0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull,
+  0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull,
+  0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull,
+  0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull,
+  0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull,
+  0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull,
+  0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull,
+  0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull,
+  0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull,
+  0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull,
+  0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull,
+  0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull,
+  0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull,
+  0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull,
+  0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull,
+  0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull,
+  0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull,
+  0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull,
+  0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
+};
+
+
+
+
+#endif
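The two tables above exist so that crush_ln(), added in mapper.c just below, can evaluate roughly 2^44 * log2(x+1) with integer operations only: normalize x+1 into [2^15, 2^16) while counting the exponent, read a coarse log2 plus a reciprocal from __RH_LH_tbl (1/128 steps), and correct with the fine-grained __LL_tbl residual. A floating-point model of the same decomposition, for intuition only (user-space sketch, not the kernel code):

    #include <math.h>
    #include <stdint.h>
    
    static double crush_ln_model(unsigned xin)
    {
    	uint32_t x = xin + 1;
    	int iexpon = 15;
    
    	while (!(x & 0x18000)) {	/* same normalization as crush_ln() */
    		x <<= 1;
    		iexpon--;
    	}
    	/* crush_ln() returns (approximately) this value scaled by 2^44;
    	 * for xin = 0xffff that is 16 * 2^44 = 2^48, matching the range
    	 * quoted in the straw2 comment below. */
    	return iexpon + log2((double)x / 32768.0);
    }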
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a1ef53c04415..5b47736d27d9 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -20,7 +20,7 @@
 
 #include <linux/crush/crush.h>
 #include <linux/crush/hash.h>
-#include <linux/crush/mapper.h>
+#include "crush_ln_table.h"
 
 /*
  * Implement the core CRUSH mapping algorithm.
@@ -238,6 +238,102 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 	return bucket->h.items[high];
 }
 
+// compute 2^44*log2(input+1)
+uint64_t crush_ln(unsigned xin)
+{
+	unsigned x=xin, x1;
+	int iexpon, index1, index2;
+	uint64_t RH, LH, LL, xl64, result;
+
+	x++;
+
+	// normalize input
+	iexpon = 15;
+	while(!(x&0x18000)) { x<<=1; iexpon--; }
+
+	index1 = (x>>8)<<1;
+	// RH ~ 2^56/index1
+	RH = __RH_LH_tbl[index1 - 256];
+	// LH ~ 2^48 * log2(index1/256)
+	LH = __RH_LH_tbl[index1 + 1 - 256];
+
+	// RH*x ~ 2^48 * (2^15 + xf), xf<2^8
+	xl64 = (int64_t)x * RH;
+	xl64 >>= 48;
+	x1 = xl64;
+
+	result = iexpon;
+	result <<= (12 + 32);
+
+	index2 = x1 & 0xff;
+	// LL ~ 2^48*log2(1.0+index2/2^15)
+	LL = __LL_tbl[index2];
+
+	LH = LH + LL;
+
+	LH >>= (48-12 - 32);
+	result += LH;
+
+	return result;
+}
+
+
+/*
+ * straw2
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ *
+ */
+
+static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+				int x, int r)
+{
+	unsigned i, high = 0;
+	unsigned u;
+	unsigned w;
+	__s64 ln, draw, high_draw = 0;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		w = bucket->item_weights[i];
+		if (w) {
+			u = crush_hash32_3(bucket->h.hash, x,
+					   bucket->h.items[i], r);
+			u &= 0xffff;
+
+			/*
+			 * for some reason slightly less than 0x10000 produces
+			 * a slightly more accurate distribution... probably a
+			 * rounding effect.
+			 *
+			 * the natural log lookup table maps [0,0xffff]
+			 * (corresponding to real numbers [1/0x10000, 1] to
+			 * [0, 0xffffffffffff] (corresponding to real numbers
+			 * [-11.090355,0]).
+			 */
+			ln = crush_ln(u) - 0x1000000000000ll;
+
+			/*
+			 * divide by 16.16 fixed-point weight.  note
+			 * that the ln value is negative, so a larger
+			 * weight means a larger (less negative) value
+			 * for draw.
+			 */
+			draw = div64_s64(ln, w);
+		} else {
+			draw = S64_MIN;
+		}
+
+		if (i == 0 || draw > high_draw) {
+			high = i;
+			high_draw = draw;
+		}
+	}
+	return bucket->h.items[high];
+}
+
+
 static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
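Why the ln(u)/w draw in bucket_straw2_choose() selects items with exactly weight-proportional probability: if u is uniform in (0,1], then -ln(u)/w is exponentially distributed with rate w, and the minimum of independent exponentials lands on item i with probability w_i / sum(w) — this is the property the Wikipedia reference above is pointing at. Taking the maximum of the (negative) ln(u)/w values is the same selection, and it stays stable when other items' weights change, which is straw2's improvement over straw. A user-space model of the draw (a sketch; drand48() stands in for the 16-bit hash):

    #include <math.h>
    #include <stdlib.h>
    
    static int straw2_pick(const double *w, int n)
    {
    	double draw, high = -INFINITY;
    	int i, best = -1;
    
    	for (i = 0; i < n; i++) {
    		if (w[i] <= 0)
    			continue;
    		/* 1.0 - drand48() is uniform in (0, 1] */
    		draw = log(1.0 - drand48()) / w[i];	/* <= 0 */
    		if (best < 0 || draw > high) {
    			high = draw;
    			best = i;
    		}
    	}
    	return best;	/* item i wins with probability w[i]/sum(w) */
    }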
@@ -255,12 +351,16 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 	case CRUSH_BUCKET_STRAW:
 		return bucket_straw_choose((struct crush_bucket_straw *)in,
 					   x, r);
+	case CRUSH_BUCKET_STRAW2:
+		return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
+					    x, r);
 	default:
 		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
 	}
 }
 
+
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -290,6 +390,7 @@ static int is_out(const struct crush_map *map,
 * @type: the type of item to choose
 * @out: pointer to output vector
 * @outpos: our position in that vector
+ * @out_size: size of the out vector
 * @tries: number of attempts to make
 * @recurse_tries: number of attempts to have recursive chooseleaf make
 * @local_retries: localized retries
@@ -304,6 +405,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 			       const __u32 *weight, int weight_max,
 			       int x, int numrep, int type,
 			       int *out, int outpos,
+			       int out_size,
 			       unsigned int tries,
 			       unsigned int recurse_tries,
 			       unsigned int local_retries,
@@ -322,6 +424,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
+	int count = out_size;
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
 		recurse_to_leaf ? "_LEAF" : "",
@@ -329,7 +432,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 		tries, recurse_tries, local_retries, local_fallback_retries,
 		parent_r);
 
-	for (rep = outpos; rep < numrep; rep++) {
+	for (rep = outpos; rep < numrep && count > 0 ; rep++) {
 		/* keep trying until we get a non-out, non-colliding item */
 		ftotal = 0;
 		skip_rep = 0;
@@ -403,7 +506,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 							 map->buckets[-1-item],
 							 weight, weight_max,
 							 x, outpos+1, 0,
-							 out2, outpos,
+							 out2, outpos, count,
 							 recurse_tries, 0,
 							 local_retries,
 							 local_fallback_retries,
@@ -463,6 +566,7 @@ reject:
 		dprintk("CHOOSE got %d\n", item);
 		out[outpos] = item;
 		outpos++;
+		count--;
 	}
 
 	dprintk("CHOOSE returns %d\n", outpos);
@@ -654,6 +758,7 @@ int crush_do_rule(const struct crush_map *map,
 	__u32 step;
 	int i, j;
 	int numrep;
+	int out_size;
 	/*
 	 * the original choose_total_tries value was off by one (it
 	 * counted "retries" and not "tries").  add one.
@@ -761,6 +866,7 @@ int crush_do_rule(const struct crush_map *map,
 						x, numrep,
 						curstep->arg2,
 						o+osize, j,
+						result_max-osize,
 						choose_tries,
 						recurse_tries,
 						choose_local_retries,
@@ -770,11 +876,13 @@ int crush_do_rule(const struct crush_map *map,
 						c+osize,
 						0);
 			} else {
+				out_size = ((numrep < (result_max-osize)) ?
+					    numrep : (result_max-osize));
 				crush_choose_indep(
 					map,
 					map->buckets[-1-w[i]],
 					weight, weight_max,
-					x, numrep, numrep,
+					x, out_size, numrep,
 					curstep->arg2,
 					o+osize, j,
 					choose_tries,
@@ -783,7 +891,7 @@ int crush_do_rule(const struct crush_map *map,
 					recurse_to_leaf,
 					c+osize,
 					0);
-				osize += numrep;
+				osize += out_size;
 			}
 		}
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 14d9995097cc..593dc2eabcc8 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -22,6 +22,7 @@
 *      .../monmap      - current monmap
 *      .../osdc        - active osd requests
 *      .../monc        - mon client state
+ *      .../client_options - libceph-only (i.e. not rbd or cephfs) options
 *      .../dentry_lru  - dump contents of dentry lru
 *      .../caps        - expose cap (reservation) stats
 *      .../bdi         - symlink to ../../bdi/something
@@ -177,10 +178,24 @@ static int osdc_show(struct seq_file *s, void *pp)
 	return 0;
 }
 
+static int client_options_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	int ret;
+
+	ret = ceph_print_client_options(s, client);
+	if (ret)
+		return ret;
+
+	seq_putc(s, '\n');
+	return 0;
+}
+
 CEPH_DEFINE_SHOW_FUNC(monmap_show)
 CEPH_DEFINE_SHOW_FUNC(osdmap_show)
 CEPH_DEFINE_SHOW_FUNC(monc_show)
 CEPH_DEFINE_SHOW_FUNC(osdc_show)
+CEPH_DEFINE_SHOW_FUNC(client_options_show)
 
 int ceph_debugfs_init(void)
 {
@@ -242,6 +257,14 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	if (!client->debugfs_osdmap)
 		goto out;
 
+	client->debugfs_options = debugfs_create_file("client_options",
+					0600,
+					client->debugfs_dir,
+					client,
+					&client_options_show_fops);
+	if (!client->debugfs_options)
+		goto out;
+
 	return 0;
 
 out:
@@ -252,6 +275,7 @@ out:
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
 {
 	dout("ceph_debugfs_client_cleanup %p\n", client);
+	debugfs_remove(client->debugfs_options);
 	debugfs_remove(client->debugfs_osdmap);
 	debugfs_remove(client->debugfs_monmap);
 	debugfs_remove(client->osdc.debugfs_file);
*/ @@ -2660,6 +2655,8 @@ more: switch (ret) { case -EBADMSG: con->error_msg = "bad crc"; + /* fall through */ + case -EBADE: ret = -EIO; break; case -EIO: @@ -2838,7 +2835,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on read"; + if (!con->error_msg) + con->error_msg = "socket error on read"; fault = true; break; } @@ -2847,7 +2845,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on write"; + if (!con->error_msg) + con->error_msg = "socket error on write"; fault = true; } @@ -2869,11 +2868,13 @@ static void con_work(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + con->error_msg = NULL; + WARN_ON(con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN); @@ -3295,8 +3296,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) */ if (*skip) return 0; - con->error_msg = "error allocating memory for incoming message"; + con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b8c3fde5b04f..15796696d64e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -122,6 +122,22 @@ bad: return -EINVAL; } +static int crush_decode_straw2_bucket(void **p, void *end, + struct crush_bucket_straw2 *b) +{ + int j; + dout("crush_decode_straw2_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) + b->item_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + static int skip_name_map(void **p, void *end) { int len; @@ -204,6 +220,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) case CRUSH_BUCKET_STRAW: size = sizeof(struct crush_bucket_straw); break; + case CRUSH_BUCKET_STRAW2: + size = sizeof(struct crush_bucket_straw2); + break; default: err = -EINVAL; goto bad; @@ -261,6 +280,12 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (err < 0) goto bad; break; + case CRUSH_BUCKET_STRAW2: + err = crush_decode_straw2_bucket(p, end, + (struct crush_bucket_straw2 *)b); + if (err < 0) + goto bad; + break; } } diff --git a/net/core/dev.c b/net/core/dev.c index 1796cef55ab5..c7ba0388f1be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3079,7 +3079,7 @@ static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - if (next_cpu != RPS_NO_CPU) { + if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -3184,7 +3184,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: - * - Current CPU is unset (equal to RPS_NO_CPU). 
+ * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. @@ -3192,14 +3192,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && - (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } - if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; @@ -3240,14 +3240,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - int cpu; + unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = ACCESS_ONCE(rflow->cpu); - if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 456ead534e10..3cfff2a3d651 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -280,13 +280,14 @@ nodata: EXPORT_SYMBOL(__alloc_skb); /** - * build_skb - build a network buffer + * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of fragment, or 0 if head was kmalloced + * @frag_size: size of data, or 0 if head was kmalloced * * Allocate a new &sk_buff. Caller provides space holding head and * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator. + * @frag_size is 0, otherwise data should come from the page allocator + * or vmalloc() * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : @@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb); * before giving packet to stack. * RX rings only contains data buffers, not full skbs. 
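 *
 * A minimal usage sketch (illustrative only):
 *
 *	data = kmalloc(size, gfp);	kmalloc'ed head
 *	skb = __build_skb(data, 0);
 *
 *	data = vmalloc(size);		vmalloc'ed head
 *	skb = __build_skb(data, size);
 *
 * Callers whose head is backed by a page fragment should use the
 * build_skb() wrapper below, which also sets skb->head_frag and
 * skb->pfmemalloc.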
*/ -struct sk_buff *build_skb(void *data, unsigned int frag_size) +struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct skb_shared_info *shinfo; struct sk_buff *skb; @@ -311,11 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - if (frag_size) { - skb->head_frag = 1; - if (virt_to_head_page(data)->pfmemalloc) - skb->pfmemalloc = 1; - } atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -332,6 +328,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) return skb; } + +/* build_skb() is a wrapper over __build_skb() that additionally + * takes care of skb->head_frag and skb->pfmemalloc. + * This means that if @frag_size is not zero, then @data must be backed + * by a page fragment, not kmalloc() or vmalloc(). + */ +struct sk_buff *build_skb(void *data, unsigned int frag_size) +{ + struct sk_buff *skb = __build_skb(data, frag_size); + + if (skb && frag_size) { + skb->head_frag = 1; + if (virt_to_head_page(data)->pfmemalloc) + skb->pfmemalloc = 1; + } + return skb; +} EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 57d3e1af5630..0522fc9bfb0a 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 62cabee42fbe..635dbba93d01 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -108,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb, if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; + default: + break; } return 0; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 19909d0786a2..ec4adbdcb9b4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1629,13 +1629,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size, if (data == NULL) return NULL; - skb = build_skb(data, size); + skb = __build_skb(data, size); if (skb == NULL) vfree(data); - else { - skb->head_frag = 0; + else skb->destructor = netlink_skb_destructor; - } return skb; } diff --git a/net/socket.c b/net/socket.c index 3e33959f3ce5..884e32997698 100644 --- a/net/socket.c +++ b/net/socket.c @@ -312,7 +312,7 @@ static const struct super_operations sockfs_ops = { static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", - dentry->d_inode->i_ino); + d_inode(dentry)->i_ino); } static const struct dentry_operations sockfs_dentry_operations = { @@ -375,7 +375,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) &socket_file_ops); if (unlikely(IS_ERR(file))) { /* drop dentry, keep inode */ - ihold(path.dentry->d_inode); + ihold(d_inode(path.dentry)); path_put(&path); return file; } @@ -497,7 +497,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, ssize_t len; ssize_t used = 0; - len = security_inode_listsecurity(dentry->d_inode, buffer, size); + len = security_inode_listsecurity(d_inode(dentry), buffer, size); if (len < 0) return len; used += len; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 2d12b76b5a64..d81186d34558 100644
--- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -94,7 +94,7 @@ rpc_timeout_upcall_queue(struct work_struct *work) } dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); - rpc_purge_list(dentry ? &RPC_I(dentry->d_inode)->waitq : NULL, + rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL, &free_list, destroy_msg, -ETIMEDOUT); dput(dentry); } @@ -152,7 +152,7 @@ rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg) dentry = dget(pipe->dentry); spin_unlock(&pipe->lock); if (dentry) { - wake_up(&RPC_I(dentry->d_inode)->waitq); + wake_up(&RPC_I(d_inode(dentry))->waitq); dput(dentry); } return res; @@ -591,7 +591,7 @@ static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry, err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); if (err) return err; - rpci = RPC_I(dentry->d_inode); + rpci = RPC_I(d_inode(dentry)); rpci->private = private; rpci->pipe = pipe; fsnotify_create(dir, dentry); @@ -616,7 +616,7 @@ int rpc_rmdir(struct dentry *dentry) int error; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); mutex_unlock(&dir->i_mutex); @@ -638,7 +638,7 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry) static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); rpc_close_pipes(inode); return __rpc_unlink(dir, dentry); @@ -654,7 +654,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, if (!dentry) return ERR_PTR(-ENOMEM); } - if (dentry->d_inode == NULL) + if (d_really_is_negative(dentry)) return dentry; dput(dentry); return ERR_PTR(-EEXIST); @@ -667,7 +667,7 @@ static void __rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct dentry *dentry; struct qstr name; int i; @@ -679,9 +679,9 @@ static void __rpc_depopulate(struct dentry *parent, if (dentry == NULL) continue; - if (dentry->d_inode == NULL) + if (d_really_is_negative(dentry)) goto next; - switch (dentry->d_inode->i_mode & S_IFMT) { + switch (d_inode(dentry)->i_mode & S_IFMT) { default: BUG(); case S_IFREG: @@ -699,7 +699,7 @@ static void rpc_depopulate(struct dentry *parent, const struct rpc_filelist *files, int start, int eof) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); @@ -711,7 +711,7 @@ static int rpc_populate(struct dentry *parent, int start, int eof, void *private) { - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct dentry *dentry; int i, err; @@ -754,7 +754,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, int (*populate)(struct dentry *, void *), void *args_populate) { struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); int error; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); @@ -787,7 +787,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, int error; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); @@ -819,7 +819,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) { struct 
dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR; int err; @@ -864,7 +864,7 @@ rpc_unlink(struct dentry *dentry) int error = 0; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); mutex_unlock(&dir->i_mutex); @@ -1375,7 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) struct dentry *clnt_dir = pipe_dentry->d_parent; struct dentry *gssd_dir = clnt_dir->d_parent; - __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry); + __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); dput(pipe_dentry); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b91fd9c597b4..337ca851a350 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) if (!task->tk_timeout) return; - dprintk("RPC: %5u setting alarm for %lu ms\n", - task->tk_pid, task->tk_timeout * 1000 / HZ); + dprintk("RPC: %5u setting alarm for %u ms\n", + task->tk_pid, jiffies_to_msecs(task->tk_timeout)); task->u.tk_wait.expires = jiffies + task->tk_timeout; if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9949722d99ce..1d4fe24af06a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -326,6 +326,15 @@ out_unlock: xprt_clear_locked(xprt); } +static void xprt_task_clear_bytes_sent(struct rpc_task *task) +{ + if (task != NULL) { + struct rpc_rqst *req = task->tk_rqstp; + if (req != NULL) + req->rq_bytes_sent = 0; + } +} + /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -336,11 +345,7 @@ out_unlock: void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index da5136fd5694..579f72bbcf4b 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o -xprtrdma-y := transport.o rpc_rdma.o verbs.o +xprtrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o physical_ops.o obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 000000000000..302d4ebf6fbf --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. 
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* FMR mode conveys up to 64 pages of payload per chunk segment. + */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); +} + +static int +fmr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int i, rc; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) + goto out_fmr_err; + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_fmr_err: + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); + kfree(r); + return rc; +} + +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + + rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma); + if (rc) + goto out_maperr; + + seg1->mr_rkey = mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_maperr: + dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + __func__, len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Use the ib_unmap_fmr() verb to prevent further remote + * access via RDMA READ or RDMA WRITE. 
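+ *
+ * ib_unmap_fmr() takes a list head, so several FMRs could be
+ * unmapped with a single verb call; this path batches just one,
+ * roughly:
+ *
+ *	LIST_HEAD(l);
+ *	list_add(&mw->r.fmr->list, &l);
+ *	rc = ib_unmap_fmr(&l);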
+ */ +static int +fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_device *device; + int rc, nsegs = seg->mr_nsegs; + LIST_HEAD(l); + + list_add(&seg1->rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, unmap all FMRs. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_fmr_external(). + */ +static void +fmr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mw *r; + LIST_HEAD(list); + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) + list_add(&r->r.fmr->list, &list); + + rc = ib_unmap_fmr(&list); + if (rc) + dprintk("RPC: %s: ib_unmap_fmr failed %i\n", + __func__, rc); +} + +static void +fmr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, + .ro_unmap = fmr_op_unmap, + .ro_open = fmr_op_open, + .ro_maxpages = fmr_op_maxpages, + .ro_init = fmr_op_init, + .ro_reset = fmr_op_reset, + .ro_destroy = fmr_op_destroy, + .ro_displayname = "fmr", +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 000000000000..dff0481dbcf8 --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). Also referred to sometimes as FRMR mode. + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. This is the fastest and safest + * but most complex memory registration mode. 
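+ *
+ * Per chunk, the verb flow is roughly:
+ *
+ *	FAST_REG_MR (advertise a fresh rkey to the server)
+ *	  -> server performs RDMA READ or RDMA WRITE
+ *	  -> LOCAL_INV (revoke the rkey when the RPC completes)
+ *
+ * Both work requests complete asynchronously on the send CQ.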
+ */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, + unsigned int depth) +{ + struct rpcrdma_frmr *f = &r->r.frmr; + int rc; + + f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); + if (IS_ERR(f->fr_pgl)) + goto out_list_err; + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", + __func__, rc); + ib_dereg_mr(f->fr_mr); + return rc; +} + +static void +__frwr_release(struct rpcrdma_mw *r) +{ + int rc; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr status %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); +} + +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr *devattr = &ia->ri_devattr; + int depth, delta; + + ia->ri_max_frmr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + devattr->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frmr_depth); + + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + } + + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { + cdata->max_requests = devattr->max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + + return 0; +} + +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. + */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); +} + +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. 
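+ * Only wr_id and status are reliable in a flushed completion, so
+ * the handler just marks the MW stale; ro_reset then rebuilds the
+ * stale MRs after the next disconnect.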
*/ +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r; + + if (likely(wc->status == IB_WC_SUCCESS)) + return; + + /* WARNING: Only wr_id and status are reliable at this point */ + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + dprintk("RPC: %s: frmr %p (stale), status %d\n", + __func__, r, wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; +} + +static int +frwr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + int i; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); + + while (i--) { + struct rpcrdma_mw *r; + int rc; + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + rc = __frwr_init(r, pd, device, depth); + if (rc) { + kfree(r); + return rc; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + r->mw_sendcompletion = frwr_sendcompletion; + } + + return 0; +} + +/* Post a FAST_REG Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + int seg_len; + u64 pa; + int page_no; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > ia->ri_max_frmr_depth) + nsegs = ia->ri_max_frmr_depth; + for (page_no = i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + pa = seg->mr_dma; + for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { + frmr->fr_pgl->page_list[page_no++] = pa; + pa += PAGE_SIZE; + } + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", + __func__, mw, i, len); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.length = len; + fastreg_wr.wr.fast_reg.access_flags = writing ? 
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + if (rc) + goto out_senderr; + + seg1->mr_rkey = mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_senderr: + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + ib_update_fast_reg_key(mr, --key); + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Post a LOCAL_INV Work Request to prevent further remote access + * via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc, nsegs = seg->mr_nsegs; + struct ib_device *device; + + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + /* Force rpcrdma_buffer_get() to retry */ + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +frwr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) { + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + __frwr_release(r); + rc = __frwr_init(r, pd, device, depth); + if (rc) { + dprintk("RPC: %s: mw %p left %s\n", + __func__, r, + (r->r.frmr.fr_state == FRMR_IS_STALE ? 
+ "stale" : "valid")); + continue; + } + + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +static void +frwr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + __frwr_release(r); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, + .ro_unmap = frwr_op_unmap, + .ro_open = frwr_op_open, + .ro_maxpages = frwr_op_maxpages, + .ro_init = frwr_op_init, + .ro_reset = frwr_op_reset, + .ro_destroy = frwr_op_destroy, + .ro_displayname = "frwr", +}; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c new file mode 100644 index 000000000000..ba518af16787 --- /dev/null +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* No-op chunk preparation. All client memory is pre-registered. + * Sometimes referred to as ALLPHYSICAL mode. + * + * Physical registration is simple because all client memory is + * pre-registered and never deregistered. This mode is good for + * adapter bring up, but is considered not safe: the server is + * trusted not to abuse its access to client memory not involved + * in RDMA I/O. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* PHYSICAL memory registration conveys one page per chunk segment. + */ +static size_t +physical_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt)); +} + +static int +physical_op_init(struct rpcrdma_xprt *r_xprt) +{ + return 0; +} + +/* The client's physical memory is already exposed for + * remote access via RDMA READ or RDMA WRITE. + */ +static int +physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + rpcrdma_map_one(ia->ri_id->device, seg, + rpcrdma_data_dir(writing)); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + return 1; +} + +/* Unmap a memory region, but leave it registered. 
+ */ +static int +physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + read_lock(&ia->ri_qplock); + rpcrdma_unmap_one(ia->ri_id->device, seg); + read_unlock(&ia->ri_qplock); + + return 1; +} + +static void +physical_op_reset(struct rpcrdma_xprt *r_xprt) +{ +} + +static void +physical_op_destroy(struct rpcrdma_buffer *buf) +{ +} + +const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_map = physical_op_map, + .ro_unmap = physical_op_unmap, + .ro_open = physical_op_open, + .ro_maxpages = physical_op_maxpages, + .ro_init = physical_op_init, + .ro_reset = physical_op_reset, + .ro_destroy = physical_op_destroy, + .ro_displayname = "physical", +}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 91ffde82fa0c..2c53ea9e1b83 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,6 +53,14 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_write_array *warray = NULL; struct rpcrdma_write_chunk *cur_wchunk = NULL; __be32 *iptr = headerp->rm_body.rm_chunks; + int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); if (type == rpcrdma_readch || type == rpcrdma_areadch) { /* a read chunk - server will RDMA Read our memory */ @@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (nsegs < 0) return nsegs; + map = r_xprt->rx_ia.ri_ops->ro_map; do { - n = rpcrdma_register_external(seg, nsegs, - cur_wchunk != NULL, r_xprt); + n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); if (n <= 0) goto out; if (cur_rchunk) { /* read */ @@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); - } - return n; -} + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + return n; -/* - * Marshal chunks. This routine returns the header length - * consumed by marshaling. - * - * Returns positive RPC/RDMA header size, or negative errno. - */ - -ssize_t -rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) -{ - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); - - if (req->rl_rtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, req->rl_rtype); - else if (req->rl_wtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, req->rl_wtype); - return result; + for (pos = 0; nchunks--;) + pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[pos]); + return n; } /* @@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; + enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. 
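 *
 * The decision below reduces to roughly:
 *
 *	rq_rcv_buf fits inline		-> rpcrdma_noch
 *	no page list in rq_rcv_buf	-> rpcrdma_replych
 *	XDRBUF_READ set			-> rpcrdma_writech
 *	otherwise			-> rpcrdma_replych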
*/ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - req->rl_wtype = rpcrdma_noch; + wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - req->rl_wtype = rpcrdma_writech; + wtype = rpcrdma_writech; else - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - req->rl_rtype = rpcrdma_noch; + rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - req->rl_rtype = rpcrdma_areadch; + rtype = rpcrdma_areadch; else - req->rl_rtype = rpcrdma_readch; + rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) - req->rl_wtype = rpcrdma_noch; - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (req->rl_rtype == rpcrdma_noch) { + if (rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (req->rl_wtype != rpcrdma_noch) { + if (wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. 
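 *
 * The final dispatch is then (sketch):
 *
 *	rtype != noch  -> marshal read chunks from rq_snd_buf
 *	wtype != noch  -> marshal a write/reply chunk for rq_rcv_buf
 *
 * with at most one of the two chunk lists per request.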
*/ - if (req->rl_wtype == rpcrdma_noch) - req->rl_wtype = rpcrdma_replych; + if (wtype == rpcrdma_noch) + wtype = rpcrdma_replych; } } - hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, wtype); + } if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2e192baa59f3..54f23b1be986 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = { static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + +static void xprt_rdma_format_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = (struct sockaddr *) &rpcx_to_rdmad(xprt).addr; - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - char buf[64]; + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); @@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); - xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; - - snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); - xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); - snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - /* netid */ - xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } static void @@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_connect_worker); xprt_rdma_format_addresses(xprt); - xprt->max_payload = rpcrdma_max_payload(new_xprt); + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", __func__, xprt->max_payload); @@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { 
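		/* Each ro_unmap call returns the number of segments it
		 * just invalidated, so i advances one chunk at a time
		 * while rl_nchunks counts down.
		 */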
--req->rl_nchunks; - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt); + i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[i]); } rpcrdma_buffer_put(req); @@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; - if (req->rl_niovs == 0) - rc = rpcrdma_marshal_req(rqst); - else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL) - rc = rpcrdma_marshal_chunks(rqst, 0); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e28909fddd30..4870d272e006 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -50,6 +50,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/prefetch.h> +#include <linux/sunrpc/addr.h> #include <asm/bitops.h> #include "xprt_rdma.h" @@ -62,9 +63,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); -static void rpcrdma_reset_fmrs(struct rpcrdma_ia *); - /* * internal functions */ @@ -188,7 +186,7 @@ static const char * const wc_status[] = { "remote access error", "remote operation error", "transport retry counter exceeded", - "RNR retrycounter exceeded", + "RNR retry counter exceeded", "local RDD violation error", "remove invalid RD request", "operation aborted", @@ -206,21 +204,17 @@ static const char * const wc_status[] = { static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - if (likely(wc->status == IB_WC_SUCCESS)) - return; - /* WARNING: Only wr_id and status are reliable at this point */ - if (wc->wr_id == 0ULL) { - if (wc->status != IB_WC_WR_FLUSH_ERR) + if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) { + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: SEND: %s\n", __func__, COMPLETION_MSG(wc->status)); } else { struct rpcrdma_mw *r; r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - r->r.frmr.fr_state = FRMR_IS_STALE; - pr_err("RPC: %s: frmr %p (stale): %s\n", - __func__, r, COMPLETION_MSG(wc->status)); + r->mw_sendcompletion(wc); } } @@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; + struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif struct ib_qp_attr *attr = &ia->ri_qp_attr; struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; @@ -480,9 +474,8 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", - __func__, &addr->sin_addr.s_addr, - ntohs(addr->sin_port), ep, + dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), ep, CONNECTION_MSG(event->event)); break; } @@ -491,19 +484,16 @@ connected: if (connstate == 1) { int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; - printk(KERN_INFO "rpcrdma: connection to %pI4:%u " - "on %s, memreg %d slots %d ird %d%s\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), + + pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", + sap, rpc_get_port(sap), ia->ri_id->device->name, - ia->ri_memreg_strategy, + ia->ri_ops->ro_displayname, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? 
" (low!)" : ""); } else if (connstate < 0) { - printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - connstate); + pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", + sap, rpc_get_port(sap), connstate); } #endif @@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ - if ((devattr->device_cap_flags & + if (((devattr->device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; - } else { - /* Mind the ia limit on FRMR page list depth */ - ia->ri_max_frmr_depth = min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); } } if (memreg == RPCRDMA_MTHCAFMR) { @@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) */ switch (memreg) { case RPCRDMA_FRMR: + ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: + ia->ri_ops = &rpcrdma_physical_memreg_ops; mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; case RPCRDMA_MTHCAFMR: + ia->ri_ops = &rpcrdma_fmr_memreg_ops; if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; @@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rc = -ENOMEM; goto out3; } - dprintk("RPC: %s: memory registration strategy is %d\n", - __func__, memreg); + dprintk("RPC: %s: memory registration strategy is '%s'\n", + __func__, ia->ri_ops->ro_displayname); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; - /* send_cq and recv_cq initialized below */ ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: { - int depth = 7; - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail - * 7. The RDMA_SEND WR - */ - - /* Calculate N if the device max FRMR depth is smaller than - * RPCRDMA_MAX_DATA_SEGS. - */ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - int delta = RPCRDMA_MAX_DATA_SEGS - - ia->ri_max_frmr_depth; - - do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; - } while (delta > 0); - - } - ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; - if (!cdata->max_requests) - return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; - } - break; - } - default: - break; - } + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_send_sge = (cdata->padding ? 
4 : 2); ep->rep_attr.cap.max_recv_sge = 1; @@ -944,21 +895,9 @@ retry: rpcrdma_ep_disconnect(ep, ia); rpcrdma_flush_cqs(ep); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_reset_frmrs(ia); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_reset_fmrs(ia); - break; - case RPCRDMA_ALLPHYSICAL: - break; - default: - rc = -EIO; - goto out; - } - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + ia->ri_ops->ro_reset(xprt); + id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { @@ -1123,91 +1062,6 @@ out: return ERR_PTR(rc); } -static int -rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_DATA_SEGS, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - - r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr failed %i\n", - __func__, rc); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_free: - kfree(r); - return rc; -} - -static int -rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_frmr *f; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - f = &r->r.frmr; - - f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr " - "failed %i\n", __func__, rc); - goto out_free; - } - - f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_pgl)) { - rc = PTR_ERR(f->fr_pgl); - dprintk("RPC: %s: ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(f->fr_mr); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; - -out_free: - kfree(r); - return rc; -} - int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { @@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = rpcrdma_init_frmrs(ia, buf); - if (rc) - goto out; - break; - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_init_fmrs(ia, buf); - if (rc) - goto out; - break; - default: - break; - } + rc = ia->ri_ops->ro_init(r_xprt); + if (rc) + goto out; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; @@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } -static void -rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s: 
ib_dealloc_fmr failed %i\n", - __func__, rc); - - kfree(r); - } -} - -static void -rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - kfree(r); - } -} - void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { @@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); } - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_destroy_frmrs(buf); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_destroy_fmrs(buf); - break; - default: - break; - } + ia->ri_ops->ro_destroy(buf); kfree(buf->rb_pool); } -/* After a disconnect, unmap all FMRs. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_fmr_external(). - */ -static void -rpcrdma_reset_fmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - LIST_HEAD(l); - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - INIT_LIST_HEAD(&l); - list_add(&r->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - if (rc) - dprintk("RPC: %s: ib_unmap_fmr failed %i\n", - __func__, rc); - } -} - -/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in - * an unusable state. Find FRMRs in this state and dereg / reg - * each. FRMRs that are VALID and attached to an rpcrdma_req are - * also torn down. - * - * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_frmr_external(). - */ -static void -rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - if (r->r.frmr.fr_state == FRMR_IS_INVALID) - continue; - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - continue; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - continue; - } - r->r.frmr.fr_state = FRMR_IS_INVALID; - } -} - /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving * some req segments uninitialized. */ @@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) } } -/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). +/* rpcrdma_unmap_one() was already done during deregistration. 
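 * (The MW typically went stale when a FAST_REG or LOCAL_INV was
 * flushed by a connection loss; the DMA unmap already happened.)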
* Redo only the ib_post_send(). */ static void @@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ +void +rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) +{ + dprintk("RPC: map_one: offset %p iova %llx len %zu\n", + seg->mr_offset, + (unsigned long long)seg->mr_dma, seg->mr_dmalen); +} + static int rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, struct ib_mr **mrp, struct ib_sge *iov) @@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) } /* - * Wrappers for chunk registration, shared by read/write chunk code. - */ - -static void -rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) -{ - seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - seg->mr_dmalen = seg->mr_len; - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(ia->ri_id->device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(ia->ri_id->device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { - dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", - __func__, - (unsigned long long)seg->mr_dma, - seg->mr_offset, seg->mr_dmalen); - } -} - -static void -rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - -static int -rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr fastreg_wr, *bad_wr; - u8 key; - int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > ia->ri_max_frmr_depth) - *nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, mw, i); - - frmr->fr_state = FRMR_IS_VALID; - - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (fastreg_wr.wr.fast_reg.length < len) { - rc = -EIO; - goto out_err; - } - - /* Bump the key */ - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); - - fastreg_wr.wr.fast_reg.access_flags = (writing ? 
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); - if (rc) { - dprintk("RPC: %s: failed ib_post_send for register," - " status %i\n", __func__, rc); - ib_update_fast_reg_key(mr, --key); - goto out_err; - } else { - seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return 0; -out_err: - frmr->fr_state = FRMR_IS_INVALID; - while (i--) - rpcrdma_unmap_one(ia, --seg); - return rc; -} - -static int -rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* Force rpcrdma_buffer_get() to retry */ - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: failed ib_post_send for invalidate," - " status %i\n", __func__, rc); - } - return rc; -} - -static int -rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff, i, rc; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... 
status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - LIST_HEAD(l); - int rc; - - list_add(&seg1->rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - read_unlock(&ia->ri_qplock); - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); - return rc; -} - -int -rpcrdma_register_external(struct rpcrdma_mr_seg *seg, - int nsegs, int writing, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc = 0; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - rpcrdma_map_one(ia, seg, writing); - seg->mr_rkey = ia->ri_bind_mem->rkey; - seg->mr_base = seg->mr_dma; - seg->mr_nsegs = 1; - nsegs = 1; - break; - - /* Registration using frmr registration */ - case RPCRDMA_FRMR: - rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Registration using fmr memory registration */ - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); - break; - - default: - return -EIO; - } - if (rc) - return rc; - - return nsegs; -} - -int -rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int nsegs = seg->mr_nsegs, rc; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - read_lock(&ia->ri_qplock); - rpcrdma_unmap_one(ia, seg); - read_unlock(&ia->ri_qplock); - break; - - case RPCRDMA_FRMR: - rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); - break; - - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_deregister_fmr_external(seg, ia); - break; - - default: - break; - } - return nsegs; -} - -/* * Prepost any receive buffer, then post send. * * Receive buffer is donated to hardware, reclaimed upon recv completion. @@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, } send_wr.next = NULL; - send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; send_wr.sg_list = req->rl_send_iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; @@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } -/* Physical mapping means one Read/Write list entry per-page. - * All list entries must fit within an inline buffer - * - * NB: The server must return a Write list for NFS READ, - * which has the same constraint. Factor in the inline - * rsize as well. +/* How many chunk list items fit within our inline buffers? 
*/ -static size_t -rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +unsigned int +rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - unsigned int inline_size, pages; + int bytes, segments; - inline_size = min_t(unsigned int, - cdata->inline_wsize, cdata->inline_rsize); - inline_size -= RPCRDMA_HDRLEN_MIN; - pages = inline_size / sizeof(struct rpcrdma_segment); - return pages << PAGE_SHIFT; -} - -static size_t -rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) -{ - return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; -} - -size_t -rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) -{ - size_t result; - - switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_ALLPHYSICAL: - result = rpcrdma_physical_max_payload(r_xprt); - break; - default: - result = rpcrdma_mr_max_payload(r_xprt); + bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); + bytes -= RPCRDMA_HDRLEN_MIN; + if (bytes < sizeof(struct rpcrdma_segment) * 2) { + pr_warn("RPC: %s: inline threshold too small\n", + __func__); + return 0; } - return result; + + segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); + dprintk("RPC: %s: max chunk list size = %d segments\n", + __func__, segments); + return segments; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0a16fb6f0885..78e0b8beaa36 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -60,6 +60,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + const struct rpcrdma_memreg_ops *ri_ops; rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -105,6 +106,10 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +/* Force completion handler to ignore the signal + */ +#define RPCRDMA_IGNORE_COMPLETION (0ULL) + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. 
It needs several pieces of @@ -213,6 +210,7 @@ struct rpcrdma_mw { struct ib_fmr *fmr; struct rpcrdma_frmr frmr; } r; + void (*mw_sendcompletion)(struct ib_wc *); struct list_head mw_list; struct list_head mw_all; }; @@ -258,7 +256,6 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ - enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[4]; /* for active requests */ @@ -340,6 +337,29 @@ struct rpcrdma_stats { }; /* + * Per-registration mode operations + */ +struct rpcrdma_xprt; +struct rpcrdma_memreg_ops { + int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool); + int (*ro_unmap)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); + size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init)(struct rpcrdma_xprt *); + void (*ro_reset)(struct rpcrdma_xprt *); + void (*ro_destroy)(struct rpcrdma_buffer *); + const char *ro_displayname; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; + +/* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. * @@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_register_external(struct rpcrdma_mr_seg *, - int, int, struct rpcrdma_xprt *); -int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +static inline void +rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, + enum dma_data_direction direction) +{ + seg->mr_dir = direction; + seg->mr_dmalen = seg->mr_len; + + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + + if (ib_dma_mapping_error(device, seg->mr_dma)) + rpcrdma_mapping_error(seg); +} + +static inline void +rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ @@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ -ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); -size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. 
Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 433f287ee548..5266ea7b922b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -305,7 +305,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
-		if (dentry && dentry->d_inode == i) {
+		if (dentry && d_backing_inode(dentry) == i) {
 			sock_hold(s);
 			goto found;
 		}
@@ -778,7 +778,7 @@ static struct sock *unix_find_other(struct net *net,
 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 		if (err)
 			goto fail;
-		inode = path.dentry->d_inode;
+		inode = d_backing_inode(path.dentry);
 		err = inode_permission(inode, MAY_WRITE);
 		if (err)
 			goto put_fail;
@@ -839,7 +839,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 	 */
 	err = security_path_mknod(&path, dentry, mode, 0);
 	if (!err) {
-		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
 		if (!err) {
 			res->mnt = mntget(path.mnt);
 			res->dentry = dget(dentry);
@@ -905,7 +905,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			goto out_up;
 		}
 		addr->hash = UNIX_HASH_SIZE;
-		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
 		spin_lock(&unix_table_lock);
 		u->path = path;
 		list = &unix_socket_table[hash];
diff --git a/net/unix/diag.c b/net/unix/diag.c
index ef542fbca9fe..c512f64d5287 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
 	if (dentry) {
 		struct unix_diag_vfs uv = {
-			.udiag_vfs_ino = dentry->d_inode->i_ino,
+			.udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
 			.udiag_vfs_dev = dentry->d_sb->s_dev,
 		};
diff --git a/scripts/check_extable.sh b/scripts/check_extable.sh
new file mode 100755
index 000000000000..0fb6b1c97c27
--- /dev/null
+++ b/scripts/check_extable.sh
@@ -0,0 +1,146 @@
+#! /bin/bash
+# (c) 2015, Quentin Casasnovas <quentin.casasnovas@oracle.com>
+
+obj=$1
+
+file ${obj} | grep -q ELF || (echo "${obj} is not an ELF file." 1>&2 ; exit 0)
+
+# Bail out early if there isn't an __ex_table section in this object file.
+objdump -hj __ex_table ${obj} 2> /dev/null > /dev/null
+[ $? -ne 0 ] && exit 0
+
+white_list=.text,.fixup
+
+suspicious_relocs=$(objdump -rj __ex_table ${obj} | tail -n +6 |
+			grep -v $(eval echo -e{${white_list}}) | awk '{print $3}')
+
+# No suspicious relocs in __ex_table, job's a good'un
+[ -z "${suspicious_relocs}" ] && exit 0
+
+
+# After this point, something is seriously wrong since we just found out we
+# have some relocations in __ex_table which point to sections which aren't
+# white listed.  If you're adding a new section in the Linux kernel, and
+# you're expecting this section to contain code which can fault (i.e. the
+# __ex_table relocation to your new section is expected), simply add your
+# new section to the white_list variable above.  If not, you're probably
+# doing something wrong and the rest of this code is just trying to print
+# you more information about it.
+
+function find_section_offset_from_symbol()
+{
+	eval $(objdump -t ${obj} | grep ${1} | sed 's/\([0-9a-f]\+\) .\{7\} \([^ \t]\+\).*/section="\2"; section_offset="0x\1" /')
+
+	# addr2line takes addresses in hexadecimal...
+	section_offset=$(printf "0x%016x" $(( ${section_offset} + $2 )) )
+}
+
+function find_symbol_and_offset_from_reloc()
+{
+	# Extract symbol and offset from the objdump output
+	eval $(echo $reloc | sed 's/\([^+]\+\)+\?\(0x[0-9a-f]\+\)\?/symbol="\1"; symbol_offset="\2"/')
+
+	# When the relocation points to the beginning of a symbol or section, it
+	# won't print the offset since it is zero.
+	if [ -z "${symbol_offset}" ]; then
+		symbol_offset=0x0
+	fi
+}
+
+function find_alt_replacement_target()
+{
+	# The target of the .altinstr_replacement is the relocation just before
+	# the .altinstr_replacement one.
+	eval $(objdump -rj .altinstructions ${obj} | grep -B1 "${section}+${section_offset}" | head -n1 | awk '{print $3}' |
+		sed 's/\([^+]\+\)+\(0x[0-9a-f]\+\)/alt_target_section="\1"; alt_target_offset="\2"/')
+}
+
+function handle_alt_replacement_reloc()
+{
+	# This will define alt_target_section and alt_target_offset
+	find_alt_replacement_target ${section} ${section_offset}
+
+	echo "Error: found a reference to .altinstr_replacement in __ex_table:"
+	addr2line -fip -j ${alt_target_section} -e ${obj} ${alt_target_offset} | awk '{print "\t" $0}'
+
+	error=true
+}
+
+function is_executable_section()
+{
+	objdump -hwj ${section} ${obj} | grep -q CODE
+	return $?
+}
+
+function handle_suspicious_generic_reloc()
+{
+	if is_executable_section ${section}; then
+		# We've got a relocation to a non white listed _executable_
+		# section, print a warning so the developer adds the section to
+		# the white list or fixes his code.  We try to pretty-print the
+		# file and line number where that relocation was added.
+		echo "Warning: found a reference to section \"${section}\" in __ex_table:"
+		addr2line -fip -j ${section} -e ${obj} ${section_offset} | awk '{print "\t" $0}'
+	else
+		# Something is definitely wrong here since we've got a relocation
+		# to a non-executable section; there's no way this would ever be
+		# running in the kernel.
+		echo "Error: found a reference to non-executable section \"${section}\" in __ex_table at offset ${section_offset}"
+		error=true
+	fi
+}
+
+function handle_suspicious_reloc()
+{
+	case "${section}" in
+	".altinstr_replacement")
+		handle_alt_replacement_reloc ${section} ${section_offset}
+		;;
+	*)
+		handle_suspicious_generic_reloc ${section} ${section_offset}
+		;;
+	esac
+}
+
+function diagnose()
+{
+
+	for reloc in ${suspicious_relocs}; do
+		# Let's find out where the target of the relocation in __ex_table
+		# is; this will define ${symbol} and ${symbol_offset}
+		find_symbol_and_offset_from_reloc ${reloc}
+
+		# When there's a global symbol at the place of the relocation,
+		# objdump will use it instead of giving us a section+offset, so
+		# let's find out which section this symbol is in and the total
+		# offset within that section.
+		find_section_offset_from_symbol ${symbol} ${symbol_offset}
+
+		# In this case objdump was presenting us with a reloc to a symbol
+		# rather than a section.  Now that we've got the actual section,
+		# we can skip it if it's in the white_list.
+		if [ -z "$( echo $section | grep -v $(eval echo -e{${white_list}}))" ]; then
+			continue;
+		fi
+
+		# Will either print a warning if the relocation happens to be in a
+		# section we do not know but has executable bit set, or error out.
+		handle_suspicious_reloc
+	done
+}
+
+function check_debug_info() {
+	objdump -hj .debug_info ${obj} 2> /dev/null > /dev/null ||
+		echo -e "${obj} does not contain debug information; the addr2line output will be limited.\n" \
+			"Recompile ${obj} with CONFIG_DEBUG_INFO to get a more useful output."
+}
+
+check_debug_info
+
+diagnose
+
+if [ "${error}" ]; then
+	exit 1
+fi
+
+exit 0
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index d439856f8176..91ee1b2e0f9a 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -776,6 +776,7 @@ static const char *sech_name(struct elf_info *elf, Elf_Shdr *sechdr)
  * "foo" will match an exact string equal to "foo"
  * "*foo" will match a string that ends with "foo"
  * "foo*" will match a string that begins with "foo"
+ * "*foo*" will match a string that contains "foo"
  */
 static int match(const char *sym, const char * const pat[])
 {
@@ -784,8 +785,17 @@ static int match(const char *sym, const char * const pat[])
 		p = *pat++;
 		const char *endp = p + strlen(p) - 1;
 
+		/* "*foo*" */
+		if (*p == '*' && *endp == '*') {
+			char *here, *bare = strndup(p + 1, strlen(p) - 2);
+
+			here = strstr(sym, bare);
+			free(bare);
+			if (here != NULL)
+				return 1;
+		}
 		/* "*foo" */
-		if (*p == '*') {
+		else if (*p == '*') {
 			if (strrcmp(sym, p + 1) == 0)
 				return 1;
 		}
@@ -873,7 +883,10 @@ static void check_section(const char *modname, struct elf_info *elf,
 #define ALL_EXIT_SECTIONS EXIT_SECTIONS, ALL_XXXEXIT_SECTIONS
 
 #define DATA_SECTIONS ".data", ".data.rel"
-#define TEXT_SECTIONS ".text", ".text.unlikely"
+#define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \
+		".kprobes.text"
+#define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \
+		".fixup", ".entry.text", ".exception.text", ".text.*"
 
 #define INIT_SECTIONS ".init.*"
 #define MEM_INIT_SECTIONS ".meminit.*"
@@ -881,6 +894,9 @@ static void check_section(const char *modname, struct elf_info *elf,
 #define EXIT_SECTIONS ".exit.*"
 #define MEM_EXIT_SECTIONS ".memexit.*"
 
+#define ALL_TEXT_SECTIONS ALL_INIT_TEXT_SECTIONS, ALL_EXIT_TEXT_SECTIONS, \
+		TEXT_SECTIONS, OTHER_TEXT_SECTIONS
+
 /* init data sections */
 static const char *const init_data_sections[] = { ALL_INIT_DATA_SECTIONS, NULL };
@@ -892,6 +908,9 @@ static const char *const init_sections[] = { ALL_INIT_SECTIONS, NULL };
 static const char *const init_exit_sections[] =
 	{ALL_INIT_SECTIONS, ALL_EXIT_SECTIONS, NULL };
 
+/* all text sections */
+static const char *const text_sections[] = { ALL_TEXT_SECTIONS, NULL };
+
 /* data section */
 static const char *const data_sections[] = { DATA_SECTIONS, NULL };
@@ -910,6 +929,7 @@ static const char *const data_sections[] = { DATA_SECTIONS, NULL };
 static const char *const head_sections[] = { ".head.text*", NULL };
 static const char *const linker_symbols[] =
 	{ "__init_begin", "_sinittext", "_einittext", NULL };
+static const char *const optim_symbols[] = { "*.constprop.*", NULL };
 
 enum mismatch {
 	TEXT_TO_ANY_INIT,
@@ -921,34 +941,65 @@ enum mismatch {
 	ANY_INIT_TO_ANY_EXIT,
 	ANY_EXIT_TO_ANY_INIT,
 	EXPORT_TO_INIT_EXIT,
+	EXTABLE_TO_NON_TEXT,
 };
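The new "*foo*" arm of match(), together with the optim_symbols pattern just above, is what later lets secref_whitelist() recognize GCC's constant-propagation clones (Pattern 5 below). The following standalone sketch is illustrative only; it uses a simplified strrcmp() rather than modpost's, but exercises all four glob forms:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for modpost's strrcmp(): 0 if `s` ends with `sub`. */
static int strrcmp(const char *s, const char *sub)
{
	size_t slen = strlen(s), sublen = strlen(sub);

	if (sublen == 0 || sublen > slen)
		return 1;
	return memcmp(s + slen - sublen, sub, sublen);
}

/* Same glob rules as the hunk above: "foo", "*foo", "foo*" and "*foo*". */
static int match(const char *sym, const char *const pat[])
{
	const char *p;

	while (*pat) {
		p = *pat++;
		const char *endp = p + strlen(p) - 1;

		if (*p == '*' && *endp == '*') {	/* "*foo*": contains */
			char *bare = strndup(p + 1, strlen(p) - 2);
			int hit = strstr(sym, bare) != NULL;

			free(bare);
			if (hit)
				return 1;
		} else if (*p == '*') {			/* "*foo": ends with */
			if (strrcmp(sym, p + 1) == 0)
				return 1;
		} else if (*endp == '*') {		/* "foo*": begins with */
			if (strncmp(sym, p, strlen(p) - 1) == 0)
				return 1;
		} else {				/* "foo": exact match */
			if (strcmp(p, sym) == 0)
				return 1;
		}
	}
	return 0;
}

int main(void)
{
	static const char *const optim_symbols[] = { "*.constprop.*", NULL };

	/* The constant-propagated clone from Pattern 5 is matched (1),
	 * the plain symbol is not (0). */
	printf("%d\n", match("cpumask_empty.constprop.3", optim_symbols));
	printf("%d\n", match("cpumask_empty", optim_symbols));
	return 0;
}

Run on its own, this prints 1 then 0, which is exactly the distinction the Pattern 5 white-list relies on.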
+/**
+ * Describe how to match sections on different criteria:
+ *
+ * @fromsec: Array of sections to be matched.
+ *
+ * @bad_tosec: Relocations from a section in @fromsec to a section in
+ * this array are forbidden (black-list).  Can be empty.
+ *
+ * @good_tosec: Relocations applied to a section in @fromsec must be
+ * targeting sections in this array (white-list).  Can be empty.
+ *
+ * @mismatch: Type of mismatch.
+ *
+ * @symbol_white_list: Do not match a relocation to a symbol in this list
+ * even if it is targeting a section in @bad_tosec.
+ *
+ * @handler: Specific handler to call when a match is found.  If NULL,
+ * default_mismatch_handler() will be called.
+ *
+ */
 struct sectioncheck {
 	const char *fromsec[20];
-	const char *tosec[20];
+	const char *bad_tosec[20];
+	const char *good_tosec[20];
 	enum mismatch mismatch;
 	const char *symbol_white_list[20];
+	void (*handler)(const char *modname, struct elf_info *elf,
+			const struct sectioncheck* const mismatch,
+			Elf_Rela *r, Elf_Sym *sym, const char *fromsec);
+
 };
 
+static void extable_mismatch_handler(const char *modname, struct elf_info *elf,
+				     const struct sectioncheck* const mismatch,
+				     Elf_Rela *r, Elf_Sym *sym,
+				     const char *fromsec);
+
 static const struct sectioncheck sectioncheck[] = {
 /* Do not reference init/exit code/data from
  * normal code and data
  */
 {
 	.fromsec = { TEXT_SECTIONS, NULL },
-	.tosec = { ALL_INIT_SECTIONS, NULL },
+	.bad_tosec = { ALL_INIT_SECTIONS, NULL },
 	.mismatch = TEXT_TO_ANY_INIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 {
 	.fromsec = { DATA_SECTIONS, NULL },
-	.tosec = { ALL_XXXINIT_SECTIONS, NULL },
+	.bad_tosec = { ALL_XXXINIT_SECTIONS, NULL },
 	.mismatch = DATA_TO_ANY_INIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 {
 	.fromsec = { DATA_SECTIONS, NULL },
-	.tosec = { INIT_SECTIONS, NULL },
+	.bad_tosec = { INIT_SECTIONS, NULL },
 	.mismatch = DATA_TO_ANY_INIT,
 	.symbol_white_list = { "*_template", "*_timer", "*_sht", "*_ops",
@@ -957,56 +1008,66 @@ static const struct sectioncheck sectioncheck[] = {
 },
 {
 	.fromsec = { TEXT_SECTIONS, NULL },
-	.tosec = { ALL_EXIT_SECTIONS, NULL },
+	.bad_tosec = { ALL_EXIT_SECTIONS, NULL },
 	.mismatch = TEXT_TO_ANY_EXIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 {
 	.fromsec = { DATA_SECTIONS, NULL },
-	.tosec = { ALL_EXIT_SECTIONS, NULL },
+	.bad_tosec = { ALL_EXIT_SECTIONS, NULL },
 	.mismatch = DATA_TO_ANY_EXIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 /* Do not reference init code/data from meminit code/data */
 {
 	.fromsec = { ALL_XXXINIT_SECTIONS, NULL },
-	.tosec = { INIT_SECTIONS, NULL },
+	.bad_tosec = { INIT_SECTIONS, NULL },
 	.mismatch = XXXINIT_TO_SOME_INIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 /* Do not reference exit code/data from memexit code/data */
 {
 	.fromsec = { ALL_XXXEXIT_SECTIONS, NULL },
-	.tosec = { EXIT_SECTIONS, NULL },
+	.bad_tosec = { EXIT_SECTIONS, NULL },
 	.mismatch = XXXEXIT_TO_SOME_EXIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 /* Do not use exit code/data from init code */
 {
 	.fromsec = { ALL_INIT_SECTIONS, NULL },
-	.tosec = { ALL_EXIT_SECTIONS, NULL },
+	.bad_tosec = { ALL_EXIT_SECTIONS, NULL },
 	.mismatch = ANY_INIT_TO_ANY_EXIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 /* Do not use init code/data from exit code */
 {
 	.fromsec = { ALL_EXIT_SECTIONS, NULL },
-	.tosec = { ALL_INIT_SECTIONS, NULL },
+	.bad_tosec = { ALL_INIT_SECTIONS, NULL },
 	.mismatch = ANY_EXIT_TO_ANY_INIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
 },
 {
 	.fromsec = { ALL_PCI_INIT_SECTIONS, NULL },
-	.tosec = { INIT_SECTIONS, NULL },
+	.bad_tosec = { INIT_SECTIONS, NULL },
 	.mismatch = ANY_INIT_TO_ANY_EXIT,
 	.symbol_white_list = { NULL },
 },
 /* Do not export init/exit functions or data */
 {
 	.fromsec = { "__ksymtab*", NULL },
-	.tosec = { INIT_SECTIONS, EXIT_SECTIONS, NULL },
+	.bad_tosec = { INIT_SECTIONS, EXIT_SECTIONS, NULL },
 	.mismatch = EXPORT_TO_INIT_EXIT,
 	.symbol_white_list = { DEFAULT_SYMBOL_WHITE_LIST, NULL },
+},
+{
+	.fromsec = { "__ex_table", NULL },
+	/* If you're adding any new black-listed sections in here, consider
+	 * adding a special 'printer' for them in scripts/check_extable.
+	 */
+	.bad_tosec = { ".altinstr_replacement", NULL },
+	.good_tosec = { ALL_TEXT_SECTIONS, NULL },
+	.mismatch = EXTABLE_TO_NON_TEXT,
+	.handler = extable_mismatch_handler,
 }
 };
 
@@ -1017,10 +1078,22 @@ static const struct sectioncheck *section_mismatch(
 	int elems = sizeof(sectioncheck) / sizeof(struct sectioncheck);
 	const struct sectioncheck *check = &sectioncheck[0];
 
+	/*
+	 * The target section could be the SHT_NUL section when we're
+	 * handling relocations to un-resolved symbols, trying to match it
+	 * doesn't make much sense and causes build failures on parisc and
+	 * mn10300 architectures.
+	 */
+	if (*tosec == '\0')
+		return NULL;
+
 	for (i = 0; i < elems; i++) {
-		if (match(fromsec, check->fromsec) &&
-		    match(tosec, check->tosec))
-			return check;
+		if (match(fromsec, check->fromsec)) {
+			if (check->bad_tosec[0] && match(tosec, check->bad_tosec))
+				return check;
+			if (check->good_tosec[0] && !match(tosec, check->good_tosec))
+				return check;
+		}
 		check++;
 	}
 	return NULL;
@@ -1067,6 +1140,17 @@ static const struct sectioncheck *section_mismatch(
  * This pattern is identified by
  * refsymname = __init_begin, _sinittext, _einittext
  *
+ * Pattern 5:
+ *   GCC may optimize static inlines when fed constant arg(s) resulting
+ *   in functions like cpumask_empty() -- generating an associated symbol
+ *   cpumask_empty.constprop.3 that appears in the audit.  If the const that
+ *   is passed in comes from __init, like say nmi_ipi_mask, we get a
+ *   meaningless section warning.  May need to add isra symbols too...
+ *   This pattern is identified by
+ *   tosec = init section
+ *   fromsec = text section
+ *   refsymname = *.constprop.*
+ *
 **/
 static int secref_whitelist(const struct sectioncheck *mismatch,
			    const char *fromsec, const char *fromsym,
@@ -1099,6 +1183,12 @@ static int secref_whitelist(const struct sectioncheck *mismatch,
 	if (match(tosym, linker_symbols))
 		return 0;
 
+	/* Check for pattern 5 */
+	if (match(fromsec, text_sections) &&
+	    match(tosec, init_sections) &&
+	    match(fromsym, optim_symbols))
+		return 0;
+
 	return 1;
 }
 
@@ -1261,6 +1351,15 @@ static void print_section_list(const char * const list[20])
 	fprintf(stderr, "\n");
 }
 
+static inline void get_pretty_name(int is_func, const char** name, const char** name_p)
+{
+	switch (is_func) {
+	case 0: *name = "variable"; *name_p = ""; break;
+	case 1: *name = "function"; *name_p = "()"; break;
+	default: *name = "(unknown reference)"; *name_p = ""; break;
+	}
+}
+
 /*
  * Print a warning about a section mismatch.
  * Try to find symbols near it so user can find it.
@@ -1280,21 +1379,13 @@ static void report_sec_mismatch(const char *modname,
 	char *prl_from;
 	char *prl_to;
 
-	switch (from_is_func) {
-	case 0: from = "variable"; from_p = ""; break;
-	case 1: from = "function"; from_p = "()"; break;
-	default: from = "(unknown reference)"; from_p = ""; break;
-	}
-	switch (to_is_func) {
-	case 0: to = "variable"; to_p = ""; break;
-	case 1: to = "function"; to_p = "()"; break;
-	default: to = "(unknown reference)"; to_p = ""; break;
-	}
-
 	sec_mismatch_count++;
 	if (!sec_mismatch_verbose)
 		return;
 
+	get_pretty_name(from_is_func, &from, &from_p);
+	get_pretty_name(to_is_func, &to, &to_p);
+
 	warn("%s(%s+0x%llx): Section mismatch in reference from the %s %s%s "
 	     "to the %s %s:%s%s\n",
 	     modname, fromsec, fromaddr, from, fromsym, from_p, to, tosec,
@@ -1408,41 +1499,179 @@ static void report_sec_mismatch(const char *modname,
 		     tosym, prl_to, prl_to, tosym);
 		free(prl_to);
 		break;
+	case EXTABLE_TO_NON_TEXT:
+		fatal("There's a special handler for this mismatch type, "
+		      "we should never get here.");
+		break;
 	}
 	fprintf(stderr, "\n");
 }
 
-static void check_section_mismatch(const char *modname, struct elf_info *elf,
-				   Elf_Rela *r, Elf_Sym *sym, const char *fromsec)
+static void default_mismatch_handler(const char *modname, struct elf_info *elf,
+				     const struct sectioncheck* const mismatch,
+				     Elf_Rela *r, Elf_Sym *sym, const char *fromsec)
 {
 	const char *tosec;
-	const struct sectioncheck *mismatch;
+	Elf_Sym *to;
+	Elf_Sym *from;
+	const char *tosym;
+	const char *fromsym;
+
+	from = find_elf_symbol2(elf, r->r_offset, fromsec);
+	fromsym = sym_name(elf, from);
+
+	if (!strncmp(fromsym, "reference___initcall",
+		     sizeof("reference___initcall")-1))
+		return;
 
 	tosec = sec_name(elf, get_secindex(elf, sym));
-	mismatch = section_mismatch(fromsec, tosec);
+	to = find_elf_symbol(elf, r->r_addend, sym);
+	tosym = sym_name(elf, to);
+
+	/* check whitelist - we may ignore it */
+	if (secref_whitelist(mismatch,
+			     fromsec, fromsym, tosec, tosym)) {
+		report_sec_mismatch(modname, mismatch,
+				    fromsec, r->r_offset, fromsym,
+				    is_function(from), tosec, tosym,
+				    is_function(to));
+	}
+}
+
+static int is_executable_section(struct elf_info* elf, unsigned int section_index)
+{
+	if (section_index > elf->num_sections)
+		fatal("section_index is outside elf->num_sections!\n");
+
+	return ((elf->sechdrs[section_index].sh_flags & SHF_EXECINSTR) == SHF_EXECINSTR);
+}
+
+/*
+ * We rely on a gross hack in section_rel[a]() calling find_extable_entry_size()
+ * to know the sizeof(struct exception_table_entry) for the target architecture.
+ */
+static unsigned int extable_entry_size = 0;
+static void find_extable_entry_size(const char* const sec, const Elf_Rela* r)
+{
+	/*
+	 * If we're currently checking the second relocation within __ex_table,
+	 * that relocation offset tells us the offsetof(struct
+	 * exception_table_entry, fixup) which is equal to sizeof(struct
+	 * exception_table_entry) divided by two.  We use that to our advantage
+	 * since there's no portable way to get that size as every architecture
+	 * seems to go with different sized types.  Not pretty but better than
+	 * hard-coding the size for every architecture.
+	 */
+	if (!extable_entry_size)
+		extable_entry_size = r->r_offset * 2;
+}
+
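To make the "divided by two" remark concrete, here is a small self-contained demo of the same arithmetic; the two-field 32-bit layout is only an assumption for illustration, since the real struct exception_table_entry differs between architectures:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative layout only: two equally sized fields, as on many arches. */
struct exception_table_entry {
	int32_t insn;
	int32_t fixup;
};

int main(void)
{
	/* Relocations land on every field of every entry, in order:
	 * entry0.insn @ 0, entry0.fixup @ 4, entry1.insn @ 8, ...
	 * The *second* relocation therefore sits at offsetof(..., fixup),
	 * and since both fields have the same width, doubling that offset
	 * recovers sizeof(struct exception_table_entry). */
	unsigned int second_reloc_offset =
		offsetof(struct exception_table_entry, fixup);
	unsigned int extable_entry_size = second_reloc_offset * 2;

	assert(extable_entry_size == sizeof(struct exception_table_entry));

	/* A fault address (an insn field) is then any relocation offset
	 * that is a multiple of the entry size, which is exactly what
	 * is_extable_fault_address() below tests. */
	printf("entry size = %u\n", extable_entry_size);
	return 0;
}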
+static inline bool is_extable_fault_address(Elf_Rela *r)
+{
+	/*
+	 * extable_entry_size is only discovered after we've handled the
+	 * _second_ relocation in __ex_table, so only abort when we're not
+	 * handling the first reloc and extable_entry_size is zero.
+	 */
+	if (r->r_offset && extable_entry_size == 0)
+		fatal("extable_entry size hasn't been discovered!\n");
+
+	return ((r->r_offset == 0) ||
+		(r->r_offset % extable_entry_size == 0));
+}
+
+#define is_second_extable_reloc(Start, Cur, Sec)			\
+	(((Cur) == (Start) + 1) && (strcmp("__ex_table", (Sec)) == 0))
+
+static void report_extable_warnings(const char* modname, struct elf_info* elf,
+				    const struct sectioncheck* const mismatch,
+				    Elf_Rela* r, Elf_Sym* sym,
+				    const char* fromsec, const char* tosec)
+{
+	Elf_Sym* fromsym = find_elf_symbol2(elf, r->r_offset, fromsec);
+	const char* fromsym_name = sym_name(elf, fromsym);
+	Elf_Sym* tosym = find_elf_symbol(elf, r->r_addend, sym);
+	const char* tosym_name = sym_name(elf, tosym);
+	const char* from_pretty_name;
+	const char* from_pretty_name_p;
+	const char* to_pretty_name;
+	const char* to_pretty_name_p;
+
+	get_pretty_name(is_function(fromsym),
+			&from_pretty_name, &from_pretty_name_p);
+	get_pretty_name(is_function(tosym),
+			&to_pretty_name, &to_pretty_name_p);
+
+	warn("%s(%s+0x%lx): Section mismatch in reference"
+	     " from the %s %s%s to the %s %s:%s%s\n",
+	     modname, fromsec, (long)r->r_offset, from_pretty_name,
+	     fromsym_name, from_pretty_name_p,
+	     to_pretty_name, tosec, tosym_name, to_pretty_name_p);
+
+	if (!match(tosec, mismatch->bad_tosec) &&
+	    is_executable_section(elf, get_secindex(elf, sym)))
+		fprintf(stderr,
+			"The relocation at %s+0x%lx references\n"
+			"section \"%s\" which is not in the list of\n"
+			"authorized sections.  If you're adding a new section\n"
+			"and/or if this reference is valid, add \"%s\" to the\n"
+			"list of authorized sections to jump to on fault.\n"
+			"This can be achieved by adding \"%s\" to\n"
+			"OTHER_TEXT_SECTIONS in scripts/mod/modpost.c.\n",
+			fromsec, (long)r->r_offset, tosec, tosec, tosec);
+}
+
+static void extable_mismatch_handler(const char* modname, struct elf_info *elf,
+				     const struct sectioncheck* const mismatch,
+				     Elf_Rela* r, Elf_Sym* sym,
+				     const char *fromsec)
+{
+	const char* tosec = sec_name(elf, get_secindex(elf, sym));
+
+	sec_mismatch_count++;
+
+	if (sec_mismatch_verbose)
+		report_extable_warnings(modname, elf, mismatch, r, sym,
+					fromsec, tosec);
+
+	if (match(tosec, mismatch->bad_tosec))
+		fatal("The relocation at %s+0x%lx references\n"
+		      "section \"%s\" which is black-listed.\n"
+		      "Something is seriously wrong and should be fixed.\n"
+		      "You might get more information about where this is\n"
+		      "coming from by using scripts/check_extable.sh %s\n",
+		      fromsec, (long)r->r_offset, tosec, modname);
+	else if (!is_executable_section(elf, get_secindex(elf, sym))) {
+		if (is_extable_fault_address(r))
+			fatal("The relocation at %s+0x%lx references\n"
+			      "section \"%s\" which is not executable, IOW\n"
+			      "it is not possible for the kernel to fault\n"
+			      "at that address.  Something is seriously wrong\n"
+			      "and should be fixed.\n",
+			      fromsec, (long)r->r_offset, tosec);
+		else
+			fatal("The relocation at %s+0x%lx references\n"
+			      "section \"%s\" which is not executable, IOW\n"
+			      "the kernel will fault if it ever tries to\n"
+			      "jump to it. 
Something is seriously wrong\n" + "and should be fixed.\n", + fromsec, (long)r->r_offset, tosec); + } +} + +static void check_section_mismatch(const char *modname, struct elf_info *elf, + Elf_Rela *r, Elf_Sym *sym, const char *fromsec) +{ + const char *tosec = sec_name(elf, get_secindex(elf, sym));; + const struct sectioncheck *mismatch = section_mismatch(fromsec, tosec); + if (mismatch) { - Elf_Sym *to; - Elf_Sym *from; - const char *tosym; - const char *fromsym; - - from = find_elf_symbol2(elf, r->r_offset, fromsec); - fromsym = sym_name(elf, from); - to = find_elf_symbol(elf, r->r_addend, sym); - tosym = sym_name(elf, to); - - if (!strncmp(fromsym, "reference___initcall", - sizeof("reference___initcall")-1)) - return; - - /* check whitelist - we may ignore it */ - if (secref_whitelist(mismatch, - fromsec, fromsym, tosec, tosym)) { - report_sec_mismatch(modname, mismatch, - fromsec, r->r_offset, fromsym, - is_function(from), tosec, tosym, - is_function(to)); - } + if (mismatch->handler) + mismatch->handler(modname, elf, mismatch, + r, sym, fromsec); + else + default_mismatch_handler(modname, elf, mismatch, + r, sym, fromsec); } } @@ -1582,6 +1811,8 @@ static void section_rela(const char *modname, struct elf_info *elf, /* Skip special sections */ if (is_shndx_special(sym->st_shndx)) continue; + if (is_second_extable_reloc(start, rela, fromsec)) + find_extable_entry_size(fromsec, &r); check_section_mismatch(modname, elf, &r, sym, fromsec); } } @@ -1640,6 +1871,8 @@ static void section_rel(const char *modname, struct elf_info *elf, /* Skip special sections */ if (is_shndx_special(sym->st_shndx)) continue; + if (is_second_extable_reloc(start, rel, fromsec)) + find_extable_entry_size(fromsec, &r); check_section_mismatch(modname, elf, &r, sym, fromsec); } } diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 7db9954f1af2..ad4fa49ad1db 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -365,7 +365,7 @@ void __aa_fs_profile_rmdir(struct aa_profile *profile) if (!profile->dents[i]) continue; - r = profile->dents[i]->d_inode->i_private; + r = d_inode(profile->dents[i])->i_private; securityfs_remove(profile->dents[i]); aa_put_replacedby(r); profile->dents[i] = NULL; diff --git a/security/apparmor/file.c b/security/apparmor/file.c index fdaa50cb1876..913f377a038a 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -259,7 +259,7 @@ unsigned int aa_str_perms(struct aa_dfa *dfa, unsigned int start, */ static inline bool is_deleted(struct dentry *dentry) { - if (d_unlinked(dentry) && dentry->d_inode->i_nlink == 0) + if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0) return 1; return 0; } @@ -351,8 +351,8 @@ int aa_path_link(struct aa_profile *profile, struct dentry *old_dentry, struct path link = { new_dir->mnt, new_dentry }; struct path target = { new_dir->mnt, old_dentry }; struct path_cond cond = { - old_dentry->d_inode->i_uid, - old_dentry->d_inode->i_mode + d_backing_inode(old_dentry)->i_uid, + d_backing_inode(old_dentry)->i_mode }; char *buffer = NULL, *buffer2 = NULL; const char *lname, *tname = NULL, *info = NULL; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index dd56bffd6500..e5f1561439db 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -204,8 +204,8 @@ static int common_perm_mnt_dentry(int op, struct vfsmount *mnt, struct dentry *dentry, u32 mask) { struct path path = { mnt, dentry }; - struct path_cond cond = { dentry->d_inode->i_uid, - 
dentry->d_inode->i_mode + struct path_cond cond = { d_backing_inode(dentry)->i_uid, + d_backing_inode(dentry)->i_mode }; return common_perm(op, &path, mask, &cond); @@ -223,7 +223,7 @@ static int common_perm_mnt_dentry(int op, struct vfsmount *mnt, static int common_perm_rm(int op, struct path *dir, struct dentry *dentry, u32 mask) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; if (!inode || !dir->mnt || !mediated_filesystem(dentry)) @@ -281,8 +281,8 @@ static int apparmor_path_mknod(struct path *dir, struct dentry *dentry, static int apparmor_path_truncate(struct path *path) { - struct path_cond cond = { path->dentry->d_inode->i_uid, - path->dentry->d_inode->i_mode + struct path_cond cond = { d_backing_inode(path->dentry)->i_uid, + d_backing_inode(path->dentry)->i_mode }; if (!path->mnt || !mediated_filesystem(path->dentry)) @@ -327,8 +327,8 @@ static int apparmor_path_rename(struct path *old_dir, struct dentry *old_dentry, if (!unconfined(profile)) { struct path old_path = { old_dir->mnt, old_dentry }; struct path new_path = { new_dir->mnt, new_dentry }; - struct path_cond cond = { old_dentry->d_inode->i_uid, - old_dentry->d_inode->i_mode + struct path_cond cond = { d_backing_inode(old_dentry)->i_uid, + d_backing_inode(old_dentry)->i_mode }; error = aa_path_perm(OP_RENAME_SRC, profile, &old_path, 0, @@ -354,8 +354,8 @@ static int apparmor_path_chmod(struct path *path, umode_t mode) static int apparmor_path_chown(struct path *path, kuid_t uid, kgid_t gid) { - struct path_cond cond = { path->dentry->d_inode->i_uid, - path->dentry->d_inode->i_mode + struct path_cond cond = { d_backing_inode(path->dentry)->i_uid, + d_backing_inode(path->dentry)->i_mode }; if (!mediated_filesystem(path->dentry)) diff --git a/security/commoncap.c b/security/commoncap.c index f66713bd7450..f2875cd9f677 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -297,7 +297,7 @@ static inline void bprm_clear_caps(struct linux_binprm *bprm) */ int cap_inode_need_killpriv(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); int error; if (!inode->i_op->getxattr) @@ -319,7 +319,7 @@ int cap_inode_need_killpriv(struct dentry *dentry) */ int cap_inode_killpriv(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); if (!inode->i_op->removexattr) return 0; @@ -375,7 +375,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, */ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; unsigned tocopy, i; int size; diff --git a/security/inode.c b/security/inode.c index 131a3c49f766..91503b79c5f8 100644 --- a/security/inode.c +++ b/security/inode.c @@ -27,7 +27,7 @@ static int mount_count; static inline int positive(struct dentry *dentry) { - return dentry->d_inode && !d_unhashed(dentry); + return d_really_is_positive(dentry) && !d_unhashed(dentry); } static int fill_super(struct super_block *sb, void *data, int silent) @@ -102,14 +102,14 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, if (!parent) parent = mount->mnt_root; - dir = parent->d_inode; + dir = d_inode(parent); mutex_lock(&dir->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) goto out; - if (dentry->d_inode) { + if 
(d_really_is_positive(dentry)) { error = -EEXIST; goto out1; } @@ -197,20 +197,20 @@ void securityfs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); if (positive(dentry)) { - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { if (d_is_dir(dentry)) - simple_rmdir(parent->d_inode, dentry); + simple_rmdir(d_inode(parent), dentry); else - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); dput(dentry); } } - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 5e9687f02e1b..159ef3ea4130 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -131,7 +131,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, size_t req_xattr_value_len, char type, char *digest) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); struct shash_desc *desc; char **xattrname; size_t xattr_size = 0; @@ -199,7 +199,7 @@ int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name, int evm_update_evmxattr(struct dentry *dentry, const char *xattr_name, const char *xattr_value, size_t xattr_value_len) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); struct evm_ima_xattr_data xattr_data; int rc = 0; diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index f589c9a05da2..10f994307a04 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -72,7 +72,7 @@ static void __init evm_init_config(void) static int evm_find_protected_xattrs(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); char **xattr; int error; int count = 0; @@ -165,8 +165,8 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry, /* Replace RSA with HMAC if not mounted readonly and * not immutable */ - if (!IS_RDONLY(dentry->d_inode) && - !IS_IMMUTABLE(dentry->d_inode)) + if (!IS_RDONLY(d_backing_inode(dentry)) && + !IS_IMMUTABLE(d_backing_inode(dentry))) evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); @@ -235,7 +235,7 @@ enum integrity_status evm_verifyxattr(struct dentry *dentry, return INTEGRITY_UNKNOWN; if (!iint) { - iint = integrity_iint_find(dentry->d_inode); + iint = integrity_iint_find(d_backing_inode(dentry)); if (!iint) return INTEGRITY_UNKNOWN; } @@ -253,7 +253,7 @@ EXPORT_SYMBOL_GPL(evm_verifyxattr); */ static enum integrity_status evm_verify_current_integrity(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); if (!evm_initialized || !S_ISREG(inode->i_mode) || evm_fixmode) return 0; @@ -293,13 +293,13 @@ static int evm_protect_xattr(struct dentry *dentry, const char *xattr_name, if (evm_status == INTEGRITY_NOXATTRS) { struct integrity_iint_cache *iint; - iint = integrity_iint_find(dentry->d_inode); + iint = integrity_iint_find(d_backing_inode(dentry)); if (iint && (iint->flags & IMA_NEW_FILE)) return 0; } out: if (evm_status != INTEGRITY_PASS) - integrity_audit_msg(AUDIT_INTEGRITY_METADATA, dentry->d_inode, + integrity_audit_msg(AUDIT_INTEGRITY_METADATA, 
d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); @@ -379,7 +379,7 @@ void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, */ void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); if (!evm_initialized || !evm_protected_xattr(xattr_name)) return; @@ -404,7 +404,7 @@ int evm_inode_setattr(struct dentry *dentry, struct iattr *attr) if ((evm_status == INTEGRITY_PASS) || (evm_status == INTEGRITY_NOXATTRS)) return 0; - integrity_audit_msg(AUDIT_INTEGRITY_METADATA, dentry->d_inode, + integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry), dentry->d_name.name, "appraise_metadata", integrity_status_msg[evm_status], -EPERM, 0); return -EPERM; diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index fffcdb0b31f0..4df493e4b3c9 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -165,7 +165,7 @@ void ima_get_hash_algo(struct evm_ima_xattr_data *xattr_value, int xattr_len, int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); if (!inode->i_op->getxattr) return 0; @@ -190,7 +190,7 @@ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint, static const char op[] = "appraise_data"; char *cause = "unknown"; struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len, hash_start = 0; @@ -314,7 +314,7 @@ void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file) */ void ima_inode_post_setattr(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); struct integrity_iint_cache *iint; int must_appraise, rc; @@ -380,7 +380,7 @@ int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; - ima_reset_appraise_flags(dentry->d_inode, + ima_reset_appraise_flags(d_backing_inode(dentry), (xvalue->type == EVM_IMA_XATTR_DIGSIG) ? 
1 : 0); result = 0; } @@ -393,7 +393,7 @@ int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name) result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1) { - ima_reset_appraise_flags(dentry->d_inode, 0); + ima_reset_appraise_flags(d_backing_inode(dentry), 0); result = 0; } return result; diff --git a/security/lsm_audit.c b/security/lsm_audit.c index b526ddc3add5..1d34277dc402 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -237,7 +237,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_d_path(ab, " path=", &a->u.path); - inode = a->u.path.dentry->d_inode; + inode = d_backing_inode(a->u.path.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); @@ -251,7 +251,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_format(ab, " name="); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); - inode = a->u.dentry->d_inode; + inode = d_backing_inode(a->u.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); diff --git a/security/security.c b/security/security.c index 730ac65a5737..8e9b1f4b9b45 100644 --- a/security/security.c +++ b/security/security.c @@ -410,7 +410,7 @@ EXPORT_SYMBOL(security_old_inode_init_security); int security_path_mknod(struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { - if (unlikely(IS_PRIVATE(dir->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return security_ops->path_mknod(dir, dentry, mode, dev); } @@ -418,7 +418,7 @@ EXPORT_SYMBOL(security_path_mknod); int security_path_mkdir(struct path *dir, struct dentry *dentry, umode_t mode) { - if (unlikely(IS_PRIVATE(dir->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return security_ops->path_mkdir(dir, dentry, mode); } @@ -426,14 +426,14 @@ EXPORT_SYMBOL(security_path_mkdir); int security_path_rmdir(struct path *dir, struct dentry *dentry) { - if (unlikely(IS_PRIVATE(dir->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return security_ops->path_rmdir(dir, dentry); } int security_path_unlink(struct path *dir, struct dentry *dentry) { - if (unlikely(IS_PRIVATE(dir->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return security_ops->path_unlink(dir, dentry); } @@ -442,7 +442,7 @@ EXPORT_SYMBOL(security_path_unlink); int security_path_symlink(struct path *dir, struct dentry *dentry, const char *old_name) { - if (unlikely(IS_PRIVATE(dir->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return security_ops->path_symlink(dir, dentry, old_name); } @@ -450,7 +450,7 @@ int security_path_symlink(struct path *dir, struct dentry *dentry, int security_path_link(struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry) { - if (unlikely(IS_PRIVATE(old_dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return security_ops->path_link(old_dentry, new_dir, new_dentry); } @@ -459,8 +459,8 @@ int security_path_rename(struct path *old_dir, struct dentry *old_dentry, struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (unlikely(IS_PRIVATE(old_dentry->d_inode) || - (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode)))) + if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || + (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; 
if (flags & RENAME_EXCHANGE) { @@ -477,21 +477,21 @@ EXPORT_SYMBOL(security_path_rename); int security_path_truncate(struct path *path) { - if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return security_ops->path_truncate(path); } int security_path_chmod(struct path *path, umode_t mode) { - if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return security_ops->path_chmod(path, mode); } int security_path_chown(struct path *path, kuid_t uid, kgid_t gid) { - if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return security_ops->path_chown(path, uid, gid); } @@ -513,14 +513,14 @@ EXPORT_SYMBOL_GPL(security_inode_create); int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - if (unlikely(IS_PRIVATE(old_dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return security_ops->inode_link(old_dentry, dir, new_dentry); } int security_inode_unlink(struct inode *dir, struct dentry *dentry) { - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return security_ops->inode_unlink(dir, dentry); } @@ -543,7 +543,7 @@ EXPORT_SYMBOL_GPL(security_inode_mkdir); int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return security_ops->inode_rmdir(dir, dentry); } @@ -559,8 +559,8 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (unlikely(IS_PRIVATE(old_dentry->d_inode) || - (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode)))) + if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || + (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { @@ -576,14 +576,14 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, int security_inode_readlink(struct dentry *dentry) { - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return security_ops->inode_readlink(dentry); } int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd) { - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return security_ops->inode_follow_link(dentry, nd); } @@ -599,7 +599,7 @@ int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { int ret; - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = security_ops->inode_setattr(dentry, attr); if (ret) @@ -610,7 +610,7 @@ EXPORT_SYMBOL_GPL(security_inode_setattr); int security_inode_getattr(const struct path *path) { - if (unlikely(IS_PRIVATE(path->dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return security_ops->inode_getattr(path); } @@ -620,7 +620,7 @@ int security_inode_setxattr(struct dentry *dentry, const char *name, { int ret; - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = security_ops->inode_setxattr(dentry, name, value, size, flags); if (ret) @@ -634,7 +634,7 @@ int security_inode_setxattr(struct dentry *dentry, const char *name, void 
security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; security_ops->inode_post_setxattr(dentry, name, value, size, flags); evm_inode_post_setxattr(dentry, name, value, size); @@ -642,14 +642,14 @@ void security_inode_post_setxattr(struct dentry *dentry, const char *name, int security_inode_getxattr(struct dentry *dentry, const char *name) { - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return security_ops->inode_getxattr(dentry, name); } int security_inode_listxattr(struct dentry *dentry) { - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return security_ops->inode_listxattr(dentry); } @@ -658,7 +658,7 @@ int security_inode_removexattr(struct dentry *dentry, const char *name) { int ret; - if (unlikely(IS_PRIVATE(dentry->d_inode))) + if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = security_ops->inode_removexattr(dentry, name); if (ret) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c318b304ee2f..7dade28affba 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -414,7 +414,7 @@ static int sb_finish_set_opts(struct super_block *sb) { struct superblock_security_struct *sbsec = sb->s_security; struct dentry *root = sb->s_root; - struct inode *root_inode = root->d_inode; + struct inode *root_inode = d_backing_inode(root); int rc = 0; if (sbsec->behavior == SECURITY_FS_USE_XATTR) { @@ -552,7 +552,7 @@ static int selinux_get_mnt_opts(const struct super_block *sb, opts->mnt_opts_flags[i++] = DEFCONTEXT_MNT; } if (sbsec->flags & ROOTCONTEXT_MNT) { - struct inode *root = sbsec->sb->s_root->d_inode; + struct inode *root = d_backing_inode(sbsec->sb->s_root); struct inode_security_struct *isec = root->i_security; rc = security_sid_to_context(isec->sid, &context, &len); @@ -608,7 +608,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, int rc = 0, i; struct superblock_security_struct *sbsec = sb->s_security; const char *name = sb->s_type->name; - struct inode *inode = sbsec->sb->s_root->d_inode; + struct inode *inode = d_backing_inode(sbsec->sb->s_root); struct inode_security_struct *root_isec = inode->i_security; u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0; u32 defcontext_sid = 0; @@ -835,8 +835,8 @@ static int selinux_cmp_sb_context(const struct super_block *oldsb, if ((oldflags & DEFCONTEXT_MNT) && old->def_sid != new->def_sid) goto mismatch; if (oldflags & ROOTCONTEXT_MNT) { - struct inode_security_struct *oldroot = oldsb->s_root->d_inode->i_security; - struct inode_security_struct *newroot = newsb->s_root->d_inode->i_security; + struct inode_security_struct *oldroot = d_backing_inode(oldsb->s_root)->i_security; + struct inode_security_struct *newroot = d_backing_inode(newsb->s_root)->i_security; if (oldroot->sid != newroot->sid) goto mismatch; } @@ -886,16 +886,16 @@ static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, if (!set_fscontext) newsbsec->sid = sid; if (!set_rootcontext) { - struct inode *newinode = newsb->s_root->d_inode; + struct inode *newinode = d_backing_inode(newsb->s_root); struct inode_security_struct *newisec = newinode->i_security; newisec->sid = sid; } newsbsec->mntpoint_sid = sid; } if (set_rootcontext) { - const struct inode *oldinode = oldsb->s_root->d_inode; + const struct inode *oldinode = 
-		const struct inode *oldinode = oldsb->s_root->d_inode;
+		const struct inode *oldinode = d_backing_inode(oldsb->s_root);
 		const struct inode_security_struct *oldisec = oldinode->i_security;
-		struct inode *newinode = newsb->s_root->d_inode;
+		struct inode *newinode = d_backing_inode(newsb->s_root);
 		struct inode_security_struct *newisec = newinode->i_security;
 
 		newisec->sid = oldisec->sid;
@@ -1610,7 +1610,7 @@ static inline int dentry_has_perm(const struct cred *cred,
 				  struct dentry *dentry,
 				  u32 av)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_backing_inode(dentry);
 	struct common_audit_data ad;
 
 	ad.type = LSM_AUDIT_DATA_DENTRY;
@@ -1625,7 +1625,7 @@ static inline int path_has_perm(const struct cred *cred,
 				const struct path *path,
 				u32 av)
 {
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = d_backing_inode(path->dentry);
 	struct common_audit_data ad;
 
 	ad.type = LSM_AUDIT_DATA_PATH;
@@ -1753,7 +1753,7 @@ static int may_link(struct inode *dir,
 	int rc;
 
 	dsec = dir->i_security;
-	isec = dentry->d_inode->i_security;
+	isec = d_backing_inode(dentry)->i_security;
 
 	ad.type = LSM_AUDIT_DATA_DENTRY;
 	ad.u.dentry = dentry;
@@ -1797,7 +1797,7 @@ static inline int may_rename(struct inode *old_dir,
 	int rc;
 
 	old_dsec = old_dir->i_security;
-	old_isec = old_dentry->d_inode->i_security;
+	old_isec = d_backing_inode(old_dentry)->i_security;
 	old_is_dir = d_is_dir(old_dentry);
 	new_dsec = new_dir->i_security;
@@ -1827,7 +1827,7 @@ static inline int may_rename(struct inode *old_dir,
 	if (rc)
 		return rc;
 	if (d_is_positive(new_dentry)) {
-		new_isec = new_dentry->d_inode->i_security;
+		new_isec = d_backing_inode(new_dentry)->i_security;
 		new_is_dir = d_is_dir(new_dentry);
 		rc = avc_has_perm(sid, new_isec->sid,
 				  new_isec->sclass,
@@ -1963,7 +1963,7 @@ static int selinux_binder_transfer_file(struct task_struct *from,
 {
 	u32 sid = task_sid(to);
 	struct file_security_struct *fsec = file->f_security;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = d_backing_inode(file->f_path.dentry);
 	struct inode_security_struct *isec = inode->i_security;
 	struct common_audit_data ad;
 	int rc;
@@ -2627,7 +2627,7 @@ static int selinux_sb_remount(struct super_block *sb, void *data)
 			break;
 		case ROOTCONTEXT_MNT: {
 			struct inode_security_struct *root_isec;
-			root_isec = sb->s_root->d_inode->i_security;
+			root_isec = d_backing_inode(sb->s_root)->i_security;
 
 			if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid, sid))
 				goto out_bad_option;
@@ -2727,7 +2727,7 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode,
 	struct task_security_struct *tsec;
 	struct inode_security_struct *dsec;
 	struct superblock_security_struct *sbsec;
-	struct inode *dir = dentry->d_parent->d_inode;
+	struct inode *dir = d_backing_inode(dentry->d_parent);
 	u32 newsid;
 	int rc;
@@ -2982,7 +2982,7 @@ static int selinux_inode_setotherxattr(struct dentry *dentry, const char *name)
 static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
 				  const void *value, size_t size, int flags)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_backing_inode(dentry);
 	struct inode_security_struct *isec = inode->i_security;
 	struct superblock_security_struct *sbsec;
 	struct common_audit_data ad;
@@ -3059,7 +3059,7 @@ static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name,
 					const void *value, size_t size,
 					int flags)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_backing_inode(dentry);
 	struct inode_security_struct *isec = inode->i_security;
 	u32 newsid;
 	int rc;
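The hooks above are part of a tree-wide conversion: everywhere an LSM used to dereference dentry->d_inode directly now goes through d_backing_inode(). On an ordinary filesystem the helper is a plain accessor; the split exists so each call site records whether it wants the dentry's own inode or the inode of the backing layer once overlay/union mounts are in play. A rough sketch of the two helpers as defined in include/linux/dcache.h around this release (paraphrased, not quoted from the patch):

	/* Sketch, paraphrasing 4.1-era include/linux/dcache.h */
	static inline struct inode *d_inode(const struct dentry *dentry)
	{
		return dentry->d_inode;		/* the dentry's own inode */
	}

	static inline struct inode *d_backing_inode(const struct dentry *upper)
	{
		/* Meant to resolve to the backing (real) inode on a union
		 * or overlay mount; equivalent to d_inode() elsewhere. */
		return upper->d_inode;
	}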
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 5fde34326dcf..d2787cca1fcb 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1737,7 +1737,7 @@ static struct dentry *sel_make_dir(struct dentry *dir, const char *name,
 	inc_nlink(inode);
 	d_add(dentry, inode);
 	/* bump link count on parent directory, too */
-	inc_nlink(dir->d_inode);
+	inc_nlink(d_inode(dir));
 
 	return dentry;
 }
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 69fdc384af30..b644757886bc 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -593,7 +593,7 @@ static int smack_sb_copy_data(char *orig, char *smackopts)
 static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
 {
 	struct dentry *root = sb->s_root;
-	struct inode *inode = root->d_inode;
+	struct inode *inode = d_backing_inode(root);
 	struct superblock_smack *sp = sb->s_security;
 	struct inode_smack *isp;
 	struct smack_known *skp;
@@ -889,15 +889,15 @@ static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
 	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
-	isp = smk_of_inode(old_dentry->d_inode);
+	isp = smk_of_inode(d_backing_inode(old_dentry));
 	rc = smk_curacc(isp, MAY_WRITE, &ad);
-	rc = smk_bu_inode(old_dentry->d_inode, MAY_WRITE, rc);
+	rc = smk_bu_inode(d_backing_inode(old_dentry), MAY_WRITE, rc);
 
 	if (rc == 0 && d_is_positive(new_dentry)) {
-		isp = smk_of_inode(new_dentry->d_inode);
+		isp = smk_of_inode(d_backing_inode(new_dentry));
 		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
 		rc = smk_curacc(isp, MAY_WRITE, &ad);
-		rc = smk_bu_inode(new_dentry->d_inode, MAY_WRITE, rc);
+		rc = smk_bu_inode(d_backing_inode(new_dentry), MAY_WRITE, rc);
 	}
 
 	return rc;
@@ -913,7 +913,7 @@ static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
  */
 static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *ip = dentry->d_inode;
+	struct inode *ip = d_backing_inode(dentry);
 	struct smk_audit_info ad;
 	int rc;
 
@@ -956,8 +956,8 @@ static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
 	/*
 	 * You need write access to the thing you're removing
 	 */
-	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
-	rc = smk_bu_inode(dentry->d_inode, MAY_WRITE, rc);
+	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
+	rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
 	if (rc == 0) {
 		/*
 		 * You also need write access to the containing directory
@@ -995,15 +995,15 @@ static int smack_inode_rename(struct inode *old_inode,
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
 	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
-	isp = smk_of_inode(old_dentry->d_inode);
+	isp = smk_of_inode(d_backing_inode(old_dentry));
 	rc = smk_curacc(isp, MAY_READWRITE, &ad);
-	rc = smk_bu_inode(old_dentry->d_inode, MAY_READWRITE, rc);
+	rc = smk_bu_inode(d_backing_inode(old_dentry), MAY_READWRITE, rc);
 
 	if (rc == 0 && d_is_positive(new_dentry)) {
-		isp = smk_of_inode(new_dentry->d_inode);
+		isp = smk_of_inode(d_backing_inode(new_dentry));
 		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
 		rc = smk_curacc(isp, MAY_READWRITE, &ad);
-		rc = smk_bu_inode(new_dentry->d_inode, MAY_READWRITE, rc);
+		rc = smk_bu_inode(d_backing_inode(new_dentry), MAY_READWRITE, rc);
 	}
 	return rc;
 }
@@ -1060,8 +1060,8 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
 	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
-	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
-	rc = smk_bu_inode(dentry->d_inode, MAY_WRITE, rc);
+	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
+	rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
 	return rc;
 }
 
@@ -1075,7 +1075,7 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 static int smack_inode_getattr(const struct path *path)
 {
 	struct smk_audit_info ad;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = d_backing_inode(path->dentry);
 	int rc;
 
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
@@ -1142,8 +1142,8 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name,
 	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 
 	if (rc == 0) {
-		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
-		rc = smk_bu_inode(dentry->d_inode, MAY_WRITE, rc);
+		rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
+		rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
 	}
 
 	return rc;
@@ -1164,7 +1164,7 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name,
 				      const void *value, size_t size, int flags)
 {
 	struct smack_known *skp;
-	struct inode_smack *isp = dentry->d_inode->i_security;
+	struct inode_smack *isp = d_backing_inode(dentry)->i_security;
 
 	if (strcmp(name, XATTR_NAME_SMACKTRANSMUTE) == 0) {
 		isp->smk_flags |= SMK_INODE_TRANSMUTE;
@@ -1209,8 +1209,8 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name)
 
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
 	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 
-	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad);
-	rc = smk_bu_inode(dentry->d_inode, MAY_READ, rc);
+	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_READ, &ad);
+	rc = smk_bu_inode(d_backing_inode(dentry), MAY_READ, rc);
 	return rc;
 }
@@ -1246,12 +1246,12 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 
 	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
 	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 
-	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
-	rc = smk_bu_inode(dentry->d_inode, MAY_WRITE, rc);
+	rc = smk_curacc(smk_of_inode(d_backing_inode(dentry)), MAY_WRITE, &ad);
+	rc = smk_bu_inode(d_backing_inode(dentry), MAY_WRITE, rc);
 	if (rc != 0)
 		return rc;
 
-	isp = dentry->d_inode->i_security;
+	isp = d_backing_inode(dentry)->i_security;
 	/*
 	 * Don't do anything special for these.
	 *	XATTR_NAME_SMACKIPIN
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 06f719ed63c9..d9682985349e 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -2490,7 +2490,7 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent)
 		return rc;
 	}
 
-	root_inode = sb->s_root->d_inode;
+	root_inode = d_inode(sb->s_root);
 
 	return 0;
 }
diff --git a/security/tomoyo/condition.c b/security/tomoyo/condition.c
index 63681e8be628..6c4528d4b48f 100644
--- a/security/tomoyo/condition.c
+++ b/security/tomoyo/condition.c
@@ -714,7 +714,7 @@ void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
 			dentry = dget_parent(dentry);
 			break;
 		}
-		inode = dentry->d_inode;
+		inode = d_backing_inode(dentry);
 		if (inode) {
 			struct tomoyo_mini_stat *stat = &obj->stat[i];
 
 			stat->uid  = inode->i_uid;
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 1e0d480ff6a6..5077f1968841 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -97,7 +97,7 @@ static char *tomoyo_get_absolute_path(const struct path *path, char * const buff
 		/* go to whatever namespace root we are under */
 		pos = d_absolute_path(path, buffer, buflen - 1);
 		if (!IS_ERR(pos) && *pos == '/' && pos[1]) {
-			struct inode *inode = path->dentry->d_inode;
+			struct inode *inode = d_backing_inode(path->dentry);
 			if (inode && S_ISDIR(inode->i_mode)) {
 				buffer[buflen - 2] = '/';
 				buffer[buflen - 1] = '\0';
@@ -125,7 +125,7 @@ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer,
 	if (buflen >= 256) {
 		pos = dentry_path_raw(dentry, buffer, buflen - 1);
 		if (!IS_ERR(pos) && *pos == '/' && pos[1]) {
-			struct inode *inode = dentry->d_inode;
+			struct inode *inode = d_backing_inode(dentry);
 			if (inode && S_ISDIR(inode->i_mode)) {
 				buffer[buflen - 2] = '/';
 				buffer[buflen - 1] = '\0';
@@ -168,7 +168,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
 	if (!MAJOR(sb->s_dev))
 		goto prepend_filesystem_name;
 	{
-		struct inode *inode = sb->s_root->d_inode;
+		struct inode *inode = d_backing_inode(sb->s_root);
 		/*
 		 * Use filesystem name if filesystem does not support rename()
		 * operation.
@@ -219,7 +219,7 @@ out:
 static char *tomoyo_get_socket_name(const struct path *path, char * const buffer,
 				    const int buflen)
 {
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = d_backing_inode(path->dentry);
 	struct socket *sock = inode ? SOCKET_I(inode) : NULL;
 	struct sock *sk = sock ? sock->sk : NULL;
 	if (sk) {
@@ -277,7 +277,7 @@ char *tomoyo_realpath_from_path(const struct path *path)
 			pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1);
 			goto encode;
 		}
-		inode = sb->s_root->d_inode;
+		inode = d_backing_inode(sb->s_root);
 		/*
 		 * Get local name for filesystems without rename() operation
		 * or dentry without vfsmount.
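Note the division of labor across the security/ hunks above: selinuxfs and smackfs call d_inode() on dentries of pseudo-filesystems they created themselves, while the hook paths (hooks.c, smack_lsm.c, tomoyo) call d_backing_inode() on dentries handed in from arbitrary filesystems. As a rule of thumb (my gloss on the conversion, not text from the patch):

	/* Illustrative only, assuming the 4.1-era helpers sketched earlier: */
	struct inode *own   = d_inode(sb->s_root);           /* our own pseudo-fs */
	struct inode *other = d_backing_inode(path->dentry); /* the caller's fs   */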
diff --git a/sound/oss/sequencer.c b/sound/oss/sequencer.c
index c0eea1dfe90f..f19da4b47c1d 100644
--- a/sound/oss/sequencer.c
+++ b/sound/oss/sequencer.c
@@ -681,13 +681,8 @@ static int seq_timing_event(unsigned char *event_rec)
 			break;
 
 		case TMR_ECHO:
-			if (seq_mode == SEQ_2)
-				seq_copy_to_input(event_rec, 8);
-			else
-			{
-				parm = (parm << 8 | SEQ_ECHO);
-				seq_copy_to_input((unsigned char *) &parm, 4);
-			}
+			parm = (parm << 8 | SEQ_ECHO);
+			seq_copy_to_input((unsigned char *) &parm, 4);
 			break;
 
 		default:;
@@ -1324,7 +1319,6 @@ int sequencer_ioctl(int dev, struct file *file, unsigned int cmd, void __user *a
 	int mode = translate_mode(file);
 	struct synth_info inf;
 	struct seq_event_rec event_rec;
-	unsigned long flags;
 	int __user *p = arg;
 
 	orig_dev = dev = dev >> 4;
@@ -1479,9 +1473,7 @@ int sequencer_ioctl(int dev, struct file *file, unsigned int cmd, void __user *a
 		case SNDCTL_SEQ_OUTOFBAND:
 			if (copy_from_user(&event_rec, arg, sizeof(event_rec)))
 				return -EFAULT;
-			spin_lock_irqsave(&lock,flags);
 			play_event(event_rec.arr);
-			spin_unlock_irqrestore(&lock,flags);
 			return 0;
 
 		case SNDCTL_MIDI_INFO:
diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index e70a7fb393dd..873ed1bce12b 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -2529,7 +2529,7 @@ static void set_dig_out(struct hda_codec *codec, hda_nid_t nid,
 	if (!d)
 		return;
 	for (; *d; d++)
-		snd_hdac_regmap_update(&codec->core, nid,
+		snd_hdac_regmap_update(&codec->core, *d,
 				       AC_VERB_SET_DIGI_CONVERT_1, mask, val);
 }
diff --git a/sound/pci/hda/hda_controller.h b/sound/pci/hda/hda_controller.h
index be1b7ded8d82..0efdb094d21c 100644
--- a/sound/pci/hda/hda_controller.h
+++ b/sound/pci/hda/hda_controller.h
@@ -404,7 +404,7 @@ struct azx {
 	((chip)->ops->reg_readb((dev)->sd_addr + AZX_REG_##reg))
 
 #define azx_has_pm_runtime(chip) \
-	(!AZX_DCAPS_PM_RUNTIME || ((chip)->driver_caps & AZX_DCAPS_PM_RUNTIME))
+	((chip)->driver_caps & AZX_DCAPS_PM_RUNTIME)
 
 /* PCM setup */
 static inline struct azx_dev *get_azx_dev(struct snd_pcm_substream *substream)
diff --git a/sound/pci/hda/hda_i915.c b/sound/pci/hda/hda_i915.c
index 52a85d87c23c..3052a2b095f7 100644
--- a/sound/pci/hda/hda_i915.c
+++ b/sound/pci/hda/hda_i915.c
@@ -55,6 +55,12 @@ void haswell_set_bclk(struct hda_intel *hda)
 	int cdclk_freq;
 	unsigned int bclk_m, bclk_n;
 	struct i915_audio_component *acomp = &hda->audio_component;
+	struct pci_dev *pci = hda->chip.pci;
+
+	/* Only Haswell/Broadwell need set BCLK */
+	if (pci->device != 0x0a0c && pci->device != 0x0c0c
+	   && pci->device != 0x0d0c && pci->device != 0x160c)
+		return;
 
 	if (!acomp->ops)
 		return;
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index e1c210515581..34040d26c94f 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -297,6 +297,9 @@ enum {
 	 AZX_DCAPS_PM_RUNTIME | AZX_DCAPS_I915_POWERWELL |\
 	 AZX_DCAPS_SNOOP_TYPE(SCH))
 
+#define AZX_DCAPS_INTEL_BAYTRAIL \
+	(AZX_DCAPS_INTEL_PCH_NOPM | AZX_DCAPS_I915_POWERWELL)
+
 #define AZX_DCAPS_INTEL_BRASWELL \
 	(AZX_DCAPS_INTEL_PCH | AZX_DCAPS_I915_POWERWELL)
 
@@ -1992,7 +1995,7 @@ static const struct pci_device_id azx_ids[] = {
 	  .driver_data = AZX_DRIVER_SCH | AZX_DCAPS_INTEL_PCH_NOPM },
 	/* BayTrail */
 	{ PCI_DEVICE(0x8086, 0x0f04),
-	  .driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_PCH_NOPM },
+	  .driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_BAYTRAIL },
 	/* Braswell */
 	{ PCI_DEVICE(0x8086, 0x2284),
 	  .driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_BRASWELL },
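Two of the HDA hunks above deserve a gloss. The set_dig_out() change fixes a loop that sent the verb to the same NID on every iteration instead of walking the slave list via *d. The azx_has_pm_runtime() change, by contrast, should be behavior-neutral: assuming AZX_DCAPS_PM_RUNTIME is a nonzero constant flag bit, the old body reduces to the new one (a worked reduction, not code from the patch):

	/* !AZX_DCAPS_PM_RUNTIME                              -> 0
	 * 0 || ((chip)->driver_caps & AZX_DCAPS_PM_RUNTIME)  -> the & test alone
	 *
	 * so the rewrite only drops a dead constant term.
	 */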
diff --git a/sound/pci/hda/hda_proc.c b/sound/pci/hda/hda_proc.c
index ee6230767c64..baaf7ed06875 100644
--- a/sound/pci/hda/hda_proc.c
+++ b/sound/pci/hda/hda_proc.c
@@ -582,8 +582,8 @@ static void print_conn_list(struct snd_info_buffer *buffer,
 
 	/* Get Cache connections info */
 	cache_len = snd_hda_get_conn_list(codec, nid, &list);
-	if (cache_len != conn_len
-	    || memcmp(list, conn, conn_len)) {
+	if (cache_len >= 0 && (cache_len != conn_len ||
+			       memcmp(list, conn, conn_len) != 0)) {
 		snd_iprintf(buffer, "  In-driver Connection: %d\n", cache_len);
 		if (cache_len > 0) {
 			snd_iprintf(buffer, "     ");
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index b18b9c67b262..06199e4e930f 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -4176,17 +4176,15 @@ static void alc_fixup_disable_aamix(struct hda_codec *codec,
 	}
 }
 
-static unsigned int alc_power_filter_xps13(struct hda_codec *codec,
-					   hda_nid_t nid,
-					   unsigned int power_state)
+static void alc_shutup_dell_xps13(struct hda_codec *codec)
 {
 	struct alc_spec *spec = codec->spec;
+	int hp_pin = spec->gen.autocfg.hp_pins[0];
 
-	/* Avoid pop noises when headphones are plugged in */
-	if (spec->gen.hp_jack_present)
-		if (nid == codec->core.afg || nid == 0x02 || nid == 0x15)
-			return AC_PWRST_D0;
-	return snd_hda_gen_path_power_filter(codec, nid, power_state);
+	/* Prevent pop noises when headphones are plugged in */
+	snd_hda_codec_write(codec, hp_pin, 0,
+			    AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE);
+	msleep(20);
 }
 
 static void alc_fixup_dell_xps13(struct hda_codec *codec,
@@ -4197,8 +4195,7 @@ static void alc_fixup_dell_xps13(struct hda_codec *codec,
 	struct hda_input_mux *imux = &spec->gen.input_mux;
 	int i;
 
-	spec->shutup = alc_no_shutup;
-	codec->power_filter = alc_power_filter_xps13;
+	spec->shutup = alc_shutup_dell_xps13;
 
 	/* Make the internal mic the default input source.
	 */
	for (i = 0; i < imux->num_items; i++) {
@@ -5231,6 +5228,16 @@ static const struct hda_model_fixup alc269_fixup_models[] = {
 	{0x1b, 0x411111f0}, \
 	{0x1e, 0x411111f0}
 
+#define ALC256_STANDARD_PINS \
+	{0x12, 0x90a60140}, \
+	{0x14, 0x90170110}, \
+	{0x19, 0x411111f0}, \
+	{0x1a, 0x411111f0}, \
+	{0x1b, 0x411111f0}, \
+	{0x1d, 0x40700001}, \
+	{0x1e, 0x411111f0}, \
+	{0x21, 0x02211020}
+
 #define ALC282_STANDARD_PINS \
 	{0x14, 0x90170110}, \
 	{0x18, 0x411111f0}, \
@@ -5331,15 +5338,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
 		{0x1d, 0x40700001},
 		{0x21, 0x02211050}),
 	SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
-		{0x12, 0x90a60140},
-		{0x13, 0x40000000},
-		{0x14, 0x90170110},
-		{0x19, 0x411111f0},
-		{0x1a, 0x411111f0},
-		{0x1b, 0x411111f0},
-		{0x1d, 0x40700001},
-		{0x1e, 0x411111f0},
-		{0x21, 0x02211020}),
+		ALC256_STANDARD_PINS,
+		{0x13, 0x40000000}),
+	SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+		ALC256_STANDARD_PINS,
+		{0x13, 0x411111f0}),
 	SND_HDA_PIN_QUIRK(0x10ec0280, 0x103c, "HP", ALC280_FIXUP_HP_GPIO4,
 		{0x12, 0x90a60130},
 		{0x13, 0x40000000},
@@ -5667,6 +5670,8 @@ static int patch_alc269(struct hda_codec *codec)
 		break;
 	case 0x10ec0256:
 		spec->codec_variant = ALC269_TYPE_ALC256;
+		spec->gen.mixer_nid = 0; /* ALC256 does not have any loopback mixer path */
+		alc_update_coef_idx(codec, 0x36, 1 << 13, 1 << 5); /* Switch pcbeep path to Line in path*/
 		break;
 	}
 
@@ -5680,8 +5685,8 @@ static int patch_alc269(struct hda_codec *codec)
 	if (err < 0)
 		goto error;
 
-	if (!spec->gen.no_analog && spec->gen.beep_nid)
-		set_beep_amp(spec, 0x0b, 0x04, HDA_INPUT);
+	if (!spec->gen.no_analog && spec->gen.beep_nid && spec->gen.mixer_nid)
+		set_beep_amp(spec, spec->gen.mixer_nid, 0x04, HDA_INPUT);
 
 	codec->patch_ops = alc_patch_ops;
 	codec->patch_ops.stream_pm = snd_hda_gen_stream_pm;
diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c
index 749069aa6997..b120925223ae 100644
--- a/sound/pci/intel8x0.c
+++ b/sound/pci/intel8x0.c
@@ -3101,13 +3101,13 @@ static int snd_intel8x0_create(struct snd_card *card,
 			chip->bmaddr = pci_iomap(pci, 3, 0);
 		else
 			chip->bmaddr = pci_iomap(pci, 1, 0);
+
+ port_inited:
 	if (!chip->bmaddr) {
 		dev_err(card->dev, "Controller space ioremap problem\n");
 		snd_intel8x0_free(chip);
 		return -EIO;
 	}
-
- port_inited:
 	chip->bdbars_count = bdbars[device_type];
 
 	/* initialize offsets */
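The intel8x0 hunk is a pure control-flow fix: the path that jumped to port_inited used to bypass the NULL check on chip->bmaddr, so a failed early pci_iomap() went unnoticed. Moving the label above the check makes both paths converge on it. In skeleton form (illustrative shape only, not the driver code; early_mapping_done is hypothetical):

	/* before: the goto path skipped validation */
	if (early_mapping_done)
		goto port_inited;
	chip->bmaddr = pci_iomap(pci, 1, 0);
	if (!chip->bmaddr)		/* only the fall-through was checked */
		return -EIO;
 port_inited:
	/* ... */

	/* after: every path funnels through the check */
	chip->bmaddr = pci_iomap(pci, 1, 0);
 port_inited:
	if (!chip->bmaddr)
		return -EIO;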
diff --git a/sound/soc/sh/fsi.c b/sound/soc/sh/fsi.c
index 0c2af21b0b82..142c066eaee2 100644
--- a/sound/soc/sh/fsi.c
+++ b/sound/soc/sh/fsi.c
@@ -250,6 +250,7 @@ struct fsi_clk {
 
 struct fsi_priv {
 	void __iomem *base;
+	phys_addr_t phys;
 	struct fsi_master *master;
 
 	struct fsi_stream playback;
@@ -1371,13 +1372,18 @@ static int fsi_dma_probe(struct fsi_priv *fsi, struct fsi_stream *io, struct dev
 				shdma_chan_filter, (void *)io->dma_id,
 				dev, is_play ? "tx" : "rx");
 	if (io->chan) {
-		struct dma_slave_config cfg;
+		struct dma_slave_config cfg = {};
 		int ret;
 
-		cfg.slave_id	= io->dma_id;
-		cfg.dst_addr	= 0; /* use default addr */
-		cfg.src_addr	= 0; /* use default addr */
-		cfg.direction	= is_play ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM;
+		if (is_play) {
+			cfg.dst_addr		= fsi->phys + REG_DODT;
+			cfg.dst_addr_width	= DMA_SLAVE_BUSWIDTH_4_BYTES;
+			cfg.direction		= DMA_MEM_TO_DEV;
+		} else {
+			cfg.src_addr		= fsi->phys + REG_DIDT;
+			cfg.src_addr_width	= DMA_SLAVE_BUSWIDTH_4_BYTES;
+			cfg.direction		= DMA_DEV_TO_MEM;
+		}
 
 		ret = dmaengine_slave_config(io->chan, &cfg);
 		if (ret < 0) {
@@ -1974,6 +1980,7 @@ static int fsi_probe(struct platform_device *pdev)
 	/* FSI A setting */
 	fsi		= &master->fsia;
 	fsi->base	= master->base;
+	fsi->phys	= res->start;
 	fsi->master	= master;
 	fsi_port_info_init(fsi, &info.port_a);
 	fsi_handler_init(fsi, &info.port_a);
@@ -1986,6 +1993,7 @@ static int fsi_probe(struct platform_device *pdev)
 	/* FSI B setting */
 	fsi		= &master->fsib;
 	fsi->base	= master->base + 0x40;
+	fsi->phys	= res->start + 0x40;
 	fsi->master	= master;
 	fsi_port_info_init(fsi, &info.port_b);
 	fsi_handler_init(fsi, &info.port_b);
diff --git a/sound/usb/format.c b/sound/usb/format.c
index 8bcc87cf5667..789d19ec035d 100644
--- a/sound/usb/format.c
+++ b/sound/usb/format.c
@@ -79,7 +79,10 @@ static u64 parse_audio_format_i_type(struct snd_usb_audio *chip,
 		format = 1 << UAC_FORMAT_TYPE_I_PCM;
 	}
 	if (format & (1 << UAC_FORMAT_TYPE_I_PCM)) {
-		if (chip->usb_id == USB_ID(0x0582, 0x0016) /* Edirol SD-90 */ &&
+		if (((chip->usb_id == USB_ID(0x0582, 0x0016)) ||
+		     /* Edirol SD-90 */
+		     (chip->usb_id == USB_ID(0x0582, 0x000c))) &&
+		    /* Roland SC-D70 */
 		    sample_width == 24 && sample_bytes == 2)
 			sample_bytes = 3;
 		else if (sample_width > sample_bytes * 8) {
diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h
index 07f984d5f516..2f6d3e9a1bcd 100644
--- a/sound/usb/quirks-table.h
+++ b/sound/usb/quirks-table.h
@@ -816,37 +816,11 @@ YAMAHA_DEVICE(0x7010, "UB99"),
 		.data = (const struct snd_usb_audio_quirk[]) {
 			{
 				.ifnum = 0,
-				.type = QUIRK_AUDIO_FIXED_ENDPOINT,
-				.data = & (const struct audioformat) {
-					.formats = SNDRV_PCM_FMTBIT_S24_3LE,
-					.channels = 2,
-					.iface = 0,
-					.altsetting = 1,
-					.altset_idx = 1,
-					.attributes = 0,
-					.endpoint = 0x01,
-					.ep_attr = 0x01,
-					.rates = SNDRV_PCM_RATE_CONTINUOUS,
-					.rate_min = 44100,
-					.rate_max = 44100,
-				}
+				.type = QUIRK_AUDIO_STANDARD_INTERFACE
 			},
 			{
 				.ifnum = 1,
-				.type = QUIRK_AUDIO_FIXED_ENDPOINT,
-				.data = & (const struct audioformat) {
-					.formats = SNDRV_PCM_FMTBIT_S24_3LE,
-					.channels = 2,
-					.iface = 1,
-					.altsetting = 1,
-					.altset_idx = 1,
-					.attributes = 0,
-					.endpoint = 0x81,
-					.ep_attr = 0x01,
-					.rates = SNDRV_PCM_RATE_CONTINUOUS,
-					.rate_min = 44100,
-					.rate_max = 44100,
-				}
+				.type = QUIRK_AUDIO_STANDARD_INTERFACE
 			},
 			{
 				.ifnum = 2,
diff --git a/tools/power/cpupower/utils/helpers/pci.c b/tools/power/cpupower/utils/helpers/pci.c
index 9690798e6446..8b278983cfc5 100644
--- a/tools/power/cpupower/utils/helpers/pci.c
+++ b/tools/power/cpupower/utils/helpers/pci.c
@@ -25,14 +25,21 @@ struct pci_dev *pci_acc_init(struct pci_access **pacc, int domain, int bus,
 			     int slot, int func, int vendor, int dev)
 {
-	struct pci_filter filter_nb_link = { domain, bus, slot, func,
-					     vendor, dev };
+	struct pci_filter filter_nb_link;
 	struct pci_dev *device;
 
 	*pacc = pci_alloc();
 	if (*pacc == NULL)
 		return NULL;
 
+	pci_filter_init(*pacc, &filter_nb_link);
+	filter_nb_link.domain	= domain;
+	filter_nb_link.bus	= bus;
+	filter_nb_link.slot	= slot;
+	filter_nb_link.func	= func;
+	filter_nb_link.vendor	= vendor;
+	filter_nb_link.device	= dev;
+
 	pci_init(*pacc);
 	pci_scan_bus(*pacc);
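The cpupower change is a pattern worth copying whenever libpci is involved: brace-initializing struct pci_filter by position breaks silently if the library inserts or reorders fields, which newer pciutils releases did. Letting pci_filter_init() establish the "match any" defaults and then setting named fields is layout-proof. A minimal self-contained sketch of the same idiom, with hypothetical vendor/device IDs:

	#include <pci/pci.h>

	/* Find one device without assuming struct pci_filter's layout. */
	static struct pci_dev *find_dev(struct pci_access *pacc)
	{
		struct pci_filter filter;
		struct pci_dev *dev;

		pci_filter_init(pacc, &filter);	/* all fields -> match any */
		filter.vendor = 0x8086;		/* then constrain by name   */
		filter.device = 0x0f04;		/* (IDs are illustrative)   */

		pci_scan_bus(pacc);
		for (dev = pacc->devices; dev; dev = dev->next)
			if (pci_filter_match(&filter, dev))
				return dev;
		return NULL;
	}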
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 8d550ff14700..78fb8201014f 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -1561,6 +1561,9 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
 		goto out;
 	}
 
+	if (irq_num >= kvm->arch.vgic.nr_irqs)
+		return -EINVAL;
+
 	vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
 	if (vcpu_id >= 0) {
 		/* kick the specified vcpu */
@@ -2141,7 +2144,7 @@ int kvm_irq_map_gsi(struct kvm *kvm,
 		    struct kvm_kernel_irq_routing_entry *entries,
 		    int gsi)
 {
-	return gsi;
+	return 0;
 }
 
 int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d3fc9399062a..90977418aeb6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -89,6 +89,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
 struct dentry *kvm_debugfs_dir;
+EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 			   unsigned long arg);
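On the kvm_irq_map_gsi() stub: callers treat the return value as the number of routing entries written into entries[], not as a GSI, so the old return gsi could make a caller scan entries that were never filled in; returning 0 correctly says "no routes". My paraphrase of that calling convention, with a hypothetical deliver_irq() helper and array size:

	struct kvm_kernel_irq_routing_entry entries[8];	/* size hypothetical */
	int i, n;

	n = kvm_irq_map_gsi(kvm, entries, gsi);	/* a count; 0 = no routes */
	for (i = 0; i < n; i++)
		deliver_irq(kvm, &entries[i]);	/* hypothetical helper */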