From 0e98936c924e2329f876b0b7791b45249c2e2129 Mon Sep 17 00:00:00 2001 From: Sumant Patro Date: Tue, 20 Jun 2006 15:32:37 -0700 Subject: [SCSI] megaraid_sas: zcr with fix The patch adds support for a ZCR controller (Device ID : 0x413). It also has a critical bug fix : Disable controller interrupt before firing INIT cmd to FW. Interrupt is enabled after required initialization is over. This is done to ensure that driver is ready to handle interrupts when it is generated by the controller. Signed-off-by: Sumant Patro Signed-off-by: James Bottomley --- Documentation/scsi/ChangeLog.megaraid_sas | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'Documentation') diff --git a/Documentation/scsi/ChangeLog.megaraid_sas b/Documentation/scsi/ChangeLog.megaraid_sas index 0a85a7e8120e..d9e5960dafd5 100644 --- a/Documentation/scsi/ChangeLog.megaraid_sas +++ b/Documentation/scsi/ChangeLog.megaraid_sas @@ -1,4 +1,20 @@ +1 Release Date : Sun May 14 22:49:52 PDT 2006 - Sumant Patro +2 Current Version : 00.00.03.01 +3 Older Version : 00.00.02.04 + +i. Added support for ZCR controller. + + New device id 0x413 added. + +ii. Bug fix : Disable controller interrupt before firing INIT cmd to FW. + + Interrupt is enabled after required initialization is over. + This is done to ensure that driver is ready to handle interrupts when + it is generated by the controller. + + -Sumant Patro + 1 Release Date : Wed Feb 03 14:31:44 PST 2006 - Sumant Patro 2 Current Version : 00.00.02.04 3 Older Version : 00.00.02.04 -- cgit v1.2.3 From 844d3b427ef1a4f96e54866747bdb6c6cbca4c6a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 Jun 2006 21:48:27 -0700 Subject: MTD: fix all kernel-doc warnings Fix all kernel-doc warnings in MTD headers and source files: - add some missing struct fields; - correct some function parameter names; - use kernel-doc format for function doc. headers; - nand_ecc.c contains only exported interfaces, no internal ones; Signed-off-by: Randy Dunlap Signed-off-by: David Woodhouse --- Documentation/DocBook/mtdnand.tmpl | 4 +++- drivers/mtd/nand/nand_base.c | 16 ++++++++-------- drivers/mtd/nand/nand_ecc.c | 3 +-- include/linux/mtd/nand.h | 13 ++++++++++--- 4 files changed, 22 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 6e463d0db266..32f385605981 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -1295,7 +1295,9 @@ in this page !Idrivers/mtd/nand/nand_base.c !Idrivers/mtd/nand/nand_bbt.c -!Idrivers/mtd/nand/nand_ecc.c + diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 80a76654d963..62b861304e03 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -155,7 +155,7 @@ static u16 nand_read_word(struct mtd_info *mtd) /** * nand_select_chip - [DEFAULT] control CE line * @mtd: MTD device structure - * @chip: chipnumber to select, -1 for deselect + * @chipnr: chipnumber to select, -1 for deselect * * Default select function for 1 chip devices. */ @@ -542,7 +542,6 @@ static void nand_command(struct mtd_info *mtd, unsigned int command, * Send command to NAND device. This is the version for the new large page * devices We dont have the separate regions as we have in the small page * devices. We must emulate NAND_CMD_READOOB to keep the code compatible. - * */ static void nand_command_lp(struct mtd_info *mtd, unsigned int command, int column, int page_addr) @@ -656,7 +655,7 @@ static void nand_command_lp(struct mtd_info *mtd, unsigned int command, /** * nand_get_device - [GENERIC] Get chip for selected access - * @this: the nand chip descriptor + * @chip: the nand chip descriptor * @mtd: MTD device structure * @new_state: the state which is requested * @@ -696,13 +695,12 @@ nand_get_device(struct nand_chip *chip, struct mtd_info *mtd, int new_state) /** * nand_wait - [DEFAULT] wait until the command is done * @mtd: MTD device structure - * @this: NAND chip structure + * @chip: NAND chip structure * * Wait for command done. This applies to erase and program only * Erase can take up to 400ms and program up to 20ms according to * general NAND and SmartMedia specs - * -*/ + */ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip) { @@ -896,6 +894,7 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip, /** * nand_transfer_oob - [Internal] Transfer oob to client buffer * @chip: nand chip structure + * @oob: oob destination address * @ops: oob ops structure */ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob, @@ -946,6 +945,7 @@ static uint8_t *nand_transfer_oob(struct nand_chip *chip, uint8_t *oob, * * @mtd: MTD device structure * @from: offset to read from + * @ops: oob ops structure * * Internal function. Called with chip held. */ @@ -1760,7 +1760,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, /** * nand_write_oob - [MTD Interface] NAND write data and/or out-of-band * @mtd: MTD device structure - * @from: offset to read from + * @to: offset to write to * @ops: oob operation description structure */ static int nand_write_oob(struct mtd_info *mtd, loff_t to, @@ -2055,7 +2055,7 @@ static void nand_sync(struct mtd_info *mtd) /** * nand_block_isbad - [MTD Interface] Check if block at offset is bad * @mtd: MTD device structure - * @ofs: offset relative to mtd start + * @offs: offset relative to mtd start */ static int nand_block_isbad(struct mtd_info *mtd, loff_t offs) { diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c index 2a163e4084df..dd438ca47d9a 100644 --- a/drivers/mtd/nand/nand_ecc.c +++ b/drivers/mtd/nand/nand_ecc.c @@ -65,8 +65,7 @@ static const u_char nand_ecc_precalc_table[] = { }; /** - * nand_calculate_ecc - [NAND Interface] Calculate 3 byte ECC code - * for 256 byte block + * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256-byte block * @mtd: MTD block structure * @dat: raw data * @ecc_code: buffer for ECC diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 66559272ebcb..2266f032a8c6 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -202,7 +202,7 @@ typedef enum { struct nand_chip; /** - * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independend devices + * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices * @lock: protection lock * @active: the mtd device which holds the controller currently * @wq: wait queue to sleep on if a NAND operation is in progress @@ -223,12 +223,15 @@ struct nand_hw_control { * @total: total number of ecc bytes per page * @prepad: padding information for syndrome based ecc generators * @postpad: padding information for syndrome based ecc generators + * @layout: ECC layout control struct pointer * @hwctl: function to control hardware ecc generator. Must only * be provided if an hardware ECC is available * @calculate: function for ecc calculation or readback from ecc hardware * @correct: function for ecc correction, matching to ecc generator (sw/hw) * @read_page: function to read a page according to the ecc generator requirements * @write_page: function to write a page according to the ecc generator requirements + * @read_oob: function to read chip OOB data + * @write_oob: function to write chip OOB data */ struct nand_ecc_ctrl { nand_ecc_modes_t mode; @@ -300,11 +303,15 @@ struct nand_buffers { * @cmdfunc: [REPLACEABLE] hardwarespecific function for writing commands to the chip * @waitfunc: [REPLACEABLE] hardwarespecific function for wait on ready * @ecc: [BOARDSPECIFIC] ecc control ctructure + * @buffers: buffer structure for read/write + * @hwcontrol: platform-specific hardware control structure + * @ops: oob operation operands * @erase_cmd: [INTERN] erase command write function, selectable due to AND support * @scan_bbt: [REPLACEABLE] function to scan bad block table * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR) * @wq: [INTERN] wait queue to sleep on if a NAND operation is in progress * @state: [INTERN] the current state of the NAND device + * @oob_poi: poison value buffer * @page_shift: [INTERN] number of address bits in a page (column address bits) * @phys_erase_shift: [INTERN] number of address bits in a physical eraseblock * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry @@ -521,7 +528,7 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, * struct platform_nand_chip - chip level device structure * * @nr_chips: max. number of chips to scan for - * @chip_offs: chip number offset + * @chip_offset: chip number offset * @nr_partitions: number of partitions pointed to by partitions (or zero) * @partitions: mtd partition list * @chip_delay: R/B delay value in us @@ -546,7 +553,7 @@ struct platform_nand_chip { * @hwcontrol: platform specific hardware control structure * @dev_ready: platform specific function to read ready/busy pin * @select_chip: platform specific chip select function - * @priv_data: private data to transport driver specific settings + * @priv: private data to transport driver specific settings * * All fields are optional and depend on the hardware driver requirements */ -- cgit v1.2.3 From ea9b6dcc152f09c207117ab121d4fa03d2db282a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 Jun 2006 21:48:38 -0700 Subject: MTD: kernel-doc fixes + additions Fix some kernel-doc typos/spellos. Use kernel-doc syntax in places where it was almost used. Correct/add struct, struct field, and function param names where needed. Signed-off-by: Randy Dunlap Signed-off-by: David Woodhouse --- Documentation/DocBook/mtdnand.tmpl | 7 ++-- include/linux/mtd/bbm.h | 35 +++++++++-------- include/linux/mtd/mtd.h | 4 +- include/linux/mtd/nand.h | 3 -- include/linux/mtd/onenand.h | 77 ++++++++++++++++++++++---------------- include/mtd/mtd-abi.h | 2 +- 6 files changed, 69 insertions(+), 59 deletions(-) (limited to 'Documentation') diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 32f385605981..630159c5bdba 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -109,7 +109,7 @@ for most of the implementations. These functions can be replaced by the board driver if neccecary. Those functions are called via pointers in the NAND chip description structure. The board driver can set the functions which - should be replaced by board dependend functions before calling nand_scan(). + should be replaced by board dependent functions before calling nand_scan(). If the function pointer is NULL on entry to nand_scan() then the pointer is set to the default function which is suitable for the detected chip type. @@ -133,7 +133,7 @@ [REPLACEABLE] Replaceable members hold hardware related functions which can be provided by the board driver. The board driver can set the functions which - should be replaced by board dependend functions before calling nand_scan(). + should be replaced by board dependent functions before calling nand_scan(). If the function pointer is NULL on entry to nand_scan() then the pointer is set to the default function which is suitable for the detected chip type. @@ -156,9 +156,8 @@ Basic board driver For most boards it will be sufficient to provide just the - basic functions and fill out some really board dependend + basic functions and fill out some really board dependent members in the nand chip description structure. - See drivers/mtd/nand/skeleton for reference. Basic defines diff --git a/include/linux/mtd/bbm.h b/include/linux/mtd/bbm.h index 7a7fbe87fef0..1221b7c44158 100644 --- a/include/linux/mtd/bbm.h +++ b/include/linux/mtd/bbm.h @@ -19,21 +19,21 @@ /** * struct nand_bbt_descr - bad block table descriptor - * @param options options for this descriptor - * @param pages the page(s) where we find the bbt, used with + * @options: options for this descriptor + * @pages: the page(s) where we find the bbt, used with * option BBT_ABSPAGE when bbt is searched, * then we store the found bbts pages here. * Its an array and supports up to 8 chips now - * @param offs offset of the pattern in the oob area of the page - * @param veroffs offset of the bbt version counter in the oob are of the page - * @param version version read from the bbt page during scan - * @param len length of the pattern, if 0 no pattern check is performed - * @param maxblocks maximum number of blocks to search for a bbt. This number of - * blocks is reserved at the end of the device + * @offs: offset of the pattern in the oob area of the page + * @veroffs: offset of the bbt version counter in the oob area of the page + * @version: version read from the bbt page during scan + * @len: length of the pattern, if 0 no pattern check is performed + * @maxblocks: maximum number of blocks to search for a bbt. This + * number of blocks is reserved at the end of the device * where the tables are written. - * @param reserved_block_code if non-0, this pattern denotes a reserved + * @reserved_block_code: if non-0, this pattern denotes a reserved * (rather than bad) block in the stored bbt - * @param pattern pattern to identify bad block table or factory marked + * @pattern: pattern to identify bad block table or factory marked * good / bad blocks, can be NULL, if len = 0 * * Descriptor for the bad block table marker and the descriptor for the @@ -93,12 +93,15 @@ struct nand_bbt_descr { #define ONENAND_BADBLOCK_POS 0 /** - * struct bbt_info - [GENERIC] Bad Block Table data structure - * @param bbt_erase_shift [INTERN] number of address bits in a bbt entry - * @param badblockpos [INTERN] position of the bad block marker in the oob area - * @param bbt [INTERN] bad block table pointer - * @param badblock_pattern [REPLACEABLE] bad block scan pattern used for initial bad block scan - * @param priv [OPTIONAL] pointer to private bbm date + * struct bbm_info - [GENERIC] Bad Block Table data structure + * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry + * @badblockpos: [INTERN] position of the bad block marker in the oob area + * @options: options for this descriptor + * @bbt: [INTERN] bad block table pointer + * @isbad_bbt: function to determine if a block is bad + * @badblock_pattern: [REPLACEABLE] bad block scan pattern used for + * initial bad block scan + * @priv: [OPTIONAL] pointer to private bbm date */ struct bbm_info { int bbt_erase_shift; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 9b7a2b525d63..94a443d45258 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -77,11 +77,11 @@ typedef enum { * * @len: number of bytes to write/read. When a data buffer is given * (datbuf != NULL) this is the number of data bytes. When - + no data buffer is available this is the number of oob bytes. + * no data buffer is available this is the number of oob bytes. * * @retlen: number of bytes written/read. When a data buffer is given * (datbuf != NULL) this is the number of data bytes. When - + no data buffer is available this is the number of oob bytes. + * no data buffer is available this is the number of oob bytes. * * @ooblen: number of oob bytes per page * @ooboffs: offset of oob data in the oob area (only relevant when diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 2266f032a8c6..0b4cd2fa64aa 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -407,7 +407,6 @@ struct nand_chip { /** * struct nand_flash_dev - NAND Flash Device ID Structure - * * @name: Identify the device type * @id: device ID code * @pagesize: Pagesize in bytes. Either 256 or 512 or 0 @@ -526,7 +525,6 @@ extern int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, /** * struct platform_nand_chip - chip level device structure - * * @nr_chips: max. number of chips to scan for * @chip_offset: chip number offset * @nr_partitions: number of partitions pointed to by partitions (or zero) @@ -549,7 +547,6 @@ struct platform_nand_chip { /** * struct platform_nand_ctrl - controller level device structure - * * @hwcontrol: platform specific hardware control structure * @dev_ready: platform specific function to read ready/busy pin * @select_chip: platform specific chip select function diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index 9ce9a48db444..1f4972155249 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -23,7 +23,7 @@ extern int onenand_scan(struct mtd_info *mtd, int max_chips); /* Free resources held by the OneNAND device */ extern void onenand_release(struct mtd_info *mtd); -/** +/* * onenand_state_t - chip states * Enumeration for OneNAND flash chip state */ @@ -42,9 +42,9 @@ typedef enum { /** * struct onenand_bufferram - OneNAND BufferRAM Data - * @param block block address in BufferRAM - * @param page page address in BufferRAM - * @param valid valid flag + * @block: block address in BufferRAM + * @page: page address in BufferRAM + * @valid: valid flag */ struct onenand_bufferram { int block; @@ -54,32 +54,43 @@ struct onenand_bufferram { /** * struct onenand_chip - OneNAND Private Flash Chip Data - * @param base [BOARDSPECIFIC] address to access OneNAND - * @param chipsize [INTERN] the size of one chip for multichip arrays - * @param device_id [INTERN] device ID - * @param verstion_id [INTERN] version ID - * @param options [BOARDSPECIFIC] various chip options. They can partly be set to inform onenand_scan about - * @param erase_shift [INTERN] number of address bits in a block - * @param page_shift [INTERN] number of address bits in a page - * @param ppb_shift [INTERN] number of address bits in a pages per block - * @param page_mask [INTERN] a page per block mask - * @param bufferam_index [INTERN] BufferRAM index - * @param bufferam [INTERN] BufferRAM info - * @param readw [REPLACEABLE] hardware specific function for read short - * @param writew [REPLACEABLE] hardware specific function for write short - * @param command [REPLACEABLE] hardware specific function for writing commands to the chip - * @param wait [REPLACEABLE] hardware specific function for wait on ready - * @param read_bufferram [REPLACEABLE] hardware specific function for BufferRAM Area - * @param write_bufferram [REPLACEABLE] hardware specific function for BufferRAM Area - * @param read_word [REPLACEABLE] hardware specific function for read register of OneNAND - * @param write_word [REPLACEABLE] hardware specific function for write register of OneNAND - * @param scan_bbt [REPLACEALBE] hardware specific function for scaning Bad block Table - * @param chip_lock [INTERN] spinlock used to protect access to this structure and the chip - * @param wq [INTERN] wait queue to sleep on if a OneNAND operation is in progress - * @param state [INTERN] the current state of the OneNAND device - * @param ecclayout [REPLACEABLE] the default ecc placement scheme - * @param bbm [REPLACEABLE] pointer to Bad Block Management - * @param priv [OPTIONAL] pointer to private chip date + * @base: [BOARDSPECIFIC] address to access OneNAND + * @chipsize: [INTERN] the size of one chip for multichip arrays + * @device_id: [INTERN] device ID + * @density_mask: chip density, used for DDP devices + * @verstion_id: [INTERN] version ID + * @options: [BOARDSPECIFIC] various chip options. They can + * partly be set to inform onenand_scan about + * @erase_shift: [INTERN] number of address bits in a block + * @page_shift: [INTERN] number of address bits in a page + * @ppb_shift: [INTERN] number of address bits in a pages per block + * @page_mask: [INTERN] a page per block mask + * @bufferram_index: [INTERN] BufferRAM index + * @bufferram: [INTERN] BufferRAM info + * @readw: [REPLACEABLE] hardware specific function for read short + * @writew: [REPLACEABLE] hardware specific function for write short + * @command: [REPLACEABLE] hardware specific function for writing + * commands to the chip + * @wait: [REPLACEABLE] hardware specific function for wait on ready + * @read_bufferram: [REPLACEABLE] hardware specific function for BufferRAM Area + * @write_bufferram: [REPLACEABLE] hardware specific function for BufferRAM Area + * @read_word: [REPLACEABLE] hardware specific function for read + * register of OneNAND + * @write_word: [REPLACEABLE] hardware specific function for write + * register of OneNAND + * @mmcontrol: sync burst read function + * @block_markbad: function to mark a block as bad + * @scan_bbt: [REPLACEALBE] hardware specific function for scanning + * Bad block Table + * @chip_lock: [INTERN] spinlock used to protect access to this + * structure and the chip + * @wq: [INTERN] wait queue to sleep on if a OneNAND + * operation is in progress + * @state: [INTERN] the current state of the OneNAND device + * @page_buf: data buffer + * @ecclayout: [REPLACEABLE] the default ecc placement scheme + * @bbm: [REPLACEABLE] pointer to Bad Block Management + * @priv: [OPTIONAL] pointer to private chip date */ struct onenand_chip { void __iomem *base; @@ -147,9 +158,9 @@ struct onenand_chip { #define ONENAND_MFR_SAMSUNG 0xec /** - * struct nand_manufacturers - NAND Flash Manufacturer ID Structure - * @param name: Manufacturer name - * @param id: manufacturer ID code of device. + * struct onenand_manufacturers - NAND Flash Manufacturer ID Structure + * @name: Manufacturer name + * @id: manufacturer ID code of device. */ struct onenand_manufacturers { int id; diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h index 31329fce1ff5..1da3f7fa7993 100644 --- a/include/mtd/mtd-abi.h +++ b/include/mtd/mtd-abi.h @@ -133,7 +133,7 @@ struct nand_ecclayout { }; /** - * struct mtd_ecc_stats - error correction status + * struct mtd_ecc_stats - error correction stats * * @corrected: number of corrected bits * @failed: number of uncorrectable errors -- cgit v1.2.3 From cbdb54d3ca986e30d0b8c1f755c3910b5573baae Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 3 Jul 2006 15:10:14 -0500 Subject: Documentation: correct values in MPC8548E SEC example node Signed-off-by: Kim Phillips Signed-off-by: Kumar Gala --- Documentation/powerpc/booting-without-of.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index 217e51768b87..3c62e66e1fcc 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt @@ -1436,9 +1436,9 @@ platforms are moved over to use the flattened-device-tree model. interrupts = <1d 3>; interrupt-parent = <40000>; num-channels = <4>; - channel-fifo-len = <24>; + channel-fifo-len = <18>; exec-units-mask = <000000fe>; - descriptor-types-mask = <073f1127>; + descriptor-types-mask = <012b0ebf>; }; -- cgit v1.2.3 From 9614634fe6a138fd8ae044950700d2af8d203f97 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 3 Jul 2006 00:24:13 -0700 Subject: [PATCH] ZVC/zone_reclaim: Leave 1% of unmapped pagecache pages for file I/O It turns out that it is advantageous to leave a small portion of unmapped file backed pages if all of a zone's pages (or almost all pages) are allocated and so the page allocator has to go off-node. This allows recently used file I/O buffers to stay on the node and reduces the times that zone reclaim is invoked if file I/O occurs when we run out of memory in a zone. The problem is that zone reclaim runs too frequently when the page cache is used for file I/O (read write and therefore unmapped pages!) alone and we have almost all pages of the zone allocated. Zone reclaim may remove 32 unmapped pages. File I/O will use these pages for the next read/write requests and the unmapped pages increase. After the zone has filled up again zone reclaim will remove it again after only 32 pages. This cycle is too inefficient and there are potentially too many zone reclaim cycles. With the 1% boundary we may still remove all unmapped pages for file I/O in zone reclaim pass. However. it will take a large number of read and writes to get back to 1% again where we trigger zone reclaim again. The zone reclaim 2.6.16/17 does not show this behavior because we have a 30 second timeout. [akpm@osdl.org: rename the /proc file and the variable] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 14 ++++++++++++++ include/linux/mmzone.h | 6 ++++++ include/linux/swap.h | 1 + include/linux/sysctl.h | 2 +- kernel/sysctl.c | 11 +++++++++++ mm/page_alloc.c | 22 ++++++++++++++++++++++ mm/vmscan.c | 27 ++++++++++++++------------- 7 files changed, 69 insertions(+), 14 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 86754eb390da..7cee90223d3a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm: - block_dump - drop-caches - zone_reclaim_mode +- min_unmapped_ratio - panic_on_oom ============================================================== @@ -168,6 +169,19 @@ in all nodes of the system. ============================================================= +min_unmapped_ratio: + +This is available only on NUMA kernels. + +A percentage of the file backed pages in each zone. Zone reclaim will only +occur if more than this percentage of pages are file backed and unmapped. +This is to insure that a minimal amount of local pages is still available for +file I/O even if the node is overallocated. + +The default is 1 percent. + +============================================================= + panic_on_oom This enables or disables panic on out-of-memory feature. If this is set to 1, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 27e748eb72b0..656b588a9f96 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -150,6 +150,10 @@ struct zone { unsigned long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA + /* + * zone reclaim becomes active if more unmapped pages exist. + */ + unsigned long min_unmapped_ratio; struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; @@ -414,6 +418,8 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. */ diff --git a/include/linux/swap.h b/include/linux/swap.h index cf6ca6e377bd..5e59184c9096 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -189,6 +189,7 @@ extern long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; +extern int sysctl_min_unmapped_ratio; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 46e4d8f2771f..e4b1a4d4dcf3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -188,7 +188,7 @@ enum VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ - VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ + VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 99a58f279077..362a0cc37138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -932,6 +932,17 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_MIN_UNMAPPED, + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_X86_32 { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e792a583f3b..54a4f5375bba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2005,6 +2005,10 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; +#ifdef CONFIG_NUMA + zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) + / 100; +#endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); @@ -2298,6 +2302,24 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA +int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_unmapped_ratio = (zone->present_pages * + sysctl_min_unmapped_ratio) / 100; + return 0; +} +#endif + /* * lowmem_reserve_ratio_sysctl_handler - just a wrapper around * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() diff --git a/mm/vmscan.c b/mm/vmscan.c index ff2ebe9458a3..5d4c4d02254d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1503,10 +1503,6 @@ module_init(kswapd_init) * * If non-zero call zone_reclaim when the number of free pages falls below * the watermarks. - * - * In the future we may add flags to the mode. However, the page allocator - * should only have to check that zone_reclaim_mode != 0 before calling - * zone_reclaim(). */ int zone_reclaim_mode __read_mostly; @@ -1523,6 +1519,12 @@ int zone_reclaim_mode __read_mostly; */ #define ZONE_RECLAIM_PRIORITY 4 +/* + * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * occur. + */ +int sysctl_min_unmapped_ratio = 1; + /* * Try to free up some pages from this zone through reclaim. */ @@ -1590,18 +1592,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; /* - * Do not reclaim if there are not enough reclaimable pages in this - * zone that would satify this allocations. + * Zone reclaim reclaims unmapped file backed pages. * - * All unmapped pagecache pages are reclaimable. - * - * Both counters may be temporarily off a bit so we use - * SWAP_CLUSTER_MAX as the boundary. It may also be good to - * leave a few frequently used unmapped pagecache pages around. + * A small portion of unmapped file backed pages is needed for + * file I/O otherwise pages read by file I/O will be immediately + * thrown out if the zone is overallocated. So we do not reclaim + * if less than a specified percentage of the zone is used by + * unmapped file backed pages. */ if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) - return 0; + zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) + return 0; /* * Avoid concurrent zone reclaims, do not reclaim in a zone that does -- cgit v1.2.3 From 55df314fbdb44c20fa7a5112d16546ee970c1d76 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:43 -0700 Subject: [PATCH] lockdep: irqtrace subsystem, docs Add Documentation/irqflags-tracing.txt. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/irqflags-tracing.txt | 57 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 Documentation/irqflags-tracing.txt (limited to 'Documentation') diff --git a/Documentation/irqflags-tracing.txt b/Documentation/irqflags-tracing.txt new file mode 100644 index 000000000000..6a444877ee0b --- /dev/null +++ b/Documentation/irqflags-tracing.txt @@ -0,0 +1,57 @@ +IRQ-flags state tracing + +started by Ingo Molnar + +the "irq-flags tracing" feature "traces" hardirq and softirq state, in +that it gives interested subsystems an opportunity to be notified of +every hardirqs-off/hardirqs-on, softirqs-off/softirqs-on event that +happens in the kernel. + +CONFIG_TRACE_IRQFLAGS_SUPPORT is needed for CONFIG_PROVE_SPIN_LOCKING +and CONFIG_PROVE_RW_LOCKING to be offered by the generic lock debugging +code. Otherwise only CONFIG_PROVE_MUTEX_LOCKING and +CONFIG_PROVE_RWSEM_LOCKING will be offered on an architecture - these +are locking APIs that are not used in IRQ context. (the one exception +for rwsems is worked around) + +architecture support for this is certainly not in the "trivial" +category, because lots of lowlevel assembly code deal with irq-flags +state changes. But an architecture can be irq-flags-tracing enabled in a +rather straightforward and risk-free manner. + +Architectures that want to support this need to do a couple of +code-organizational changes first: + +- move their irq-flags manipulation code from their asm/system.h header + to asm/irqflags.h + +- rename local_irq_disable()/etc to raw_local_irq_disable()/etc. so that + the linux/irqflags.h code can inject callbacks and can construct the + real local_irq_disable()/etc APIs. + +- add and enable TRACE_IRQFLAGS_SUPPORT in their arch level Kconfig file + +and then a couple of functional changes are needed as well to implement +irq-flags-tracing support: + +- in lowlevel entry code add (build-conditional) calls to the + trace_hardirqs_off()/trace_hardirqs_on() functions. The lock validator + closely guards whether the 'real' irq-flags matches the 'virtual' + irq-flags state, and complains loudly (and turns itself off) if the + two do not match. Usually most of the time for arch support for + irq-flags-tracing is spent in this state: look at the lockdep + complaint, try to figure out the assembly code we did not cover yet, + fix and repeat. Once the system has booted up and works without a + lockdep complaint in the irq-flags-tracing functions arch support is + complete. +- if the architecture has non-maskable interrupts then those need to be + excluded from the irq-tracing [and lock validation] mechanism via + lockdep_off()/lockdep_on(). + +in general there is no risk from having an incomplete irq-flags-tracing +implementation in an architecture: lockdep will detect that and will +turn itself off. I.e. the lock validator will still be reliable. There +should be no crashes due to irq-tracing bugs. (except if the assembly +changes break other code by modifying conditions or registers that +shouldnt be) + -- cgit v1.2.3 From cae2ed9aa573415c6e5de9a09b7ff0d74af793bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:48 -0700 Subject: [PATCH] lockdep: locking API self tests Introduce DEBUG_LOCKING_API_SELFTESTS, which uses the generic lock debugging code's silent-failure feature to run a matrix of testcases. There are 210 testcases currently: +----------------------- | Locking API testsuite: +------------------------------+------+------+------+------+------+------+ | spin |wlock |rlock |mutex | wsem | rsem | -------------------------------+------+------+------+------+------+------+ A-A deadlock: ok | ok | ok | ok | ok | ok | A-B-B-A deadlock: ok | ok | ok | ok | ok | ok | A-B-B-C-C-A deadlock: ok | ok | ok | ok | ok | ok | A-B-C-A-B-C deadlock: ok | ok | ok | ok | ok | ok | A-B-B-C-C-D-D-A deadlock: ok | ok | ok | ok | ok | ok | A-B-C-D-B-D-D-A deadlock: ok | ok | ok | ok | ok | ok | A-B-C-D-B-C-D-A deadlock: ok | ok | ok | ok | ok | ok | double unlock: ok | ok | ok | ok | ok | ok | bad unlock order: ok | ok | ok | ok | ok | ok | --------------------------------------+------+------+------+------+------+ recursive read-lock: | ok | | ok | --------------------------------------+------+------+------+------+------+ non-nested unlock: ok | ok | ok | ok | --------------------------------------+------+------+------+ hard-irqs-on + irq-safe-A/12: ok | ok | ok | soft-irqs-on + irq-safe-A/12: ok | ok | ok | hard-irqs-on + irq-safe-A/21: ok | ok | ok | soft-irqs-on + irq-safe-A/21: ok | ok | ok | sirq-safe-A => hirqs-on/12: ok | ok | ok | sirq-safe-A => hirqs-on/21: ok | ok | ok | hard-safe-A + irqs-on/12: ok | ok | ok | soft-safe-A + irqs-on/12: ok | ok | ok | hard-safe-A + irqs-on/21: ok | ok | ok | soft-safe-A + irqs-on/21: ok | ok | ok | hard-safe-A + unsafe-B #1/123: ok | ok | ok | soft-safe-A + unsafe-B #1/123: ok | ok | ok | hard-safe-A + unsafe-B #1/132: ok | ok | ok | soft-safe-A + unsafe-B #1/132: ok | ok | ok | hard-safe-A + unsafe-B #1/213: ok | ok | ok | soft-safe-A + unsafe-B #1/213: ok | ok | ok | hard-safe-A + unsafe-B #1/231: ok | ok | ok | soft-safe-A + unsafe-B #1/231: ok | ok | ok | hard-safe-A + unsafe-B #1/312: ok | ok | ok | soft-safe-A + unsafe-B #1/312: ok | ok | ok | hard-safe-A + unsafe-B #1/321: ok | ok | ok | soft-safe-A + unsafe-B #1/321: ok | ok | ok | hard-safe-A + unsafe-B #2/123: ok | ok | ok | soft-safe-A + unsafe-B #2/123: ok | ok | ok | hard-safe-A + unsafe-B #2/132: ok | ok | ok | soft-safe-A + unsafe-B #2/132: ok | ok | ok | hard-safe-A + unsafe-B #2/213: ok | ok | ok | soft-safe-A + unsafe-B #2/213: ok | ok | ok | hard-safe-A + unsafe-B #2/231: ok | ok | ok | soft-safe-A + unsafe-B #2/231: ok | ok | ok | hard-safe-A + unsafe-B #2/312: ok | ok | ok | soft-safe-A + unsafe-B #2/312: ok | ok | ok | hard-safe-A + unsafe-B #2/321: ok | ok | ok | soft-safe-A + unsafe-B #2/321: ok | ok | ok | hard-irq lock-inversion/123: ok | ok | ok | soft-irq lock-inversion/123: ok | ok | ok | hard-irq lock-inversion/132: ok | ok | ok | soft-irq lock-inversion/132: ok | ok | ok | hard-irq lock-inversion/213: ok | ok | ok | soft-irq lock-inversion/213: ok | ok | ok | hard-irq lock-inversion/231: ok | ok | ok | soft-irq lock-inversion/231: ok | ok | ok | hard-irq lock-inversion/312: ok | ok | ok | soft-irq lock-inversion/312: ok | ok | ok | hard-irq lock-inversion/321: ok | ok | ok | soft-irq lock-inversion/321: ok | ok | ok | hard-irq read-recursion/123: ok | soft-irq read-recursion/123: ok | hard-irq read-recursion/132: ok | soft-irq read-recursion/132: ok | hard-irq read-recursion/213: ok | soft-irq read-recursion/213: ok | hard-irq read-recursion/231: ok | soft-irq read-recursion/231: ok | hard-irq read-recursion/312: ok | soft-irq read-recursion/312: ok | hard-irq read-recursion/321: ok | soft-irq read-recursion/321: ok | --------------------------------+-----+---------------- Good, all 210 testcases passed! | --------------------------------+ Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 9 + lib/Kconfig.debug | 11 + lib/Makefile | 1 + lib/locking-selftest-hardirq.h | 9 + lib/locking-selftest-mutex.h | 11 + lib/locking-selftest-rlock-hardirq.h | 2 + lib/locking-selftest-rlock-softirq.h | 2 + lib/locking-selftest-rlock.h | 14 + lib/locking-selftest-rsem.h | 14 + lib/locking-selftest-softirq.h | 9 + lib/locking-selftest-spin-hardirq.h | 2 + lib/locking-selftest-spin-softirq.h | 2 + lib/locking-selftest-spin.h | 11 + lib/locking-selftest-wlock-hardirq.h | 2 + lib/locking-selftest-wlock-softirq.h | 2 + lib/locking-selftest-wlock.h | 14 + lib/locking-selftest-wsem.h | 14 + lib/locking-selftest.c | 1218 ++++++++++++++++++++++++++++++++++ 18 files changed, 1347 insertions(+) create mode 100644 lib/locking-selftest-hardirq.h create mode 100644 lib/locking-selftest-mutex.h create mode 100644 lib/locking-selftest-rlock-hardirq.h create mode 100644 lib/locking-selftest-rlock-softirq.h create mode 100644 lib/locking-selftest-rlock.h create mode 100644 lib/locking-selftest-rsem.h create mode 100644 lib/locking-selftest-softirq.h create mode 100644 lib/locking-selftest-spin-hardirq.h create mode 100644 lib/locking-selftest-spin-softirq.h create mode 100644 lib/locking-selftest-spin.h create mode 100644 lib/locking-selftest-wlock-hardirq.h create mode 100644 lib/locking-selftest-wlock-softirq.h create mode 100644 lib/locking-selftest-wlock.h create mode 100644 lib/locking-selftest-wsem.h create mode 100644 lib/locking-selftest.c (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 86e9282d1c20..149f62ba14a5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -435,6 +435,15 @@ running once the system is up. debug [KNL] Enable kernel debugging (events log level). + debug_locks_verbose= + [KNL] verbose self-tests + Format=<0|1> + Print debugging info while doing the locking API + self-tests. + We default to 0 (no extra messages), setting it to + 1 will print _a lot_ more information - normally + only useful to kernel developers. + decnet= [HW,NET] Format: [,] See also Documentation/networking/decnet.txt. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 91e338a3d069..16021b09c184 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -149,6 +149,17 @@ config DEBUG_SPINLOCK_SLEEP If you say Y here, various routines which may sleep will become very noisy if they are called with a spinlock held. +config DEBUG_LOCKING_API_SELFTESTS + bool "Locking API boot-time self-tests" + depends on DEBUG_KERNEL + help + Say Y here if you want the kernel to run a short self-test during + bootup. The self-test checks whether common types of locking bugs + are detected by debugging mechanisms or not. (if you disable + lock debugging then those bugs wont be detected of course.) + The following locking APIs are covered: spinlocks, rwlocks, + mutexes and rwsems. + config STACKTRACE bool depends on STACKTRACE_SUPPORT diff --git a/lib/Makefile b/lib/Makefile index 4f5d01922f82..be9719ae82d0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -18,6 +18,7 @@ CFLAGS_kobject.o += -DDEBUG CFLAGS_kobject_uevent.o += -DDEBUG endif +obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/lib/locking-selftest-hardirq.h b/lib/locking-selftest-hardirq.h new file mode 100644 index 000000000000..10d4a150b259 --- /dev/null +++ b/lib/locking-selftest-hardirq.h @@ -0,0 +1,9 @@ +#undef IRQ_DISABLE +#undef IRQ_ENABLE +#undef IRQ_ENTER +#undef IRQ_EXIT + +#define IRQ_ENABLE HARDIRQ_ENABLE +#define IRQ_DISABLE HARDIRQ_DISABLE +#define IRQ_ENTER HARDIRQ_ENTER +#define IRQ_EXIT HARDIRQ_EXIT diff --git a/lib/locking-selftest-mutex.h b/lib/locking-selftest-mutex.h new file mode 100644 index 000000000000..68601b6f584b --- /dev/null +++ b/lib/locking-selftest-mutex.h @@ -0,0 +1,11 @@ +#undef LOCK +#define LOCK ML + +#undef UNLOCK +#define UNLOCK MU + +#undef RLOCK +#undef WLOCK + +#undef INIT +#define INIT MI diff --git a/lib/locking-selftest-rlock-hardirq.h b/lib/locking-selftest-rlock-hardirq.h new file mode 100644 index 000000000000..9f517ebcb786 --- /dev/null +++ b/lib/locking-selftest-rlock-hardirq.h @@ -0,0 +1,2 @@ +#include "locking-selftest-rlock.h" +#include "locking-selftest-hardirq.h" diff --git a/lib/locking-selftest-rlock-softirq.h b/lib/locking-selftest-rlock-softirq.h new file mode 100644 index 000000000000..981455db7ff0 --- /dev/null +++ b/lib/locking-selftest-rlock-softirq.h @@ -0,0 +1,2 @@ +#include "locking-selftest-rlock.h" +#include "locking-selftest-softirq.h" diff --git a/lib/locking-selftest-rlock.h b/lib/locking-selftest-rlock.h new file mode 100644 index 000000000000..6789044f4d0e --- /dev/null +++ b/lib/locking-selftest-rlock.h @@ -0,0 +1,14 @@ +#undef LOCK +#define LOCK RL + +#undef UNLOCK +#define UNLOCK RU + +#undef RLOCK +#define RLOCK RL + +#undef WLOCK +#define WLOCK WL + +#undef INIT +#define INIT RWI diff --git a/lib/locking-selftest-rsem.h b/lib/locking-selftest-rsem.h new file mode 100644 index 000000000000..62da886680c7 --- /dev/null +++ b/lib/locking-selftest-rsem.h @@ -0,0 +1,14 @@ +#undef LOCK +#define LOCK RSL + +#undef UNLOCK +#define UNLOCK RSU + +#undef RLOCK +#define RLOCK RSL + +#undef WLOCK +#define WLOCK WSL + +#undef INIT +#define INIT RWSI diff --git a/lib/locking-selftest-softirq.h b/lib/locking-selftest-softirq.h new file mode 100644 index 000000000000..a83de2a04ace --- /dev/null +++ b/lib/locking-selftest-softirq.h @@ -0,0 +1,9 @@ +#undef IRQ_DISABLE +#undef IRQ_ENABLE +#undef IRQ_ENTER +#undef IRQ_EXIT + +#define IRQ_DISABLE SOFTIRQ_DISABLE +#define IRQ_ENABLE SOFTIRQ_ENABLE +#define IRQ_ENTER SOFTIRQ_ENTER +#define IRQ_EXIT SOFTIRQ_EXIT diff --git a/lib/locking-selftest-spin-hardirq.h b/lib/locking-selftest-spin-hardirq.h new file mode 100644 index 000000000000..693198dce30a --- /dev/null +++ b/lib/locking-selftest-spin-hardirq.h @@ -0,0 +1,2 @@ +#include "locking-selftest-spin.h" +#include "locking-selftest-hardirq.h" diff --git a/lib/locking-selftest-spin-softirq.h b/lib/locking-selftest-spin-softirq.h new file mode 100644 index 000000000000..c472e2a87ffc --- /dev/null +++ b/lib/locking-selftest-spin-softirq.h @@ -0,0 +1,2 @@ +#include "locking-selftest-spin.h" +#include "locking-selftest-softirq.h" diff --git a/lib/locking-selftest-spin.h b/lib/locking-selftest-spin.h new file mode 100644 index 000000000000..ccd1b4b09757 --- /dev/null +++ b/lib/locking-selftest-spin.h @@ -0,0 +1,11 @@ +#undef LOCK +#define LOCK L + +#undef UNLOCK +#define UNLOCK U + +#undef RLOCK +#undef WLOCK + +#undef INIT +#define INIT SI diff --git a/lib/locking-selftest-wlock-hardirq.h b/lib/locking-selftest-wlock-hardirq.h new file mode 100644 index 000000000000..2dd2e5122caa --- /dev/null +++ b/lib/locking-selftest-wlock-hardirq.h @@ -0,0 +1,2 @@ +#include "locking-selftest-wlock.h" +#include "locking-selftest-hardirq.h" diff --git a/lib/locking-selftest-wlock-softirq.h b/lib/locking-selftest-wlock-softirq.h new file mode 100644 index 000000000000..cb80d1cb944e --- /dev/null +++ b/lib/locking-selftest-wlock-softirq.h @@ -0,0 +1,2 @@ +#include "locking-selftest-wlock.h" +#include "locking-selftest-softirq.h" diff --git a/lib/locking-selftest-wlock.h b/lib/locking-selftest-wlock.h new file mode 100644 index 000000000000..0815322d99ed --- /dev/null +++ b/lib/locking-selftest-wlock.h @@ -0,0 +1,14 @@ +#undef LOCK +#define LOCK WL + +#undef UNLOCK +#define UNLOCK WU + +#undef RLOCK +#define RLOCK RL + +#undef WLOCK +#define WLOCK WL + +#undef INIT +#define INIT RWI diff --git a/lib/locking-selftest-wsem.h b/lib/locking-selftest-wsem.h new file mode 100644 index 000000000000..b88c5f2dc5f0 --- /dev/null +++ b/lib/locking-selftest-wsem.h @@ -0,0 +1,14 @@ +#undef LOCK +#define LOCK WSL + +#undef UNLOCK +#define UNLOCK WSU + +#undef RLOCK +#define RLOCK RSL + +#undef WLOCK +#define WLOCK WSL + +#undef INIT +#define INIT RWSI diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c new file mode 100644 index 000000000000..5cd05f20bdec --- /dev/null +++ b/lib/locking-selftest.c @@ -0,0 +1,1218 @@ +/* + * lib/locking-selftest.c + * + * Testsuite for various locking APIs: spinlocks, rwlocks, + * mutexes and rw-semaphores. + * + * It is checking both false positives and false negatives. + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Change this to 1 if you want to see the failure printouts: + */ +static unsigned int debug_locks_verbose; + +static int __init setup_debug_locks_verbose(char *str) +{ + get_option(&str, &debug_locks_verbose); + + return 1; +} + +__setup("debug_locks_verbose=", setup_debug_locks_verbose); + +#define FAILURE 0 +#define SUCCESS 1 + +#define LOCKTYPE_SPIN 0x1 +#define LOCKTYPE_RWLOCK 0x2 +#define LOCKTYPE_MUTEX 0x4 +#define LOCKTYPE_RWSEM 0x8 + +/* + * Normal standalone locks, for the circular and irq-context + * dependency tests: + */ +static DEFINE_SPINLOCK(lock_A); +static DEFINE_SPINLOCK(lock_B); +static DEFINE_SPINLOCK(lock_C); +static DEFINE_SPINLOCK(lock_D); + +static DEFINE_RWLOCK(rwlock_A); +static DEFINE_RWLOCK(rwlock_B); +static DEFINE_RWLOCK(rwlock_C); +static DEFINE_RWLOCK(rwlock_D); + +static DEFINE_MUTEX(mutex_A); +static DEFINE_MUTEX(mutex_B); +static DEFINE_MUTEX(mutex_C); +static DEFINE_MUTEX(mutex_D); + +static DECLARE_RWSEM(rwsem_A); +static DECLARE_RWSEM(rwsem_B); +static DECLARE_RWSEM(rwsem_C); +static DECLARE_RWSEM(rwsem_D); + +/* + * Locks that we initialize dynamically as well so that + * e.g. X1 and X2 becomes two instances of the same class, + * but X* and Y* are different classes. We do this so that + * we do not trigger a real lockup: + */ +static DEFINE_SPINLOCK(lock_X1); +static DEFINE_SPINLOCK(lock_X2); +static DEFINE_SPINLOCK(lock_Y1); +static DEFINE_SPINLOCK(lock_Y2); +static DEFINE_SPINLOCK(lock_Z1); +static DEFINE_SPINLOCK(lock_Z2); + +static DEFINE_RWLOCK(rwlock_X1); +static DEFINE_RWLOCK(rwlock_X2); +static DEFINE_RWLOCK(rwlock_Y1); +static DEFINE_RWLOCK(rwlock_Y2); +static DEFINE_RWLOCK(rwlock_Z1); +static DEFINE_RWLOCK(rwlock_Z2); + +static DEFINE_MUTEX(mutex_X1); +static DEFINE_MUTEX(mutex_X2); +static DEFINE_MUTEX(mutex_Y1); +static DEFINE_MUTEX(mutex_Y2); +static DEFINE_MUTEX(mutex_Z1); +static DEFINE_MUTEX(mutex_Z2); + +static DECLARE_RWSEM(rwsem_X1); +static DECLARE_RWSEM(rwsem_X2); +static DECLARE_RWSEM(rwsem_Y1); +static DECLARE_RWSEM(rwsem_Y2); +static DECLARE_RWSEM(rwsem_Z1); +static DECLARE_RWSEM(rwsem_Z2); + +/* + * non-inlined runtime initializers, to let separate locks share + * the same lock-class: + */ +#define INIT_CLASS_FUNC(class) \ +static noinline void \ +init_class_##class(spinlock_t *lock, rwlock_t *rwlock, struct mutex *mutex, \ + struct rw_semaphore *rwsem) \ +{ \ + spin_lock_init(lock); \ + rwlock_init(rwlock); \ + mutex_init(mutex); \ + init_rwsem(rwsem); \ +} + +INIT_CLASS_FUNC(X) +INIT_CLASS_FUNC(Y) +INIT_CLASS_FUNC(Z) + +static void init_shared_classes(void) +{ + init_class_X(&lock_X1, &rwlock_X1, &mutex_X1, &rwsem_X1); + init_class_X(&lock_X2, &rwlock_X2, &mutex_X2, &rwsem_X2); + + init_class_Y(&lock_Y1, &rwlock_Y1, &mutex_Y1, &rwsem_Y1); + init_class_Y(&lock_Y2, &rwlock_Y2, &mutex_Y2, &rwsem_Y2); + + init_class_Z(&lock_Z1, &rwlock_Z1, &mutex_Z1, &rwsem_Z1); + init_class_Z(&lock_Z2, &rwlock_Z2, &mutex_Z2, &rwsem_Z2); +} + +/* + * For spinlocks and rwlocks we also do hardirq-safe / softirq-safe tests. + * The following functions use a lock from a simulated hardirq/softirq + * context, causing the locks to be marked as hardirq-safe/softirq-safe: + */ + +#define HARDIRQ_DISABLE local_irq_disable +#define HARDIRQ_ENABLE local_irq_enable + +#define HARDIRQ_ENTER() \ + local_irq_disable(); \ + irq_enter(); \ + WARN_ON(!in_irq()); + +#define HARDIRQ_EXIT() \ + __irq_exit(); \ + local_irq_enable(); + +#define SOFTIRQ_DISABLE local_bh_disable +#define SOFTIRQ_ENABLE local_bh_enable + +#define SOFTIRQ_ENTER() \ + local_bh_disable(); \ + local_irq_disable(); \ + trace_softirq_enter(); \ + WARN_ON(!in_softirq()); + +#define SOFTIRQ_EXIT() \ + trace_softirq_exit(); \ + local_irq_enable(); \ + local_bh_enable(); + +/* + * Shortcuts for lock/unlock API variants, to keep + * the testcases compact: + */ +#define L(x) spin_lock(&lock_##x) +#define U(x) spin_unlock(&lock_##x) +#define LU(x) L(x); U(x) +#define SI(x) spin_lock_init(&lock_##x) + +#define WL(x) write_lock(&rwlock_##x) +#define WU(x) write_unlock(&rwlock_##x) +#define WLU(x) WL(x); WU(x) + +#define RL(x) read_lock(&rwlock_##x) +#define RU(x) read_unlock(&rwlock_##x) +#define RLU(x) RL(x); RU(x) +#define RWI(x) rwlock_init(&rwlock_##x) + +#define ML(x) mutex_lock(&mutex_##x) +#define MU(x) mutex_unlock(&mutex_##x) +#define MI(x) mutex_init(&mutex_##x) + +#define WSL(x) down_write(&rwsem_##x) +#define WSU(x) up_write(&rwsem_##x) + +#define RSL(x) down_read(&rwsem_##x) +#define RSU(x) up_read(&rwsem_##x) +#define RWSI(x) init_rwsem(&rwsem_##x) + +#define LOCK_UNLOCK_2(x,y) LOCK(x); LOCK(y); UNLOCK(y); UNLOCK(x) + +/* + * Generate different permutations of the same testcase, using + * the same basic lock-dependency/state events: + */ + +#define GENERATE_TESTCASE(name) \ + \ +static void name(void) { E(); } + +#define GENERATE_PERMUTATIONS_2_EVENTS(name) \ + \ +static void name##_12(void) { E1(); E2(); } \ +static void name##_21(void) { E2(); E1(); } + +#define GENERATE_PERMUTATIONS_3_EVENTS(name) \ + \ +static void name##_123(void) { E1(); E2(); E3(); } \ +static void name##_132(void) { E1(); E3(); E2(); } \ +static void name##_213(void) { E2(); E1(); E3(); } \ +static void name##_231(void) { E2(); E3(); E1(); } \ +static void name##_312(void) { E3(); E1(); E2(); } \ +static void name##_321(void) { E3(); E2(); E1(); } + +/* + * AA deadlock: + */ + +#define E() \ + \ + LOCK(X1); \ + LOCK(X2); /* this one should fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(AA_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(AA_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(AA_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(AA_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(AA_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(AA_rsem) + +#undef E + +/* + * Special-case for read-locking, they are + * allowed to recurse on the same lock instance: + */ +static void rlock_AA1(void) +{ + RL(X1); + RL(X1); // this one should NOT fail +} + +static void rlock_AA1B(void) +{ + RL(X1); + RL(X2); // this one should fail +} + +static void rsem_AA1(void) +{ + RSL(X1); + RSL(X1); // this one should fail +} + +static void rsem_AA1B(void) +{ + RSL(X1); + RSL(X2); // this one should fail +} +/* + * The mixing of read and write locks is not allowed: + */ +static void rlock_AA2(void) +{ + RL(X1); + WL(X2); // this one should fail +} + +static void rsem_AA2(void) +{ + RSL(X1); + WSL(X2); // this one should fail +} + +static void rlock_AA3(void) +{ + WL(X1); + RL(X2); // this one should fail +} + +static void rsem_AA3(void) +{ + WSL(X1); + RSL(X2); // this one should fail +} + +/* + * ABBA deadlock: + */ + +#define E() \ + \ + LOCK_UNLOCK_2(A, B); \ + LOCK_UNLOCK_2(B, A); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(ABBA_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(ABBA_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(ABBA_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(ABBA_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(ABBA_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(ABBA_rsem) + +#undef E + +/* + * AB BC CA deadlock: + */ + +#define E() \ + \ + LOCK_UNLOCK_2(A, B); \ + LOCK_UNLOCK_2(B, C); \ + LOCK_UNLOCK_2(C, A); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(ABBCCA_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(ABBCCA_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(ABBCCA_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(ABBCCA_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(ABBCCA_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(ABBCCA_rsem) + +#undef E + +/* + * AB CA BC deadlock: + */ + +#define E() \ + \ + LOCK_UNLOCK_2(A, B); \ + LOCK_UNLOCK_2(C, A); \ + LOCK_UNLOCK_2(B, C); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(ABCABC_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(ABCABC_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(ABCABC_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(ABCABC_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(ABCABC_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(ABCABC_rsem) + +#undef E + +/* + * AB BC CD DA deadlock: + */ + +#define E() \ + \ + LOCK_UNLOCK_2(A, B); \ + LOCK_UNLOCK_2(B, C); \ + LOCK_UNLOCK_2(C, D); \ + LOCK_UNLOCK_2(D, A); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(ABBCCDDA_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(ABBCCDDA_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(ABBCCDDA_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(ABBCCDDA_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(ABBCCDDA_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(ABBCCDDA_rsem) + +#undef E + +/* + * AB CD BD DA deadlock: + */ +#define E() \ + \ + LOCK_UNLOCK_2(A, B); \ + LOCK_UNLOCK_2(C, D); \ + LOCK_UNLOCK_2(B, D); \ + LOCK_UNLOCK_2(D, A); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(ABCDBDDA_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(ABCDBDDA_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(ABCDBDDA_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(ABCDBDDA_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(ABCDBDDA_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(ABCDBDDA_rsem) + +#undef E + +/* + * AB CD BC DA deadlock: + */ +#define E() \ + \ + LOCK_UNLOCK_2(A, B); \ + LOCK_UNLOCK_2(C, D); \ + LOCK_UNLOCK_2(B, C); \ + LOCK_UNLOCK_2(D, A); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(ABCDBCDA_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(ABCDBCDA_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(ABCDBCDA_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(ABCDBCDA_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(ABCDBCDA_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(ABCDBCDA_rsem) + +#undef E + +/* + * Double unlock: + */ +#define E() \ + \ + LOCK(A); \ + UNLOCK(A); \ + UNLOCK(A); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(double_unlock_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(double_unlock_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(double_unlock_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(double_unlock_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(double_unlock_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(double_unlock_rsem) + +#undef E + +/* + * Bad unlock ordering: + */ +#define E() \ + \ + LOCK(A); \ + LOCK(B); \ + UNLOCK(A); /* fail */ \ + UNLOCK(B); + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(bad_unlock_order_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(bad_unlock_order_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(bad_unlock_order_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(bad_unlock_order_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(bad_unlock_order_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(bad_unlock_order_rsem) + +#undef E + +/* + * initializing a held lock: + */ +#define E() \ + \ + LOCK(A); \ + INIT(A); /* fail */ + +/* + * 6 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_TESTCASE(init_held_spin) +#include "locking-selftest-wlock.h" +GENERATE_TESTCASE(init_held_wlock) +#include "locking-selftest-rlock.h" +GENERATE_TESTCASE(init_held_rlock) +#include "locking-selftest-mutex.h" +GENERATE_TESTCASE(init_held_mutex) +#include "locking-selftest-wsem.h" +GENERATE_TESTCASE(init_held_wsem) +#include "locking-selftest-rsem.h" +GENERATE_TESTCASE(init_held_rsem) + +#undef E + +/* + * locking an irq-safe lock with irqs enabled: + */ +#define E1() \ + \ + IRQ_ENTER(); \ + LOCK(A); \ + UNLOCK(A); \ + IRQ_EXIT(); + +#define E2() \ + \ + LOCK(A); \ + UNLOCK(A); + +/* + * Generate 24 testcases: + */ +#include "locking-selftest-spin-hardirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin) + +#include "locking-selftest-rlock-hardirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) + +#include "locking-selftest-wlock-hardirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_wlock) + +#include "locking-selftest-spin-softirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_spin) + +#include "locking-selftest-rlock-softirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) + +#include "locking-selftest-wlock-softirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) + +#undef E1 +#undef E2 + +/* + * Enabling hardirqs with a softirq-safe lock held: + */ +#define E1() \ + \ + SOFTIRQ_ENTER(); \ + LOCK(A); \ + UNLOCK(A); \ + SOFTIRQ_EXIT(); + +#define E2() \ + \ + HARDIRQ_DISABLE(); \ + LOCK(A); \ + HARDIRQ_ENABLE(); \ + UNLOCK(A); + +/* + * Generate 12 testcases: + */ +#include "locking-selftest-spin.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_spin) + +#include "locking-selftest-wlock.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_wlock) + +#include "locking-selftest-rlock.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) + +#undef E1 +#undef E2 + +/* + * Enabling irqs with an irq-safe lock held: + */ +#define E1() \ + \ + IRQ_ENTER(); \ + LOCK(A); \ + UNLOCK(A); \ + IRQ_EXIT(); + +#define E2() \ + \ + IRQ_DISABLE(); \ + LOCK(A); \ + IRQ_ENABLE(); \ + UNLOCK(A); + +/* + * Generate 24 testcases: + */ +#include "locking-selftest-spin-hardirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin) + +#include "locking-selftest-rlock-hardirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) + +#include "locking-selftest-wlock-hardirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_wlock) + +#include "locking-selftest-spin-softirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_spin) + +#include "locking-selftest-rlock-softirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) + +#include "locking-selftest-wlock-softirq.h" +GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) + +#undef E1 +#undef E2 + +/* + * Acquiring a irq-unsafe lock while holding an irq-safe-lock: + */ +#define E1() \ + \ + LOCK(A); \ + LOCK(B); \ + UNLOCK(B); \ + UNLOCK(A); \ + +#define E2() \ + \ + LOCK(B); \ + UNLOCK(B); + +#define E3() \ + \ + IRQ_ENTER(); \ + LOCK(A); \ + UNLOCK(A); \ + IRQ_EXIT(); + +/* + * Generate 36 testcases: + */ +#include "locking-selftest-spin-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin) + +#include "locking-selftest-rlock-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) + +#include "locking-selftest-wlock-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_wlock) + +#include "locking-selftest-spin-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_spin) + +#include "locking-selftest-rlock-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) + +#include "locking-selftest-wlock-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) + +#undef E1 +#undef E2 +#undef E3 + +/* + * If a lock turns into softirq-safe, but earlier it took + * a softirq-unsafe lock: + */ + +#define E1() \ + IRQ_DISABLE(); \ + LOCK(A); \ + LOCK(B); \ + UNLOCK(B); \ + UNLOCK(A); \ + IRQ_ENABLE(); + +#define E2() \ + LOCK(B); \ + UNLOCK(B); + +#define E3() \ + IRQ_ENTER(); \ + LOCK(A); \ + UNLOCK(A); \ + IRQ_EXIT(); + +/* + * Generate 36 testcases: + */ +#include "locking-selftest-spin-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin) + +#include "locking-selftest-rlock-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) + +#include "locking-selftest-wlock-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_wlock) + +#include "locking-selftest-spin-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_spin) + +#include "locking-selftest-rlock-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) + +#include "locking-selftest-wlock-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) + +#undef E1 +#undef E2 +#undef E3 + +/* + * read-lock / write-lock irq inversion. + * + * Deadlock scenario: + * + * CPU#1 is at #1, i.e. it has write-locked A, but has not + * taken B yet. + * + * CPU#2 is at #2, i.e. it has locked B. + * + * Hardirq hits CPU#2 at point #2 and is trying to read-lock A. + * + * The deadlock occurs because CPU#1 will spin on B, and CPU#2 + * will spin on A. + */ + +#define E1() \ + \ + IRQ_DISABLE(); \ + WL(A); \ + LOCK(B); \ + UNLOCK(B); \ + WU(A); \ + IRQ_ENABLE(); + +#define E2() \ + \ + LOCK(B); \ + UNLOCK(B); + +#define E3() \ + \ + IRQ_ENTER(); \ + RL(A); \ + RU(A); \ + IRQ_EXIT(); + +/* + * Generate 36 testcases: + */ +#include "locking-selftest-spin-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_spin) + +#include "locking-selftest-rlock-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_rlock) + +#include "locking-selftest-wlock-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_wlock) + +#include "locking-selftest-spin-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_spin) + +#include "locking-selftest-rlock-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_rlock) + +#include "locking-selftest-wlock-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) + +#undef E1 +#undef E2 +#undef E3 + +/* + * read-lock / write-lock recursion that is actually safe. + */ + +#define E1() \ + \ + IRQ_DISABLE(); \ + WL(A); \ + WU(A); \ + IRQ_ENABLE(); + +#define E2() \ + \ + RL(A); \ + RU(A); \ + +#define E3() \ + \ + IRQ_ENTER(); \ + RL(A); \ + L(B); \ + U(B); \ + RU(A); \ + IRQ_EXIT(); + +/* + * Generate 12 testcases: + */ +#include "locking-selftest-hardirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard) + +#include "locking-selftest-softirq.h" +GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) + +#undef E1 +#undef E2 +#undef E3 + +/* + * read-lock / write-lock recursion that is unsafe. + */ + +#define E1() \ + \ + IRQ_DISABLE(); \ + L(B); \ + WL(A); \ + WU(A); \ + U(B); \ + IRQ_ENABLE(); + +#define E2() \ + \ + RL(A); \ + RU(A); \ + +#define E3() \ + \ + IRQ_ENTER(); \ + L(B); \ + U(B); \ + IRQ_EXIT(); + +/* + * Generate 12 testcases: + */ +#include "locking-selftest-hardirq.h" +// GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard) + +#include "locking-selftest-softirq.h" +// GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft) + +#define lockdep_reset() +#define lockdep_reset_lock(x) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) +# define I_RWLOCK(x) lockdep_reset_lock(&rwlock_##x.dep_map) +# define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map) +# define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map) +#else +# define I_SPINLOCK(x) +# define I_RWLOCK(x) +# define I_MUTEX(x) +# define I_RWSEM(x) +#endif + +#define I1(x) \ + do { \ + I_SPINLOCK(x); \ + I_RWLOCK(x); \ + I_MUTEX(x); \ + I_RWSEM(x); \ + } while (0) + +#define I2(x) \ + do { \ + spin_lock_init(&lock_##x); \ + rwlock_init(&rwlock_##x); \ + mutex_init(&mutex_##x); \ + init_rwsem(&rwsem_##x); \ + } while (0) + +static void reset_locks(void) +{ + local_irq_disable(); + I1(A); I1(B); I1(C); I1(D); + I1(X1); I1(X2); I1(Y1); I1(Y2); I1(Z1); I1(Z2); + lockdep_reset(); + I2(A); I2(B); I2(C); I2(D); + init_shared_classes(); + local_irq_enable(); +} + +#undef I + +static int testcase_total; +static int testcase_successes; +static int expected_testcase_failures; +static int unexpected_testcase_failures; + +static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) +{ + unsigned long saved_preempt_count = preempt_count(); + int expected_failure = 0; + + WARN_ON(irqs_disabled()); + + testcase_fn(); + /* + * Filter out expected failures: + */ +#ifndef CONFIG_PROVE_LOCKING + if ((lockclass_mask & LOCKTYPE_SPIN) && debug_locks != expected) + expected_failure = 1; + if ((lockclass_mask & LOCKTYPE_RWLOCK) && debug_locks != expected) + expected_failure = 1; + if ((lockclass_mask & LOCKTYPE_MUTEX) && debug_locks != expected) + expected_failure = 1; + if ((lockclass_mask & LOCKTYPE_RWSEM) && debug_locks != expected) + expected_failure = 1; +#endif + if (debug_locks != expected) { + if (expected_failure) { + expected_testcase_failures++; + printk("failed|"); + } else { + unexpected_testcase_failures++; + printk("FAILED|"); + } + } else { + testcase_successes++; + printk(" ok |"); + } + testcase_total++; + + if (debug_locks_verbose) + printk(" lockclass mask: %x, debug_locks: %d, expected: %d\n", + lockclass_mask, debug_locks, expected); + /* + * Some tests (e.g. double-unlock) might corrupt the preemption + * count, so restore it: + */ + preempt_count() = saved_preempt_count; +#ifdef CONFIG_TRACE_IRQFLAGS + if (softirq_count()) + current->softirqs_enabled = 0; + else + current->softirqs_enabled = 1; +#endif + + reset_locks(); +} + +static inline void print_testname(const char *testname) +{ + printk("%33s:", testname); +} + +#define DO_TESTCASE_1(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + printk("\n"); + +#define DO_TESTCASE_1B(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + printk("\n"); + +#define DO_TESTCASE_3(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + printk("\n"); + +#define DO_TESTCASE_3RW(desc, name, nr) \ + print_testname(desc"/"#nr); \ + dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ + dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ + printk("\n"); + +#define DO_TESTCASE_6(desc, name) \ + print_testname(desc); \ + dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ + dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ + dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ + printk("\n"); + +#define DO_TESTCASE_6_SUCCESS(desc, name) \ + print_testname(desc); \ + dotest(name##_spin, SUCCESS, LOCKTYPE_SPIN); \ + dotest(name##_wlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX); \ + dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM); \ + dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM); \ + printk("\n"); + +/* + * 'read' variant: rlocks must not trigger. + */ +#define DO_TESTCASE_6R(desc, name) \ + print_testname(desc); \ + dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ + dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ + dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ + dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ + dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ + printk("\n"); + +#define DO_TESTCASE_2I(desc, name, nr) \ + DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ + DO_TESTCASE_1("soft-"desc, name##_soft, nr); + +#define DO_TESTCASE_2IB(desc, name, nr) \ + DO_TESTCASE_1B("hard-"desc, name##_hard, nr); \ + DO_TESTCASE_1B("soft-"desc, name##_soft, nr); + +#define DO_TESTCASE_6I(desc, name, nr) \ + DO_TESTCASE_3("hard-"desc, name##_hard, nr); \ + DO_TESTCASE_3("soft-"desc, name##_soft, nr); + +#define DO_TESTCASE_6IRW(desc, name, nr) \ + DO_TESTCASE_3RW("hard-"desc, name##_hard, nr); \ + DO_TESTCASE_3RW("soft-"desc, name##_soft, nr); + +#define DO_TESTCASE_2x3(desc, name) \ + DO_TESTCASE_3(desc, name, 12); \ + DO_TESTCASE_3(desc, name, 21); + +#define DO_TESTCASE_2x6(desc, name) \ + DO_TESTCASE_6I(desc, name, 12); \ + DO_TESTCASE_6I(desc, name, 21); + +#define DO_TESTCASE_6x2(desc, name) \ + DO_TESTCASE_2I(desc, name, 123); \ + DO_TESTCASE_2I(desc, name, 132); \ + DO_TESTCASE_2I(desc, name, 213); \ + DO_TESTCASE_2I(desc, name, 231); \ + DO_TESTCASE_2I(desc, name, 312); \ + DO_TESTCASE_2I(desc, name, 321); + +#define DO_TESTCASE_6x2B(desc, name) \ + DO_TESTCASE_2IB(desc, name, 123); \ + DO_TESTCASE_2IB(desc, name, 132); \ + DO_TESTCASE_2IB(desc, name, 213); \ + DO_TESTCASE_2IB(desc, name, 231); \ + DO_TESTCASE_2IB(desc, name, 312); \ + DO_TESTCASE_2IB(desc, name, 321); + +#define DO_TESTCASE_6x6(desc, name) \ + DO_TESTCASE_6I(desc, name, 123); \ + DO_TESTCASE_6I(desc, name, 132); \ + DO_TESTCASE_6I(desc, name, 213); \ + DO_TESTCASE_6I(desc, name, 231); \ + DO_TESTCASE_6I(desc, name, 312); \ + DO_TESTCASE_6I(desc, name, 321); + +#define DO_TESTCASE_6x6RW(desc, name) \ + DO_TESTCASE_6IRW(desc, name, 123); \ + DO_TESTCASE_6IRW(desc, name, 132); \ + DO_TESTCASE_6IRW(desc, name, 213); \ + DO_TESTCASE_6IRW(desc, name, 231); \ + DO_TESTCASE_6IRW(desc, name, 312); \ + DO_TESTCASE_6IRW(desc, name, 321); + + +void locking_selftest(void) +{ + /* + * Got a locking failure before the selftest ran? + */ + if (!debug_locks) { + printk("----------------------------------\n"); + printk("| Locking API testsuite disabled |\n"); + printk("----------------------------------\n"); + return; + } + + /* + * Run the testsuite: + */ + printk("------------------------\n"); + printk("| Locking API testsuite:\n"); + printk("----------------------------------------------------------------------------\n"); + printk(" | spin |wlock |rlock |mutex | wsem | rsem |\n"); + printk(" --------------------------------------------------------------------------\n"); + + init_shared_classes(); + debug_locks_silent = !debug_locks_verbose; + + DO_TESTCASE_6("A-A deadlock", AA); + DO_TESTCASE_6R("A-B-B-A deadlock", ABBA); + DO_TESTCASE_6R("A-B-B-C-C-A deadlock", ABBCCA); + DO_TESTCASE_6R("A-B-C-A-B-C deadlock", ABCABC); + DO_TESTCASE_6R("A-B-B-C-C-D-D-A deadlock", ABBCCDDA); + DO_TESTCASE_6R("A-B-C-D-B-D-D-A deadlock", ABCDBDDA); + DO_TESTCASE_6R("A-B-C-D-B-C-D-A deadlock", ABCDBCDA); + DO_TESTCASE_6("double unlock", double_unlock); + DO_TESTCASE_6("initialize held", init_held); + DO_TESTCASE_6_SUCCESS("bad unlock order", bad_unlock_order); + + printk(" --------------------------------------------------------------------------\n"); + print_testname("recursive read-lock"); + printk(" |"); + dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK); + printk(" |"); + dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM); + printk("\n"); + + print_testname("recursive read-lock #2"); + printk(" |"); + dotest(rlock_AA1B, FAILURE, LOCKTYPE_RWLOCK); + printk(" |"); + dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM); + printk("\n"); + + print_testname("mixed read-write-lock"); + printk(" |"); + dotest(rlock_AA2, FAILURE, LOCKTYPE_RWLOCK); + printk(" |"); + dotest(rsem_AA2, FAILURE, LOCKTYPE_RWSEM); + printk("\n"); + + print_testname("mixed write-read-lock"); + printk(" |"); + dotest(rlock_AA3, FAILURE, LOCKTYPE_RWLOCK); + printk(" |"); + dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); + printk("\n"); + + printk(" --------------------------------------------------------------------------\n"); + + /* + * irq-context testcases: + */ + DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); + DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); + DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); + DO_TESTCASE_6x6("safe-A + unsafe-B #1", irqsafe3); + DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); + DO_TESTCASE_6x6RW("irq lock-inversion", irq_inversion); + + DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); +// DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); + + if (unexpected_testcase_failures) { + printk("-----------------------------------------------------------------\n"); + debug_locks = 0; + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------------------------------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + debug_locks = 1; + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------------------------------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + debug_locks = 1; + } else { + printk("-------------------------------------------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + debug_locks = 1; + } + debug_locks_silent = 0; +} -- cgit v1.2.3 From f3e97da38e1d69d24195d76f96b912323f5ee30c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Jul 2006 00:24:52 -0700 Subject: [PATCH] lockdep: design docs Lock validator design documentation. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/lockdep-design.txt | 197 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 Documentation/lockdep-design.txt (limited to 'Documentation') diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt new file mode 100644 index 000000000000..00d93605bfd3 --- /dev/null +++ b/Documentation/lockdep-design.txt @@ -0,0 +1,197 @@ +Runtime locking correctness validator +===================================== + +started by Ingo Molnar +additions by Arjan van de Ven + +Lock-class +---------- + +The basic object the validator operates upon is a 'class' of locks. + +A class of locks is a group of locks that are logically the same with +respect to locking rules, even if the locks may have multiple (possibly +tens of thousands of) instantiations. For example a lock in the inode +struct is one class, while each inode has its own instantiation of that +lock class. + +The validator tracks the 'state' of lock-classes, and it tracks +dependencies between different lock-classes. The validator maintains a +rolling proof that the state and the dependencies are correct. + +Unlike an lock instantiation, the lock-class itself never goes away: when +a lock-class is used for the first time after bootup it gets registered, +and all subsequent uses of that lock-class will be attached to this +lock-class. + +State +----- + +The validator tracks lock-class usage history into 5 separate state bits: + +- 'ever held in hardirq context' [ == hardirq-safe ] +- 'ever held in softirq context' [ == softirq-safe ] +- 'ever held with hardirqs enabled' [ == hardirq-unsafe ] +- 'ever held with softirqs and hardirqs enabled' [ == softirq-unsafe ] + +- 'ever used' [ == !unused ] + +Single-lock state rules: +------------------------ + +A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The +following states are exclusive, and only one of them is allowed to be +set for any lock-class: + + and + and + +The validator detects and reports lock usage that violate these +single-lock state rules. + +Multi-lock dependency rules: +---------------------------- + +The same lock-class must not be acquired twice, because this could lead +to lock recursion deadlocks. + +Furthermore, two locks may not be taken in different order: + + -> + -> + +because this could lead to lock inversion deadlocks. (The validator +finds such dependencies in arbitrary complexity, i.e. there can be any +other locking sequence between the acquire-lock operations, the +validator will still track all dependencies between locks.) + +Furthermore, the following usage based lock dependencies are not allowed +between any two lock-classes: + + -> + -> + +The first rule comes from the fact the a hardirq-safe lock could be +taken by a hardirq context, interrupting a hardirq-unsafe lock - and +thus could result in a lock inversion deadlock. Likewise, a softirq-safe +lock could be taken by an softirq context, interrupting a softirq-unsafe +lock. + +The above rules are enforced for any locking sequence that occurs in the +kernel: when acquiring a new lock, the validator checks whether there is +any rule violation between the new lock and any of the held locks. + +When a lock-class changes its state, the following aspects of the above +dependency rules are enforced: + +- if a new hardirq-safe lock is discovered, we check whether it + took any hardirq-unsafe lock in the past. + +- if a new softirq-safe lock is discovered, we check whether it took + any softirq-unsafe lock in the past. + +- if a new hardirq-unsafe lock is discovered, we check whether any + hardirq-safe lock took it in the past. + +- if a new softirq-unsafe lock is discovered, we check whether any + softirq-safe lock took it in the past. + +(Again, we do these checks too on the basis that an interrupt context +could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which +could lead to a lock inversion deadlock - even if that lock scenario did +not trigger in practice yet.) + +Exception: Nested data dependencies leading to nested locking +------------------------------------------------------------- + +There are a few cases where the Linux kernel acquires more than one +instance of the same lock-class. Such cases typically happen when there +is some sort of hierarchy within objects of the same type. In these +cases there is an inherent "natural" ordering between the two objects +(defined by the properties of the hierarchy), and the kernel grabs the +locks in this fixed order on each of the objects. + +An example of such an object hieararchy that results in "nested locking" +is that of a "whole disk" block-dev object and a "partition" block-dev +object; the partition is "part of" the whole device and as long as one +always takes the whole disk lock as a higher lock than the partition +lock, the lock ordering is fully correct. The validator does not +automatically detect this natural ordering, as the locking rule behind +the ordering is not static. + +In order to teach the validator about this correct usage model, new +versions of the various locking primitives were added that allow you to +specify a "nesting level". An example call, for the block device mutex, +looks like this: + +enum bdev_bd_mutex_lock_class +{ + BD_MUTEX_NORMAL, + BD_MUTEX_WHOLE, + BD_MUTEX_PARTITION +}; + + mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION); + +In this case the locking is done on a bdev object that is known to be a +partition. + +The validator treats a lock that is taken in such a nested fasion as a +separate (sub)class for the purposes of validation. + +Note: When changing code to use the _nested() primitives, be careful and +check really thoroughly that the hiearchy is correctly mapped; otherwise +you can get false positives or false negatives. + +Proof of 100% correctness: +-------------------------- + +The validator achieves perfect, mathematical 'closure' (proof of locking +correctness) in the sense that for every simple, standalone single-task +locking sequence that occured at least once during the lifetime of the +kernel, the validator proves it with a 100% certainty that no +combination and timing of these locking sequences can cause any class of +lock related deadlock. [*] + +I.e. complex multi-CPU and multi-task locking scenarios do not have to +occur in practice to prove a deadlock: only the simple 'component' +locking chains have to occur at least once (anytime, in any +task/context) for the validator to be able to prove correctness. (For +example, complex deadlocks that would normally need more than 3 CPUs and +a very unlikely constellation of tasks, irq-contexts and timings to +occur, can be detected on a plain, lightly loaded single-CPU system as +well!) + +This radically decreases the complexity of locking related QA of the +kernel: what has to be done during QA is to trigger as many "simple" +single-task locking dependencies in the kernel as possible, at least +once, to prove locking correctness - instead of having to trigger every +possible combination of locking interaction between CPUs, combined with +every possible hardirq and softirq nesting scenario (which is impossible +to do in practice). + +[*] assuming that the validator itself is 100% correct, and no other + part of the system corrupts the state of the validator in any way. + We also assume that all NMI/SMM paths [which could interrupt + even hardirq-disabled codepaths] are correct and do not interfere + with the validator. We also assume that the 64-bit 'chain hash' + value is unique for every lock-chain in the system. Also, lock + recursion must not be higher than 20. + +Performance: +------------ + +The above rules require _massive_ amounts of runtime checking. If we did +that for every lock taken and for every irqs-enable event, it would +render the system practically unusably slow. The complexity of checking +is O(N^2), so even with just a few hundred lock-classes we'd have to do +tens of thousands of checks for every event. + +This problem is solved by checking any given 'locking scenario' (unique +sequence of locks taken after each other) only once. A simple stack of +held locks is maintained, and a lightweight 64-bit hash value is +calculated, which hash is unique for every lock chain. The hash value, +when the chain is validated for the first time, is then put into a hash +table, which hash-table can be checked in a lockfree manner. If the +locking chain occurs again later on, the hash table tells us that we +dont have to validate the chain again. -- cgit v1.2.3 From 6ce1669fdb6b0a0faf9b2e2ba08048b520c57841 Mon Sep 17 00:00:00 2001 From: Horms Date: Mon, 3 Jul 2006 19:35:40 -0700 Subject: [IPVS]: Add sysctl documentation * Derived from http://www.linuxvirtualserver.org/docs/sysctl.html, v1.4 maintained by Wensong Zhang * Adjusted preample to match ip-sysctl.txt * Sorted options into alphabetical order * Added expire_quiescent_template * Removed timeout_* which are no longer present * Incoporated doc/debug-levels.txt from IPVS source tree into description of ipvs_debug * Minor spelling fixes * Further editing more than welcome Signed-Off-By: Horms Signed-off-by: David S. Miller --- Documentation/networking/ipvs-sysctl.txt | 143 +++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 Documentation/networking/ipvs-sysctl.txt (limited to 'Documentation') diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.txt new file mode 100644 index 000000000000..4ccdbca03811 --- /dev/null +++ b/Documentation/networking/ipvs-sysctl.txt @@ -0,0 +1,143 @@ +/proc/sys/net/ipv4/vs/* Variables: + +am_droprate - INTEGER + default 10 + + It sets the always mode drop rate, which is used in the mode 3 + of the drop_rate defense. + +amemthresh - INTEGER + default 1024 + + It sets the available memory threshold (in pages), which is + used in the automatic modes of defense. When there is no + enough available memory, the respective strategy will be + enabled and the variable is automatically set to 2, otherwise + the strategy is disabled and the variable is set to 1. + +cache_bypass - BOOLEAN + 0 - disabled (default) + not 0 - enabled + + If it is enabled, forward packets to the original destination + directly when no cache server is available and destination + address is not local (iph->daddr is RTN_UNICAST). It is mostly + used in transparent web cache cluster. + +debug_level - INTEGER + 0 - transmission error messages (default) + 1 - non-fatal error messages + 2 - configuration + 3 - destination trash + 4 - drop entry + 5 - service lookup + 6 - scheduling + 7 - connection new/expire, lookup and synchronization + 8 - state transition + 9 - binding destination, template checks and applications + 10 - IPVS packet transmission + 11 - IPVS packet handling (ip_vs_in/ip_vs_out) + 12 or more - packet traversal + + Only available when IPVS is compiled with the CONFIG_IPVS_DEBUG + + Higher debugging levels include the messages for lower debugging + levels, so setting debug level 2, includes level 0, 1 and 2 + messages. Thus, logging becomes more and more verbose the higher + the level. + +drop_entry - INTEGER + 0 - disabled (default) + + The drop_entry defense is to randomly drop entries in the + connection hash table, just in order to collect back some + memory for new connections. In the current code, the + drop_entry procedure can be activated every second, then it + randomly scans 1/32 of the whole and drops entries that are in + the SYN-RECV/SYNACK state, which should be effective against + syn-flooding attack. + + The valid values of drop_entry are from 0 to 3, where 0 means + that this strategy is always disabled, 1 and 2 mean automatic + modes (when there is no enough available memory, the strategy + is enabled and the variable is automatically set to 2, + otherwise the strategy is disabled and the variable is set to + 1), and 3 means that that the strategy is always enabled. + +drop_packet - INTEGER + 0 - disabled (default) + + The drop_packet defense is designed to drop 1/rate packets + before forwarding them to real servers. If the rate is 1, then + drop all the incoming packets. + + The value definition is the same as that of the drop_entry. In + the automatic mode, the rate is determined by the follow + formula: rate = amemthresh / (amemthresh - available_memory) + when available memory is less than the available memory + threshold. When the mode 3 is set, the always mode drop rate + is controlled by the /proc/sys/net/ipv4/vs/am_droprate. + +expire_nodest_conn - BOOLEAN + 0 - disabled (default) + not 0 - enabled + + The default value is 0, the load balancer will silently drop + packets when its destination server is not available. It may + be useful, when user-space monitoring program deletes the + destination server (because of server overload or wrong + detection) and add back the server later, and the connections + to the server can continue. + + If this feature is enabled, the load balancer will expire the + connection immediately when a packet arrives and its + destination server is not available, then the client program + will be notified that the connection is closed. This is + equivalent to the feature some people requires to flush + connections when its destination is not available. + +expire_quiescent_template - BOOLEAN + 0 - disabled (default) + not 0 - enabled + + When set to a non-zero value, the load balancer will expire + persistent templates when the destination server is quiescent. + This may be useful, when a user makes a destination server + quiescent by setting its weight to 0 and it is desired that + subsequent otherwise persistent connections are sent to a + different destination server. By default new persistent + connections are allowed to quiescent destination servers. + + If this feature is enabled, the load balancer will expire the + persistence template if it is to be used to schedule a new + connection and the destination server is quiescent. + +nat_icmp_send - BOOLEAN + 0 - disabled (default) + not 0 - enabled + + It controls sending icmp error messages (ICMP_DEST_UNREACH) + for VS/NAT when the load balancer receives packets from real + servers but the connection entries don't exist. + +secure_tcp - INTEGER + 0 - disabled (default) + + The secure_tcp defense is to use a more complicated state + transition table and some possible short timeouts of each + state. In the VS/NAT, it delays the entering the ESTABLISHED + until the real server starts to send data and ACK packet + (after 3-way handshake). + + The value definition is the same as that of drop_entry or + drop_packet. + +sync_threshold - INTEGER + default 3 + + It sets synchronization threshold, which is the minimum number + of incoming packets that a connection needs to receive before + the connection will be synchronized. A connection will be + synchronized, every time the number of its incoming packets + modulus 50 equals the threshold. The range of the threshold is + from 0 to 49. -- cgit v1.2.3