From 68e2fc78e5055740126df8eab0d31005495756c9 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Mon, 14 Jul 2008 16:51:57 +0800
Subject: Blackfin arch: Fix bug - Kernel does not boot if re-program clocks

Don't write conflicting data to EBIU_SDBCTL after the SDRAM is
configured. This can cause data corruption, since we might change SDRAM
row and column addressing modes.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
---
 include/asm-blackfin/mach-bf527/mem_init.h | 27 ---------------------------
 include/asm-blackfin/mach-bf533/mem_init.h | 27 ---------------------------
 include/asm-blackfin/mach-bf537/mem_init.h | 27 ---------------------------
 include/asm-blackfin/mach-bf561/mem_init.h | 27 ---------------------------
 4 files changed, 108 deletions(-)

(limited to 'include')

diff --git a/include/asm-blackfin/mach-bf527/mem_init.h b/include/asm-blackfin/mach-bf527/mem_init.h
index 008ca66719e2..cbe03f4a5698 100644
--- a/include/asm-blackfin/mach-bf527/mem_init.h
+++ b/include/asm-blackfin/mach-bf527/mem_init.h
@@ -146,33 +146,6 @@
 #define SDRAM_CL    CL_3
 #endif
 
-#if (CONFIG_MEM_SIZE == 128)
-#define SDRAM_SIZE      EBSZ_128
-#endif
-#if (CONFIG_MEM_SIZE == 64)
-#define SDRAM_SIZE      EBSZ_64
-#endif
-#if (CONFIG_MEM_SIZE == 32)
-#define SDRAM_SIZE      EBSZ_32
-#endif
-#if (CONFIG_MEM_SIZE == 16)
-#define SDRAM_SIZE      EBSZ_16
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 11)
-#define SDRAM_WIDTH     EBCAW_11
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 10)
-#define SDRAM_WIDTH     EBCAW_10
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 9)
-#define SDRAM_WIDTH     EBCAW_9
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 8)
-#define SDRAM_WIDTH     EBCAW_8
-#endif
-
-#define mem_SDBCTL      (SDRAM_WIDTH | SDRAM_SIZE | EBE)
-
 /* Equation from section 17 (p17-46) of BF533 HRM */
 #define mem_SDRRC       (((CONFIG_SCLK_HZ / 1000) * SDRAM_Tref) / SDRAM_NRA) - (SDRAM_tRAS_num + SDRAM_tRP_num)
 
diff --git a/include/asm-blackfin/mach-bf533/mem_init.h b/include/asm-blackfin/mach-bf533/mem_init.h
index f8f31901fca9..995c06b2b1ef 100644
--- a/include/asm-blackfin/mach-bf533/mem_init.h
+++ b/include/asm-blackfin/mach-bf533/mem_init.h
@@ -133,33 +133,6 @@
 #define SDRAM_CL    CL_3
 #endif
 
-#if (CONFIG_MEM_SIZE == 128)
-#define SDRAM_SIZE      EBSZ_128
-#endif
-#if (CONFIG_MEM_SIZE == 64)
-#define SDRAM_SIZE      EBSZ_64
-#endif
-#if (CONFIG_MEM_SIZE == 32)
-#define SDRAM_SIZE      EBSZ_32
-#endif
-#if (CONFIG_MEM_SIZE == 16)
-#define SDRAM_SIZE      EBSZ_16
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 11)
-#define SDRAM_WIDTH     EBCAW_11
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 10)
-#define SDRAM_WIDTH     EBCAW_10
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 9)
-#define SDRAM_WIDTH     EBCAW_9
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 8)
-#define SDRAM_WIDTH     EBCAW_8
-#endif
-
-#define mem_SDBCTL      (SDRAM_WIDTH | SDRAM_SIZE | EBE)
-
 /* Equation from section 17 (p17-46) of BF533 HRM */
 #define mem_SDRRC       (((CONFIG_SCLK_HZ / 1000) * SDRAM_Tref)  / SDRAM_NRA) - (SDRAM_tRAS_num + SDRAM_tRP_num)
 
diff --git a/include/asm-blackfin/mach-bf537/mem_init.h b/include/asm-blackfin/mach-bf537/mem_init.h
index 9ad979d416c6..f67698f670ca 100644
--- a/include/asm-blackfin/mach-bf537/mem_init.h
+++ b/include/asm-blackfin/mach-bf537/mem_init.h
@@ -139,33 +139,6 @@
 #define SDRAM_CL    CL_3
 #endif
 
-#if (CONFIG_MEM_SIZE == 128)
-#define SDRAM_SIZE      EBSZ_128
-#endif
-#if (CONFIG_MEM_SIZE == 64)
-#define SDRAM_SIZE      EBSZ_64
-#endif
-#if (CONFIG_MEM_SIZE == 32)
-#define SDRAM_SIZE      EBSZ_32
-#endif
-#if (CONFIG_MEM_SIZE == 16)
-#define SDRAM_SIZE      EBSZ_16
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 11)
-#define SDRAM_WIDTH     EBCAW_11
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 10)
-#define SDRAM_WIDTH     EBCAW_10
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 9)
-#define SDRAM_WIDTH     EBCAW_9
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 8)
-#define SDRAM_WIDTH     EBCAW_8
-#endif
-
-#define mem_SDBCTL      (SDRAM_WIDTH | SDRAM_SIZE | EBE)
-
 /* Equation from section 17 (p17-46) of BF533 HRM */
 #define mem_SDRRC       (((CONFIG_SCLK_HZ / 1000) * SDRAM_Tref) / SDRAM_NRA) - (SDRAM_tRAS_num + SDRAM_tRP_num)
 
diff --git a/include/asm-blackfin/mach-bf561/mem_init.h b/include/asm-blackfin/mach-bf561/mem_init.h
index 439a5895b346..e163260bca18 100644
--- a/include/asm-blackfin/mach-bf561/mem_init.h
+++ b/include/asm-blackfin/mach-bf561/mem_init.h
@@ -131,33 +131,6 @@
 #define SDRAM_CL    CL_3
 #endif
 
-#if (CONFIG_MEM_SIZE == 128)
-#define SDRAM_SIZE      EB0_SZ_128
-#endif
-#if (CONFIG_MEM_SIZE == 64)
-#define SDRAM_SIZE      EB0_SZ_64
-#endif
-#if ( CONFIG_MEM_SIZE == 32)
-#define SDRAM_SIZE      EB0_SZ_32
-#endif
-#if (CONFIG_MEM_SIZE == 16)
-#define SDRAM_SIZE      EB0_SZ_16
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 11)
-#define SDRAM_WIDTH     EB0_CAW_11
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 10)
-#define SDRAM_WIDTH     EB0_CAW_10
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 9)
-#define SDRAM_WIDTH     EB0_CAW_9
-#endif
-#if (CONFIG_MEM_ADD_WIDTH == 8)
-#define SDRAM_WIDTH     EB0_CAW_8
-#endif
-
-#define mem_SDBCTL      (SDRAM_WIDTH | SDRAM_SIZE | EB0_E)
-
 /* Equation from section 17 (p17-46) of BF533 HRM */
 #define mem_SDRRC       (((CONFIG_SCLK_HZ / 1000) * SDRAM_Tref) / SDRAM_NRA) - (SDRAM_tRAS_num + SDRAM_tRP_num)
 
-- 
cgit v1.2.3


From 1c0d20cd29aec11a3580cedf0bccec25052e8d4c Mon Sep 17 00:00:00 2001
From: Bryan Wu <cooloney@kernel.org>
Date: Tue, 15 Jul 2008 12:08:50 +0800
Subject: Blackfin arch: add TXDWA definition to enable new feature

Signed-off-by: Bryan Wu <cooloney@kernel.org>
---
 include/asm-blackfin/mach-bf527/anomaly.h  | 2 ++
 include/asm-blackfin/mach-bf527/defBF527.h | 1 +
 include/asm-blackfin/mach-bf537/defBF537.h | 1 +
 3 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/asm-blackfin/mach-bf527/anomaly.h b/include/asm-blackfin/mach-bf527/anomaly.h
index 4725268a5ada..b7b166f4f064 100644
--- a/include/asm-blackfin/mach-bf527/anomaly.h
+++ b/include/asm-blackfin/mach-bf527/anomaly.h
@@ -23,6 +23,8 @@
 #define ANOMALY_05000245 (1)
 /* Sensitivity To Noise with Slow Input Edge Rates on External SPORT TX and RX Clocks */
 #define ANOMALY_05000265 (1)
+/* New Feature: EMAC TX DMA Word Alignment */
+#define ANOMALY_05000285 (1)
 /* Errors when SSYNC, CSYNC, or Loads to LT, LB and LC Registers Are Interrupted */
 #define ANOMALY_05000312 (1)
 /* Incorrect Access of OTP_STATUS During otp_write() Function */
diff --git a/include/asm-blackfin/mach-bf527/defBF527.h b/include/asm-blackfin/mach-bf527/defBF527.h
index 82134f578f32..f1a70db70cb8 100644
--- a/include/asm-blackfin/mach-bf527/defBF527.h
+++ b/include/asm-blackfin/mach-bf527/defBF527.h
@@ -302,6 +302,7 @@
 #define	PHYIE             0x00000001    /* PHY_INT Interrupt Enable                               */
 #define	RXDWA             0x00000002    /* Receive Frame DMA Word Alignment (Odd/Even*)           */
 #define	RXCKS             0x00000004    /* Enable RX Frame TCP/UDP Checksum Computation           */
+#define	TXDWA             0x00000010    /* Transmit Frame DMA Word Alignment (Odd/Even*)          */
 #define	MDCDIV            0x00003F00    /* SCLK:MDC Clock Divisor [MDC=SCLK/(2*(N+1))]            */
 
 #define	SET_MDCDIV(x) (((x)&0x3F)<< 8)   /* Set MDC Clock Divisor                                 */
diff --git a/include/asm-blackfin/mach-bf537/defBF537.h b/include/asm-blackfin/mach-bf537/defBF537.h
index 3f455909c418..abde24c6d3b1 100644
--- a/include/asm-blackfin/mach-bf537/defBF537.h
+++ b/include/asm-blackfin/mach-bf537/defBF537.h
@@ -290,6 +290,7 @@
 #define	PHYIE		0x00000001	/* PHY_INT Interrupt Enable                                                     */
 #define	RXDWA		0x00000002	/* Receive Frame DMA Word Alignment (Odd/Even*)         */
 #define	RXCKS		0x00000004	/* Enable RX Frame TCP/UDP Checksum Computation         */
+#define	TXDWA		0x00000010	/* Transmit Frame DMA Word Alignment (Odd/Even*)        */
 #define	MDCDIV		0x00003F00	/* SCLK:MDC Clock Divisor [MDC=SCLK/(2*(N+1))]          */
 
 #define	SET_MDCDIV(x)	(((x)&0x3F)<< 8)	/* Set MDC Clock Divisor                                */
-- 
cgit v1.2.3


From 1efc80b53eb54770139219f99657abd92595fc86 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Sat, 19 Jul 2008 16:57:32 +0800
Subject: Blackfin arch: Functional power management support

Enable: PM_SUSPEND_MEM -> Blackfin Hibernate to SDRAM
This feature requires a special bootloader (u-boot)
supporting return from hibernate.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
---
 arch/blackfin/Kconfig                  |  66 +++-
 arch/blackfin/kernel/bfin_dma_5xx.c    |  26 ++
 arch/blackfin/kernel/bfin_gpio.c       | 118 ++++++-
 arch/blackfin/mach-common/dpmc_modes.S | 600 +++++++++++++++++++++++++++++++--
 arch/blackfin/mach-common/pm.c         | 225 ++++++++++++-
 include/asm-blackfin/dma.h             |   8 +
 include/asm-blackfin/dpmc.h            |  82 ++---
 include/asm-blackfin/gpio.h            |  19 +-
 include/asm-blackfin/mach-bf548/gpio.h |   8 +
 9 files changed, 1033 insertions(+), 119 deletions(-)

(limited to 'include')

diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index c602727d1a9a..8f21b78b2e6f 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -880,7 +880,7 @@ config ARCH_SUSPEND_POSSIBLE
 	depends on !SMP
 
 choice
-	prompt "Default Power Saving Mode"
+	prompt "Standby Power Saving Mode"
 	depends on PM
 	default PM_BFIN_SLEEP_DEEPER
 config  PM_BFIN_SLEEP_DEEPER
@@ -899,6 +899,8 @@ config  PM_BFIN_SLEEP_DEEPER
 	  normal during Sleep Deeper, due to the reduced SCLK frequency.
 	  When in the sleep mode, system DMA access to L1 memory is not supported.
 
+	  If unsure, select "Sleep Deeper".
+
 config  PM_BFIN_SLEEP
 	bool "Sleep"
 	help
@@ -906,15 +908,17 @@ config  PM_BFIN_SLEEP
 	  dissipation by disabling the clock to the processor core (CCLK).
 	  The PLL and system clock (SCLK), however, continue to operate in
 	  this mode. Typically an external event or RTC activity will wake
-	  up the processor. When in the sleep mode,
-	  system DMA access to L1 memory is not supported.
+	  up the processor. When in the sleep mode, system DMA access to L1
+	  memory is not supported.
+
+	  If unsure, select "Sleep Deeper".
 endchoice
 
 config PM_WAKEUP_BY_GPIO
-	bool "Cause Wakeup Event by GPIO"
+	bool "Allow Wakeup from Standby by GPIO"
 
 config PM_WAKEUP_GPIO_NUMBER
-	int "Wakeup GPIO number"
+	int "GPIO number"
 	range 0 47
 	depends on PM_WAKEUP_BY_GPIO
 	default 2 if BFIN537_STAMP
@@ -935,6 +939,58 @@ config  PM_WAKEUP_GPIO_POLAR_EDGE_B
 	bool "Both EDGE"
 endchoice
 
+comment "Possible Suspend Mem / Hibernate Wake-Up Sources"
+	depends on PM
+
+config PM_BFIN_WAKE_RTC
+	bool "Allow Wake-Up from RESET and on-chip RTC"
+	depends on PM
+	default n
+	help
+	  Enable RTC Wake-Up (Voltage Regulator Power-Up)
+
+config PM_BFIN_WAKE_PH6
+	bool "Allow Wake-Up from on-chip PHY or PH6 GP"
+	depends on PM && (BF52x || BF534 || BF536 || BF537)
+	default n
+	help
+	  Enable PHY and PH6 GP Wake-Up (Voltage Regulator Power-Up)
+
+config PM_BFIN_WAKE_CAN
+	bool "Allow Wake-Up from on-chip CAN0/1"
+	depends on PM && (BF54x || BF534 || BF536 || BF537)
+	default n
+	help
+	  Enable CAN0/1 Wake-Up (Voltage Regulator Power-Up)
+
+config PM_BFIN_WAKE_GP
+	bool "Allow Wake-Up from GPIOs"
+	depends on PM && BF54x
+	default n
+	help
+	  Enable General-Purpose Wake-Up (Voltage Regulator Power-Up)
+
+config PM_BFIN_WAKE_USB
+	bool "Allow Wake-Up from on-chip USB"
+	depends on PM && (BF54x || BF52x)
+	default n
+	help
+	  Enable USB Wake-Up (Voltage Regulator Power-Up)
+
+config PM_BFIN_WAKE_KEYPAD
+	bool "Allow Wake-Up from on-chip Keypad"
+	depends on PM && BF54x
+	default n
+	help
+	  Enable Keypad Wake-Up (Voltage Regulator Power-Up)
+
+config PM_BFIN_WAKE_ROTARY
+	bool "Allow Wake-Up from on-chip Rotary"
+	depends on PM && BF54x
+	default n
+	help
+	  Enable Rotary Wake-Up (Voltage Regulator Power-Up)
+
 endmenu
 
 menu "CPU Frequency scaling"
diff --git a/arch/blackfin/kernel/bfin_dma_5xx.c b/arch/blackfin/kernel/bfin_dma_5xx.c
index d54f19085f37..ad0e75845ac2 100644
--- a/arch/blackfin/kernel/bfin_dma_5xx.c
+++ b/arch/blackfin/kernel/bfin_dma_5xx.c
@@ -472,6 +472,32 @@ unsigned long get_dma_curr_addr(unsigned int channel)
 }
 EXPORT_SYMBOL(get_dma_curr_addr);
 
+#ifdef CONFIG_PM
+int blackfin_dma_suspend(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_BLACKFIN_DMA_CHANNEL; i++) {
+		if (dma_ch[i].chan_status == DMA_CHANNEL_ENABLED) {
+			printk(KERN_ERR "DMA Channel %d failed to suspend\n", i);
+			return -EBUSY;
+		}
+
+		dma_ch[i].saved_peripheral_map = dma_ch[i].regs->peripheral_map;
+	}
+
+	return 0;
+}
+
+void blackfin_dma_resume(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_BLACKFIN_DMA_CHANNEL; i++)
+		dma_ch[i].regs->peripheral_map = dma_ch[i].saved_peripheral_map;
+}
+#endif
+
 static void *__dma_memcpy(void *dest, const void *src, size_t size)
 {
 	int direction;	/* 1 - address decrease, 0 - address increase */
diff --git a/arch/blackfin/kernel/bfin_gpio.c b/arch/blackfin/kernel/bfin_gpio.c
index b6d89d1644cc..ecbd141e0ef2 100644
--- a/arch/blackfin/kernel/bfin_gpio.c
+++ b/arch/blackfin/kernel/bfin_gpio.c
@@ -186,7 +186,10 @@ static struct str_ident {
 	char name[RESOURCE_LABEL_SIZE];
 } str_ident[MAX_RESOURCES];
 
-#if defined(CONFIG_PM) && !defined(CONFIG_BF54x)
+#if defined(CONFIG_PM)
+#if defined(CONFIG_BF54x)
+static struct gpio_port_s gpio_bank_saved[gpio_bank(MAX_BLACKFIN_GPIOS)];
+#else
 static unsigned short wakeup_map[gpio_bank(MAX_BLACKFIN_GPIOS)];
 static unsigned char wakeup_flags_map[MAX_BLACKFIN_GPIOS];
 static struct gpio_port_s gpio_bank_saved[gpio_bank(MAX_BLACKFIN_GPIOS)];
@@ -206,7 +209,7 @@ static unsigned int sic_iwr_irqs[gpio_bank(MAX_BLACKFIN_GPIOS)] = {IRQ_PORTF_INT
 #ifdef BF561_FAMILY
 static unsigned int sic_iwr_irqs[gpio_bank(MAX_BLACKFIN_GPIOS)] = {IRQ_PROG0_INTB, IRQ_PROG1_INTB, IRQ_PROG2_INTB};
 #endif
-
+#endif
 #endif /* CONFIG_PM */
 
 #if defined(BF548_FAMILY)
@@ -667,7 +670,7 @@ static int bfin_gpio_wakeup_type(unsigned gpio, unsigned char type)
 	return 0;
 }
 
-u32 bfin_pm_setup(void)
+u32 bfin_pm_standby_setup(void)
 {
 	u16 bank, mask, i, gpio;
 
@@ -679,7 +682,7 @@ u32 bfin_pm_setup(void)
 		gpio_bankb[bank]->maskb = 0;
 
 		if (mask) {
-#ifdef BF537_FAMILY
+#if defined(BF527_FAMILY) || defined(BF537_FAMILY)
 			gpio_bank_saved[bank].fer   = *port_fer[bank];
 #endif
 			gpio_bank_saved[bank].inen  = gpio_bankb[bank]->inen;
@@ -715,7 +718,7 @@ u32 bfin_pm_setup(void)
 	return 0;
 }
 
-void bfin_pm_restore(void)
+void bfin_pm_standby_restore(void)
 {
 	u16 bank, mask, i;
 
@@ -724,7 +727,7 @@ void bfin_pm_restore(void)
 		bank = gpio_bank(i);
 
 		if (mask) {
-#ifdef BF537_FAMILY
+#if defined(BF527_FAMILY) || defined(BF537_FAMILY)
 			*port_fer[bank]   	= gpio_bank_saved[bank].fer;
 #endif
 			gpio_bankb[bank]->inen  = gpio_bank_saved[bank].inen;
@@ -743,8 +746,111 @@ void bfin_pm_restore(void)
 	AWA_DUMMY_READ(maskb);
 }
 
+void bfin_gpio_pm_hibernate_suspend(void)
+{
+	int i, bank;
+
+	for (i = 0; i < MAX_BLACKFIN_GPIOS; i += GPIO_BANKSIZE) {
+		bank = gpio_bank(i);
+
+#if defined(BF527_FAMILY) || defined(BF537_FAMILY)
+			gpio_bank_saved[bank].fer   = *port_fer[bank];
+#ifdef BF527_FAMILY
+			gpio_bank_saved[bank].mux   = *port_mux[bank];
+#else
+			if (bank == 0)
+				gpio_bank_saved[bank].mux   = bfin_read_PORT_MUX();
+#endif
+#endif
+			gpio_bank_saved[bank].data  = gpio_bankb[bank]->data;
+			gpio_bank_saved[bank].inen  = gpio_bankb[bank]->inen;
+			gpio_bank_saved[bank].polar = gpio_bankb[bank]->polar;
+			gpio_bank_saved[bank].dir   = gpio_bankb[bank]->dir;
+			gpio_bank_saved[bank].edge  = gpio_bankb[bank]->edge;
+			gpio_bank_saved[bank].both  = gpio_bankb[bank]->both;
+			gpio_bank_saved[bank].maska  = gpio_bankb[bank]->maska;
+	}
+
+	AWA_DUMMY_READ(maska);
+}
+
+void bfin_gpio_pm_hibernate_restore(void)
+{
+	int i, bank;
+
+	for (i = 0; i < MAX_BLACKFIN_GPIOS; i += GPIO_BANKSIZE) {
+			bank = gpio_bank(i);
+
+#if defined(BF527_FAMILY) || defined(BF537_FAMILY)
+#ifdef BF527_FAMILY
+			*port_mux[bank] = gpio_bank_saved[bank].mux;
+#else
+			if (bank == 0)
+				bfin_write_PORT_MUX(gpio_bank_saved[bank].mux);
+#endif
+			*port_fer[bank]   	= gpio_bank_saved[bank].fer;
+#endif
+			gpio_bankb[bank]->inen  = gpio_bank_saved[bank].inen;
+			gpio_bankb[bank]->dir   = gpio_bank_saved[bank].dir;
+			gpio_bankb[bank]->polar = gpio_bank_saved[bank].polar;
+			gpio_bankb[bank]->edge  = gpio_bank_saved[bank].edge;
+			gpio_bankb[bank]->both  = gpio_bank_saved[bank].both;
+
+			gpio_bankb[bank]->data_set = gpio_bank_saved[bank].data
+							| gpio_bank_saved[bank].dir;
+
+			gpio_bankb[bank]->maska = gpio_bank_saved[bank].maska;
+	}
+	AWA_DUMMY_READ(maska);
+}
+
+
 #endif
 #else /* BF548_FAMILY */
+#ifdef CONFIG_PM
+
+u32 bfin_pm_standby_setup(void)
+{
+	return 0;
+}
+
+void bfin_pm_standby_restore(void)
+{
+
+}
+
+void bfin_gpio_pm_hibernate_suspend(void)
+{
+	int i, bank;
+
+	for (i = 0; i < MAX_BLACKFIN_GPIOS; i += GPIO_BANKSIZE) {
+		bank = gpio_bank(i);
+
+			gpio_bank_saved[bank].fer  = gpio_array[bank]->port_fer;
+			gpio_bank_saved[bank].mux  = gpio_array[bank]->port_mux;
+			gpio_bank_saved[bank].data  = gpio_array[bank]->port_data;
+			gpio_bank_saved[bank].data  = gpio_array[bank]->port_data;
+			gpio_bank_saved[bank].inen  = gpio_array[bank]->port_inen;
+			gpio_bank_saved[bank].dir   = gpio_array[bank]->port_dir_set;
+	}
+}
+
+void bfin_gpio_pm_hibernate_restore(void)
+{
+	int i, bank;
+
+	for (i = 0; i < MAX_BLACKFIN_GPIOS; i += GPIO_BANKSIZE) {
+			bank = gpio_bank(i);
+
+			gpio_array[bank]->port_mux  = gpio_bank_saved[bank].mux;
+			gpio_array[bank]->port_fer  = gpio_bank_saved[bank].fer;
+			gpio_array[bank]->port_inen  = gpio_bank_saved[bank].inen;
+			gpio_array[bank]->port_dir_set   = gpio_bank_saved[bank].dir;
+			gpio_array[bank]->port_set = gpio_bank_saved[bank].data
+							| gpio_bank_saved[bank].dir;
+	}
+}
+#endif
 
 unsigned short get_gpio_dir(unsigned gpio)
 {
diff --git a/arch/blackfin/mach-common/dpmc_modes.S b/arch/blackfin/mach-common/dpmc_modes.S
index b7981d31c392..46ee77afed20 100644
--- a/arch/blackfin/mach-common/dpmc_modes.S
+++ b/arch/blackfin/mach-common/dpmc_modes.S
@@ -7,7 +7,7 @@
 #include <linux/linkage.h>
 #include <asm/blackfin.h>
 #include <asm/mach/irq.h>
-
+#include <asm/dpmc.h>
 
 .section .l1.text
 
@@ -56,26 +56,25 @@ ENTRY(_hibernate_mode)
 	[--SP] = ( R7:0, P5:0 );
 	[--SP] =  RETS;
 
+	R3 = R0;
+	R0 = IWR_DISABLE_ALL;
+	R1 = IWR_DISABLE_ALL;
+	R2 = IWR_DISABLE_ALL;
 	call _set_sic_iwr;
+	call _set_dram_srfs;
+	SSYNC;
 
 	R0 = 0xFFFF (Z);
 	call _set_rtc_istat;
 
 	P0.H = hi(VR_CTL);
 	P0.L = lo(VR_CTL);
-	R1 = W[P0](z);
-	BITSET (R1, 8);
-	BITCLR (R1, 0);
-	BITCLR (R1, 1);
-	W[P0] = R1.L;
-	SSYNC;
 
+	W[P0] = R3.L;
 	CLI R2;
 	IDLE;
-
-	/* Actually, adding anything may not be necessary...SDRAM contents
-	 * are lost
-	 */
+.Lforever:
+	jump .Lforever;
 
 ENTRY(_deep_sleep)
 	[--SP] = ( R7:0, P5:0 );
@@ -233,51 +232,70 @@ ENTRY(_sleep_deeper)
 	( R7:0, P5:0 ) = [SP++];
 	RTS;
 
+
 ENTRY(_set_dram_srfs)
 	/*  set the dram to self refresh mode */
-#if defined(CONFIG_BF54x)
+	SSYNC;
+#if defined(EBIU_RSTCTL)	/* DDR */
 	P0.H = hi(EBIU_RSTCTL);
 	P0.L = lo(EBIU_RSTCTL);
 	R2 = [P0];
-	R3.H = hi(SRREQ);
-	R3.L = lo(SRREQ);
-#else
-	P0.H = hi(EBIU_SDGCTL);
+	BITSET(R2, 3); /* SRREQ enter self-refresh mode */
+	[P0] = R2;
+	SSYNC;
+1:
+	R2 = [P0];
+	CC = BITTST(R2, 4);
+	if !CC JUMP 1b;
+#else 				/* SDRAM */
 	P0.L = lo(EBIU_SDGCTL);
+	P0.H = hi(EBIU_SDGCTL);
 	R2 = [P0];
-	R3.H = hi(SRFS);
-	R3.L = lo(SRFS);
-#endif
-	R2 = R2|R3;
+	BITSET(R2, 24); /* SRFS enter self-refresh mode */
 	[P0] = R2;
-	ssync;
-#if defined(CONFIG_BF54x)
-.LSRR_MODE:
+	SSYNC;
+
+	P0.L = lo(EBIU_SDSTAT);
+	P0.H = hi(EBIU_SDSTAT);
+1:
+	R2 = w[P0];
+	SSYNC;
+	cc = BITTST(R2, 1); /* SDSRA poll self-refresh status */
+	if !cc jump 1b;
+
+	P0.L = lo(EBIU_SDGCTL);
+	P0.H = hi(EBIU_SDGCTL);
 	R2 = [P0];
-	CC = BITTST(R2, 4);
-	if !CC JUMP .LSRR_MODE;
+	BITCLR(R2, 0); /* SCTLE disable CLKOUT */
+	[P0] = R2;
 #endif
 	RTS;
 
+
 ENTRY(_unset_dram_srfs)
 	/*  set the dram out of self refresh mode */
-#if defined(CONFIG_BF54x)
+#if defined(EBIU_RSTCTL)	/* DDR */
 	P0.H = hi(EBIU_RSTCTL);
 	P0.L = lo(EBIU_RSTCTL);
 	R2 = [P0];
-	R3.H = hi(SRREQ);
-	R3.L = lo(SRREQ);
-#else
+	BITCLR(R2, 3); /* clear SRREQ bit */
+	[P0] = R2;
+#elif defined(EBIU_SDGCTL)	/* SDRAM */
+
+	P0.L = lo(EBIU_SDGCTL); /* release CLKOUT from self-refresh */
 	P0.H = hi(EBIU_SDGCTL);
-	P0.L = lo(EBIU_SDGCTL);
 	R2 = [P0];
-	R3.H = hi(SRFS);
-	R3.L = lo(SRFS);
+	BITSET(R2, 0); /* SCTLE enable CLKOUT */
+	[P0] = R2
+	SSYNC;
+
+	P0.L = lo(EBIU_SDGCTL); /* release SDRAM from self-refresh */
+	P0.H = hi(EBIU_SDGCTL);
+	R2 = [P0];
+	BITCLR(R2, 24); /* clear SRFS bit */
+	[P0] = R2
 #endif
-	R3 = ~R3;
-	R2 = R2&R3;
-	[P0] = R2;
-	ssync;
+	SSYNC;
 	RTS;
 
 ENTRY(_set_sic_iwr)
@@ -307,6 +325,11 @@ ENTRY(_set_rtc_istat)
 	P0.L = lo(RTC_ISTAT);
 	w[P0] = R0.L;
 	SSYNC;
+#elif (ANOMALY_05000371)
+	nop;
+	nop;
+	nop;
+	nop;
 #endif
 	RTS;
 
@@ -318,3 +341,508 @@ ENTRY(_test_pll_locked)
 	CC = BITTST(R0,5);
 	IF !CC JUMP 1b;
 	RTS;
+
+.section .text
+
+
+ENTRY(_do_hibernate)
+	[--SP] = ( R7:0, P5:0 );
+	[--SP] =  RETS;
+	/* Save System MMRs */
+	R2 = R0;
+	P0.H = hi(PLL_CTL);
+	P0.L = lo(PLL_CTL);
+
+#ifdef SIC_IMASK0
+	PM_SYS_PUSH(SIC_IMASK0)
+#endif
+#ifdef SIC_IMASK1
+	PM_SYS_PUSH(SIC_IMASK1)
+#endif
+#ifdef SIC_IMASK2
+	PM_SYS_PUSH(SIC_IMASK2)
+#endif
+#ifdef SIC_IMASK
+	PM_SYS_PUSH(SIC_IMASK)
+#endif
+#ifdef SICA_IMASK0
+	PM_SYS_PUSH(SICA_IMASK0)
+#endif
+#ifdef SICA_IMASK1
+	PM_SYS_PUSH(SICA_IMASK1)
+#endif
+#ifdef SIC_IAR2
+	PM_SYS_PUSH(SIC_IAR0)
+	PM_SYS_PUSH(SIC_IAR1)
+	PM_SYS_PUSH(SIC_IAR2)
+#endif
+#ifdef SIC_IAR3
+	PM_SYS_PUSH(SIC_IAR3)
+#endif
+#ifdef SIC_IAR4
+	PM_SYS_PUSH(SIC_IAR4)
+	PM_SYS_PUSH(SIC_IAR5)
+	PM_SYS_PUSH(SIC_IAR6)
+#endif
+#ifdef SIC_IAR7
+	PM_SYS_PUSH(SIC_IAR7)
+#endif
+#ifdef SIC_IAR8
+	PM_SYS_PUSH(SIC_IAR8)
+	PM_SYS_PUSH(SIC_IAR9)
+	PM_SYS_PUSH(SIC_IAR10)
+	PM_SYS_PUSH(SIC_IAR11)
+#endif
+
+#ifdef SICA_IAR0
+	PM_SYS_PUSH(SICA_IAR0)
+	PM_SYS_PUSH(SICA_IAR1)
+	PM_SYS_PUSH(SICA_IAR2)
+	PM_SYS_PUSH(SICA_IAR3)
+	PM_SYS_PUSH(SICA_IAR4)
+	PM_SYS_PUSH(SICA_IAR5)
+	PM_SYS_PUSH(SICA_IAR6)
+	PM_SYS_PUSH(SICA_IAR7)
+#endif
+
+#ifdef SIC_IWR
+	PM_SYS_PUSH(SIC_IWR)
+#endif
+#ifdef SIC_IWR0
+	PM_SYS_PUSH(SIC_IWR0)
+#endif
+#ifdef SIC_IWR1
+	PM_SYS_PUSH(SIC_IWR1)
+#endif
+#ifdef SIC_IWR2
+	PM_SYS_PUSH(SIC_IWR2)
+#endif
+#ifdef SICA_IWR0
+	PM_SYS_PUSH(SICA_IWR0)
+#endif
+#ifdef SICA_IWR1
+	PM_SYS_PUSH(SICA_IWR1)
+#endif
+
+#ifdef PINT0_ASSIGN
+	PM_SYS_PUSH(PINT0_ASSIGN)
+	PM_SYS_PUSH(PINT1_ASSIGN)
+	PM_SYS_PUSH(PINT2_ASSIGN)
+	PM_SYS_PUSH(PINT3_ASSIGN)
+#endif
+
+	PM_SYS_PUSH(EBIU_AMBCTL0)
+	PM_SYS_PUSH(EBIU_AMBCTL1)
+	PM_SYS_PUSH16(EBIU_AMGCTL)
+
+#ifdef EBIU_FCTL
+	PM_SYS_PUSH(EBIU_MBSCTL)
+	PM_SYS_PUSH(EBIU_MODE)
+	PM_SYS_PUSH(EBIU_FCTL)
+#endif
+
+	PM_SYS_PUSH16(SYSCR)
+
+	/* Save Core MMRs */
+	P0.H = hi(SRAM_BASE_ADDRESS);
+	P0.L = lo(SRAM_BASE_ADDRESS);
+
+	PM_PUSH(DMEM_CONTROL)
+	PM_PUSH(DCPLB_ADDR0)
+	PM_PUSH(DCPLB_ADDR1)
+	PM_PUSH(DCPLB_ADDR2)
+	PM_PUSH(DCPLB_ADDR3)
+	PM_PUSH(DCPLB_ADDR4)
+	PM_PUSH(DCPLB_ADDR5)
+	PM_PUSH(DCPLB_ADDR6)
+	PM_PUSH(DCPLB_ADDR7)
+	PM_PUSH(DCPLB_ADDR8)
+	PM_PUSH(DCPLB_ADDR9)
+	PM_PUSH(DCPLB_ADDR10)
+	PM_PUSH(DCPLB_ADDR11)
+	PM_PUSH(DCPLB_ADDR12)
+	PM_PUSH(DCPLB_ADDR13)
+	PM_PUSH(DCPLB_ADDR14)
+	PM_PUSH(DCPLB_ADDR15)
+	PM_PUSH(DCPLB_DATA0)
+	PM_PUSH(DCPLB_DATA1)
+	PM_PUSH(DCPLB_DATA2)
+	PM_PUSH(DCPLB_DATA3)
+	PM_PUSH(DCPLB_DATA4)
+	PM_PUSH(DCPLB_DATA5)
+	PM_PUSH(DCPLB_DATA6)
+	PM_PUSH(DCPLB_DATA7)
+	PM_PUSH(DCPLB_DATA8)
+	PM_PUSH(DCPLB_DATA9)
+	PM_PUSH(DCPLB_DATA10)
+	PM_PUSH(DCPLB_DATA11)
+	PM_PUSH(DCPLB_DATA12)
+	PM_PUSH(DCPLB_DATA13)
+	PM_PUSH(DCPLB_DATA14)
+	PM_PUSH(DCPLB_DATA15)
+	PM_PUSH(IMEM_CONTROL)
+	PM_PUSH(ICPLB_ADDR0)
+	PM_PUSH(ICPLB_ADDR1)
+	PM_PUSH(ICPLB_ADDR2)
+	PM_PUSH(ICPLB_ADDR3)
+	PM_PUSH(ICPLB_ADDR4)
+	PM_PUSH(ICPLB_ADDR5)
+	PM_PUSH(ICPLB_ADDR6)
+	PM_PUSH(ICPLB_ADDR7)
+	PM_PUSH(ICPLB_ADDR8)
+	PM_PUSH(ICPLB_ADDR9)
+	PM_PUSH(ICPLB_ADDR10)
+	PM_PUSH(ICPLB_ADDR11)
+	PM_PUSH(ICPLB_ADDR12)
+	PM_PUSH(ICPLB_ADDR13)
+	PM_PUSH(ICPLB_ADDR14)
+	PM_PUSH(ICPLB_ADDR15)
+	PM_PUSH(ICPLB_DATA0)
+	PM_PUSH(ICPLB_DATA1)
+	PM_PUSH(ICPLB_DATA2)
+	PM_PUSH(ICPLB_DATA3)
+	PM_PUSH(ICPLB_DATA4)
+	PM_PUSH(ICPLB_DATA5)
+	PM_PUSH(ICPLB_DATA6)
+	PM_PUSH(ICPLB_DATA7)
+	PM_PUSH(ICPLB_DATA8)
+	PM_PUSH(ICPLB_DATA9)
+	PM_PUSH(ICPLB_DATA10)
+	PM_PUSH(ICPLB_DATA11)
+	PM_PUSH(ICPLB_DATA12)
+	PM_PUSH(ICPLB_DATA13)
+	PM_PUSH(ICPLB_DATA14)
+	PM_PUSH(ICPLB_DATA15)
+	PM_PUSH(EVT0)
+	PM_PUSH(EVT1)
+	PM_PUSH(EVT2)
+	PM_PUSH(EVT3)
+	PM_PUSH(EVT4)
+	PM_PUSH(EVT5)
+	PM_PUSH(EVT6)
+	PM_PUSH(EVT7)
+	PM_PUSH(EVT8)
+	PM_PUSH(EVT9)
+	PM_PUSH(EVT10)
+	PM_PUSH(EVT11)
+	PM_PUSH(EVT12)
+	PM_PUSH(EVT13)
+	PM_PUSH(EVT14)
+	PM_PUSH(EVT15)
+	PM_PUSH(IMASK)
+	PM_PUSH(ILAT)
+	PM_PUSH(IPRIO)
+	PM_PUSH(TCNTL)
+	PM_PUSH(TPERIOD)
+	PM_PUSH(TSCALE)
+	PM_PUSH(TCOUNT)
+	PM_PUSH(TBUFCTL)
+
+	/* Save Core Registers */
+	[--sp] = SYSCFG;
+	[--sp] = ( R7:0, P5:0 );
+	[--sp] = fp;
+	[--sp] = usp;
+
+	[--sp] = i0;
+	[--sp] = i1;
+	[--sp] = i2;
+	[--sp] = i3;
+
+	[--sp] = m0;
+	[--sp] = m1;
+	[--sp] = m2;
+	[--sp] = m3;
+
+	[--sp] = l0;
+	[--sp] = l1;
+	[--sp] = l2;
+	[--sp] = l3;
+
+	[--sp] = b0;
+	[--sp] = b1;
+	[--sp] = b2;
+	[--sp] = b3;
+	[--sp] = a0.x;
+	[--sp] = a0.w;
+	[--sp] = a1.x;
+	[--sp] = a1.w;
+
+	[--sp] = LC0;
+	[--sp] = LC1;
+	[--sp] = LT0;
+	[--sp] = LT1;
+	[--sp] = LB0;
+	[--sp] = LB1;
+
+	[--sp] = ASTAT;
+	[--sp] = CYCLES;
+	[--sp] = CYCLES2;
+
+	[--sp] = RETS;
+	r0 = RETI;
+	[--sp] = r0;
+	[--sp] = RETX;
+	[--sp] = RETN;
+	[--sp] = RETE;
+	[--sp] = SEQSTAT;
+
+	/* Save Magic, return address and Stack Pointer */
+	P0.H = 0;
+	P0.L = 0;
+	R0.H = 0xDEAD;	/* Hibernate Magic */
+	R0.L = 0xBEEF;
+	[P0++] = R0;	/* Store Hibernate Magic */
+	R0.H = pm_resume_here;
+	R0.L = pm_resume_here;
+	[P0++] = R0;	/* Save Return Address */
+	[P0++] = SP;	/* Save Stack Pointer */
+	P0.H = _hibernate_mode;
+	P0.L = _hibernate_mode;
+	R0 = R2;
+	call (P0); /* Goodbye */
+
+pm_resume_here:
+
+	/* Restore Core Registers */
+	SEQSTAT = [sp++];
+	RETE = [sp++];
+	RETN = [sp++];
+	RETX = [sp++];
+	r0 = [sp++];
+	RETI = r0;
+	RETS = [sp++];
+
+	CYCLES2 = [sp++];
+	CYCLES = [sp++];
+	ASTAT = [sp++];
+
+	LB1 = [sp++];
+	LB0 = [sp++];
+	LT1 = [sp++];
+	LT0 = [sp++];
+	LC1 = [sp++];
+	LC0 = [sp++];
+
+	a1.w = [sp++];
+	a1.x = [sp++];
+	a0.w = [sp++];
+	a0.x = [sp++];
+	b3 = [sp++];
+	b2 = [sp++];
+	b1 = [sp++];
+	b0 = [sp++];
+
+	l3 = [sp++];
+	l2 = [sp++];
+	l1 = [sp++];
+	l0 = [sp++];
+
+	m3 = [sp++];
+	m2 = [sp++];
+	m1 = [sp++];
+	m0 = [sp++];
+
+	i3 = [sp++];
+	i2 = [sp++];
+	i1 = [sp++];
+	i0 = [sp++];
+
+	usp = [sp++];
+	fp = [sp++];
+
+	( R7 : 0, P5 : 0) = [ SP ++ ];
+	SYSCFG = [sp++];
+
+	/* Restore Core MMRs */
+
+	PM_POP(TBUFCTL)
+	PM_POP(TCOUNT)
+	PM_POP(TSCALE)
+	PM_POP(TPERIOD)
+	PM_POP(TCNTL)
+	PM_POP(IPRIO)
+	PM_POP(ILAT)
+	PM_POP(IMASK)
+	PM_POP(EVT15)
+	PM_POP(EVT14)
+	PM_POP(EVT13)
+	PM_POP(EVT12)
+	PM_POP(EVT11)
+	PM_POP(EVT10)
+	PM_POP(EVT9)
+	PM_POP(EVT8)
+	PM_POP(EVT7)
+	PM_POP(EVT6)
+	PM_POP(EVT5)
+	PM_POP(EVT4)
+	PM_POP(EVT3)
+	PM_POP(EVT2)
+	PM_POP(EVT1)
+	PM_POP(EVT0)
+	PM_POP(ICPLB_DATA15)
+	PM_POP(ICPLB_DATA14)
+	PM_POP(ICPLB_DATA13)
+	PM_POP(ICPLB_DATA12)
+	PM_POP(ICPLB_DATA11)
+	PM_POP(ICPLB_DATA10)
+	PM_POP(ICPLB_DATA9)
+	PM_POP(ICPLB_DATA8)
+	PM_POP(ICPLB_DATA7)
+	PM_POP(ICPLB_DATA6)
+	PM_POP(ICPLB_DATA5)
+	PM_POP(ICPLB_DATA4)
+	PM_POP(ICPLB_DATA3)
+	PM_POP(ICPLB_DATA2)
+	PM_POP(ICPLB_DATA1)
+	PM_POP(ICPLB_DATA0)
+	PM_POP(ICPLB_ADDR15)
+	PM_POP(ICPLB_ADDR14)
+	PM_POP(ICPLB_ADDR13)
+	PM_POP(ICPLB_ADDR12)
+	PM_POP(ICPLB_ADDR11)
+	PM_POP(ICPLB_ADDR10)
+	PM_POP(ICPLB_ADDR9)
+	PM_POP(ICPLB_ADDR8)
+	PM_POP(ICPLB_ADDR7)
+	PM_POP(ICPLB_ADDR6)
+	PM_POP(ICPLB_ADDR5)
+	PM_POP(ICPLB_ADDR4)
+	PM_POP(ICPLB_ADDR3)
+	PM_POP(ICPLB_ADDR2)
+	PM_POP(ICPLB_ADDR1)
+	PM_POP(ICPLB_ADDR0)
+	PM_POP(IMEM_CONTROL)
+	PM_POP(DCPLB_DATA15)
+	PM_POP(DCPLB_DATA14)
+	PM_POP(DCPLB_DATA13)
+	PM_POP(DCPLB_DATA12)
+	PM_POP(DCPLB_DATA11)
+	PM_POP(DCPLB_DATA10)
+	PM_POP(DCPLB_DATA9)
+	PM_POP(DCPLB_DATA8)
+	PM_POP(DCPLB_DATA7)
+	PM_POP(DCPLB_DATA6)
+	PM_POP(DCPLB_DATA5)
+	PM_POP(DCPLB_DATA4)
+	PM_POP(DCPLB_DATA3)
+	PM_POP(DCPLB_DATA2)
+	PM_POP(DCPLB_DATA1)
+	PM_POP(DCPLB_DATA0)
+	PM_POP(DCPLB_ADDR15)
+	PM_POP(DCPLB_ADDR14)
+	PM_POP(DCPLB_ADDR13)
+	PM_POP(DCPLB_ADDR12)
+	PM_POP(DCPLB_ADDR11)
+	PM_POP(DCPLB_ADDR10)
+	PM_POP(DCPLB_ADDR9)
+	PM_POP(DCPLB_ADDR8)
+	PM_POP(DCPLB_ADDR7)
+	PM_POP(DCPLB_ADDR6)
+	PM_POP(DCPLB_ADDR5)
+	PM_POP(DCPLB_ADDR4)
+	PM_POP(DCPLB_ADDR3)
+	PM_POP(DCPLB_ADDR2)
+	PM_POP(DCPLB_ADDR1)
+	PM_POP(DCPLB_ADDR0)
+	PM_POP(DMEM_CONTROL)
+
+	/* Restore System MMRs */
+
+	P0.H = hi(PLL_CTL);
+	P0.L = lo(PLL_CTL);
+	PM_SYS_POP16(SYSCR)
+
+#ifdef EBIU_FCTL
+	PM_SYS_POP(EBIU_FCTL)
+	PM_SYS_POP(EBIU_MODE)
+	PM_SYS_POP(EBIU_MBSCTL)
+#endif
+	PM_SYS_POP16(EBIU_AMGCTL)
+	PM_SYS_POP(EBIU_AMBCTL1)
+	PM_SYS_POP(EBIU_AMBCTL0)
+
+#ifdef PINT0_ASSIGN
+	PM_SYS_POP(PINT3_ASSIGN)
+	PM_SYS_POP(PINT2_ASSIGN)
+	PM_SYS_POP(PINT1_ASSIGN)
+	PM_SYS_POP(PINT0_ASSIGN)
+#endif
+
+#ifdef SICA_IWR1
+	PM_SYS_POP(SICA_IWR1)
+#endif
+#ifdef SICA_IWR0
+	PM_SYS_POP(SICA_IWR0)
+#endif
+#ifdef SIC_IWR2
+	PM_SYS_POP(SIC_IWR2)
+#endif
+#ifdef SIC_IWR1
+	PM_SYS_POP(SIC_IWR1)
+#endif
+#ifdef SIC_IWR0
+	PM_SYS_POP(SIC_IWR0)
+#endif
+#ifdef SIC_IWR
+	PM_SYS_POP(SIC_IWR)
+#endif
+
+#ifdef SICA_IAR0
+	PM_SYS_POP(SICA_IAR7)
+	PM_SYS_POP(SICA_IAR6)
+	PM_SYS_POP(SICA_IAR5)
+	PM_SYS_POP(SICA_IAR4)
+	PM_SYS_POP(SICA_IAR3)
+	PM_SYS_POP(SICA_IAR2)
+	PM_SYS_POP(SICA_IAR1)
+	PM_SYS_POP(SICA_IAR0)
+#endif
+
+#ifdef SIC_IAR8
+	PM_SYS_POP(SIC_IAR11)
+	PM_SYS_POP(SIC_IAR10)
+	PM_SYS_POP(SIC_IAR9)
+	PM_SYS_POP(SIC_IAR8)
+#endif
+#ifdef SIC_IAR7
+	PM_SYS_POP(SIC_IAR7)
+#endif
+#ifdef SIC_IAR6
+	PM_SYS_POP(SIC_IAR6)
+	PM_SYS_POP(SIC_IAR5)
+	PM_SYS_POP(SIC_IAR4)
+#endif
+#ifdef SIC_IAR3
+	PM_SYS_POP(SIC_IAR3)
+#endif
+#ifdef SIC_IAR2
+	PM_SYS_POP(SIC_IAR2)
+	PM_SYS_POP(SIC_IAR1)
+	PM_SYS_POP(SIC_IAR0)
+#endif
+#ifdef SICA_IMASK1
+	PM_SYS_POP(SICA_IMASK1)
+#endif
+#ifdef SICA_IMASK0
+	PM_SYS_POP(SICA_IMASK0)
+#endif
+#ifdef SIC_IMASK
+	PM_SYS_POP(SIC_IMASK)
+#endif
+#ifdef SIC_IMASK2
+	PM_SYS_POP(SIC_IMASK2)
+#endif
+#ifdef SIC_IMASK1
+	PM_SYS_POP(SIC_IMASK1)
+#endif
+#ifdef SIC_IMASK0
+	PM_SYS_POP(SIC_IMASK0)
+#endif
+
+	[--sp] = RETI;	/* Clear Global Interrupt Disable */
+	SP += 4;
+
+	RETS = [SP++];
+	( R7:0, P5:0 ) = [SP++];
+	RTS;
diff --git a/arch/blackfin/mach-common/pm.c b/arch/blackfin/mach-common/pm.c
index 0be805ca423f..4fe6a2366b13 100644
--- a/arch/blackfin/mach-common/pm.c
+++ b/arch/blackfin/mach-common/pm.c
@@ -38,8 +38,9 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 
-#include <asm/dpmc.h>
 #include <asm/gpio.h>
+#include <asm/dma.h>
+#include <asm/dpmc.h>
 
 #ifdef CONFIG_PM_WAKEUP_GPIO_POLAR_H
 #define WAKEUP_TYPE	PM_WAKE_HIGH
@@ -61,16 +62,17 @@
 #define WAKEUP_TYPE	PM_WAKE_BOTH_EDGES
 #endif
 
+
 void bfin_pm_suspend_standby_enter(void)
 {
+	unsigned long flags;
+
 #ifdef CONFIG_PM_WAKEUP_BY_GPIO
 	gpio_pm_wakeup_request(CONFIG_PM_WAKEUP_GPIO_NUMBER, WAKEUP_TYPE);
 #endif
 
-	u32 flags;
-
 	local_irq_save(flags);
-	bfin_pm_setup();
+	bfin_pm_standby_setup();
 
 #ifdef CONFIG_PM_BFIN_SLEEP_DEEPER
 	sleep_deeper(bfin_sic_iwr[0], bfin_sic_iwr[1], bfin_sic_iwr[2]);
@@ -78,7 +80,7 @@ void bfin_pm_suspend_standby_enter(void)
 	sleep_mode(bfin_sic_iwr[0], bfin_sic_iwr[1], bfin_sic_iwr[2]);
 #endif
 
-	bfin_pm_restore();
+	bfin_pm_standby_restore();
 
 #if defined(CONFIG_BF54x) || defined(CONFIG_BF52x)  || defined(CONFIG_BF561)
 	bfin_write_SIC_IWR0(IWR_ENABLE_ALL);
@@ -93,6 +95,195 @@ void bfin_pm_suspend_standby_enter(void)
 	local_irq_restore(flags);
 }
 
+int bf53x_suspend_l1_mem(unsigned char *memptr)
+{
+	dma_memcpy(memptr, (const void *) L1_CODE_START, L1_CODE_LENGTH);
+	dma_memcpy(memptr + L1_CODE_LENGTH, (const void *) L1_DATA_A_START,
+			L1_DATA_A_LENGTH);
+	dma_memcpy(memptr + L1_CODE_LENGTH + L1_DATA_A_LENGTH,
+			(const void *) L1_DATA_B_START, L1_DATA_B_LENGTH);
+	memcpy(memptr + L1_CODE_LENGTH + L1_DATA_A_LENGTH +
+			L1_DATA_B_LENGTH, (const void *) L1_SCRATCH_START,
+			L1_SCRATCH_LENGTH);
+
+	return 0;
+}
+
+int bf53x_resume_l1_mem(unsigned char *memptr)
+{
+	dma_memcpy((void *) L1_CODE_START, memptr, L1_CODE_LENGTH);
+	dma_memcpy((void *) L1_DATA_A_START, memptr + L1_CODE_LENGTH,
+			L1_DATA_A_LENGTH);
+	dma_memcpy((void *) L1_DATA_B_START, memptr + L1_CODE_LENGTH +
+			L1_DATA_A_LENGTH, L1_DATA_B_LENGTH);
+	memcpy((void *) L1_SCRATCH_START, memptr + L1_CODE_LENGTH +
+			L1_DATA_A_LENGTH + L1_DATA_B_LENGTH, L1_SCRATCH_LENGTH);
+
+	return 0;
+}
+
+#ifdef CONFIG_BFIN_WB
+static void flushinv_all_dcache(void)
+{
+	u32 way, bank, subbank, set;
+	u32 status, addr;
+	u32 dmem_ctl = bfin_read_DMEM_CONTROL();
+
+	for (bank = 0; bank < 2; ++bank) {
+		if (!(dmem_ctl & (1 << (DMC1_P - bank))))
+			continue;
+
+		for (way = 0; way < 2; ++way)
+			for (subbank = 0; subbank < 4; ++subbank)
+				for (set = 0; set < 64; ++set) {
+
+					bfin_write_DTEST_COMMAND(
+						way << 26 |
+						bank << 23 |
+						subbank << 16 |
+						set << 5
+					);
+					CSYNC();
+					status = bfin_read_DTEST_DATA0();
+
+					/* only worry about valid/dirty entries */
+					if ((status & 0x3) != 0x3)
+						continue;
+
+					/* construct the address using the tag */
+					addr = (status & 0xFFFFC800) | (subbank << 12) | (set << 5);
+
+					/* flush it */
+					__asm__ __volatile__("FLUSHINV[%0];" : : "a"(addr));
+				}
+	}
+}
+#endif
+
+static inline void dcache_disable(void)
+{
+#ifdef CONFIG_BFIN_DCACHE
+	unsigned long ctrl;
+
+#ifdef CONFIG_BFIN_WB
+	flushinv_all_dcache();
+#endif
+	SSYNC();
+	ctrl = bfin_read_DMEM_CONTROL();
+	ctrl &= ~ENDCPLB;
+	bfin_write_DMEM_CONTROL(ctrl);
+	SSYNC();
+#endif
+}
+
+static inline void dcache_enable(void)
+{
+#ifdef CONFIG_BFIN_DCACHE
+	unsigned long ctrl;
+	SSYNC();
+	ctrl = bfin_read_DMEM_CONTROL();
+	ctrl |= ENDCPLB;
+	bfin_write_DMEM_CONTROL(ctrl);
+	SSYNC();
+#endif
+}
+
+static inline void icache_disable(void)
+{
+#ifdef CONFIG_BFIN_ICACHE
+	unsigned long ctrl;
+	SSYNC();
+	ctrl = bfin_read_IMEM_CONTROL();
+	ctrl &= ~ENICPLB;
+	bfin_write_IMEM_CONTROL(ctrl);
+	SSYNC();
+#endif
+}
+
+static inline void icache_enable(void)
+{
+#ifdef CONFIG_BFIN_ICACHE
+	unsigned long ctrl;
+	SSYNC();
+	ctrl = bfin_read_IMEM_CONTROL();
+	ctrl |= ENICPLB;
+	bfin_write_IMEM_CONTROL(ctrl);
+	SSYNC();
+#endif
+}
+
+int bfin_pm_suspend_mem_enter(void)
+{
+	unsigned long flags;
+	int wakeup, ret;
+
+	unsigned char *memptr = kmalloc(L1_CODE_LENGTH + L1_DATA_A_LENGTH
+					 + L1_DATA_B_LENGTH + L1_SCRATCH_LENGTH,
+					  GFP_KERNEL);
+
+	if (memptr == NULL) {
+		panic("bf53x_suspend_l1_mem malloc failed");
+		return -ENOMEM;
+	}
+
+	wakeup = bfin_read_VR_CTL() & ~FREQ;
+	wakeup |= SCKELOW;
+
+	/* FIXME: merge this somehow with set_irq_wake */
+#ifdef CONFIG_PM_BFIN_WAKE_RTC
+	wakeup |= WAKE;
+#endif
+#ifdef CONFIG_PM_BFIN_WAKE_PH6
+	wakeup |= PHYWE;
+#endif
+#ifdef CONFIG_PM_BFIN_WAKE_CAN
+	wakeup |= CANWE;
+#endif
+#ifdef CONFIG_PM_BFIN_WAKE_GP
+	wakeup |= GPWE;
+#endif
+#ifdef CONFIG_PM_BFIN_WAKE_USB
+	wakeup |= USBWE;
+#endif
+#ifdef CONFIG_PM_BFIN_WAKE_KEYPAD
+	wakeup |= KPADWE;
+#endif
+#ifdef CONFIG_PM_BFIN_WAKE_ROTARY
+	wakeup |= ROTWE;
+#endif
+
+	local_irq_save(flags);
+
+	ret = blackfin_dma_suspend();
+
+	if (ret) {
+		local_irq_restore(flags);
+		kfree(memptr);
+		return ret;
+	}
+
+	bfin_gpio_pm_hibernate_suspend();
+
+	dcache_disable();
+	icache_disable();
+	bf53x_suspend_l1_mem(memptr);
+
+	do_hibernate(wakeup);	/* Goodbye */
+
+	bf53x_resume_l1_mem(memptr);
+
+	icache_enable();
+	dcache_enable();
+
+	bfin_gpio_pm_hibernate_restore();
+	blackfin_dma_resume();
+
+	local_irq_restore(flags);
+	kfree(memptr);
+
+	return 0;
+}
+
 /*
  *	bfin_pm_valid - Tell the PM core that we only support the standby sleep
  *			state
@@ -101,7 +292,24 @@ void bfin_pm_suspend_standby_enter(void)
  */
 static int bfin_pm_valid(suspend_state_t state)
 {
-	return (state == PM_SUSPEND_STANDBY);
+	return (state == PM_SUSPEND_STANDBY
+#ifndef BF533_FAMILY
+	/*
+	 * On BF533/2/1:
+	 * If we enter Hibernate the SCKE Pin is driven Low,
+	 * so that the SDRAM enters Self Refresh Mode.
+	 * However when the reset sequence that follows hibernate
+	 * state is executed, SCKE is driven High, taking the
+	 * SDRAM out of Self Refresh.
+	 *
+	 * If you reconfigure and access the SDRAM "very quickly",
+	 * you are likely to avoid errors, otherwise the SDRAM
+	 * start losing its contents.
+	 * An external HW workaround is possible using logic gates.
+	 */
+	|| state == PM_SUSPEND_MEM
+#endif
+	);
 }
 
 /*
@@ -115,10 +323,9 @@ static int bfin_pm_enter(suspend_state_t state)
 	case PM_SUSPEND_STANDBY:
 		bfin_pm_suspend_standby_enter();
 		break;
-
 	case PM_SUSPEND_MEM:
-		return -ENOTSUPP;
-
+		bfin_pm_suspend_mem_enter();
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/include/asm-blackfin/dma.h b/include/asm-blackfin/dma.h
index c0d5259e315b..3cd4b522aa3f 100644
--- a/include/asm-blackfin/dma.h
+++ b/include/asm-blackfin/dma.h
@@ -144,8 +144,16 @@ struct dma_channel {
 	void *data;
 	unsigned int dma_enable_flag;
 	unsigned int loopback_flag;
+#ifdef CONFIG_PM
+	unsigned short saved_peripheral_map;
+#endif
 };
 
+#ifdef CONFIG_PM
+int blackfin_dma_suspend(void);
+void blackfin_dma_resume(void);
+#endif
+
 /*******************************************************************************
 *	DMA API's
 *******************************************************************************/
diff --git a/include/asm-blackfin/dpmc.h b/include/asm-blackfin/dpmc.h
index 7f34cd384f12..de28e6e018b3 100644
--- a/include/asm-blackfin/dpmc.h
+++ b/include/asm-blackfin/dpmc.h
@@ -7,63 +7,18 @@
 #ifndef _BLACKFIN_DPMC_H_
 #define _BLACKFIN_DPMC_H_
 
-#define SLEEP_MODE		1
-#define DEEP_SLEEP_MODE		2
-#define ACTIVE_PLL_DISABLED	3
-#define FULLON_MODE		4
-#define ACTIVE_PLL_ENABLED	5
-#define HIBERNATE_MODE		6
-
-#define IOCTL_FULL_ON_MODE	_IO('s', 0xA0)
-#define IOCTL_ACTIVE_MODE	_IO('s', 0xA1)
-#define IOCTL_SLEEP_MODE	_IO('s', 0xA2)
-#define IOCTL_DEEP_SLEEP_MODE	_IO('s', 0xA3)
-#define IOCTL_HIBERNATE_MODE	_IO('s', 0xA4)
-#define IOCTL_CHANGE_FREQUENCY	_IOW('s', 0xA5, unsigned long)
-#define IOCTL_CHANGE_VOLTAGE	_IOW('s', 0xA6, unsigned long)
-#define IOCTL_SET_CCLK		_IOW('s', 0xA7, unsigned long)
-#define IOCTL_SET_SCLK		_IOW('s', 0xA8, unsigned long)
-#define IOCTL_GET_PLLSTATUS	_IOW('s', 0xA9, unsigned long)
-#define IOCTL_GET_CORECLOCK	_IOW('s', 0xAA, unsigned long)
-#define IOCTL_GET_SYSTEMCLOCK	_IOW('s', 0xAB, unsigned long)
-#define IOCTL_GET_VCO		_IOW('s', 0xAC, unsigned long)
-#define IOCTL_DISABLE_WDOG_TIMER _IO('s', 0xAD)
-#define IOCTL_UNMASK_WDOG_WAKEUP_EVENT _IO('s',0xAE)
-#define IOCTL_PROGRAM_WDOG_TIMER _IOW('s',0xAF,unsigned long)
-#define IOCTL_CLEAR_WDOG_WAKEUP_EVENT _IO('s',0xB0)
-#define IOCTL_SLEEP_DEEPER_MODE _IO('s',0xB1)
-
-#define DPMC_MINOR		254
-
-#define ON	0
-#define OFF	1
-
 #ifdef __KERNEL__
+#ifndef __ASSEMBLY__
 
-unsigned long calc_volt(void);
-int calc_vlev(int vlt);
-unsigned long change_voltage(unsigned long volt);
-int calc_msel(int vco_hz);
-unsigned long change_frequency(unsigned long vco_mhz);
-int set_pll_div(unsigned short sel, unsigned char flag);
-int get_vco(void);
-unsigned long change_system_clock(unsigned long clock);
-unsigned long change_core_clock(unsigned long clock);
-unsigned long get_pll_status(void);
-void change_baud(int baud);
-void fullon_mode(void);
-void active_mode(void);
 void sleep_mode(u32 sic_iwr0, u32 sic_iwr1, u32 sic_iwr2);
 void deep_sleep(u32 sic_iwr0, u32 sic_iwr1, u32 sic_iwr2);
 void hibernate_mode(u32 sic_iwr0, u32 sic_iwr1, u32 sic_iwr2);
 void sleep_deeper(u32 sic_iwr0, u32 sic_iwr1, u32 sic_iwr2);
-void program_wdog_timer(unsigned long);
-void unmask_wdog_wakeup_evt(void);
-void clear_wdog_wakeup_evt(void);
-void disable_wdog_timer(void);
+void do_hibernate(int wakeup);
+void set_dram_srfs(void);
+void unset_dram_srfs(void);
 
-extern unsigned long get_cclk(void);
-extern unsigned long get_sclk(void);
+#define VRPAIR(vlev, freq) (((vlev) << 16) | ((freq) >> 16))
 
 struct bfin_dpmc_platform_data {
 	const unsigned int *tuple_tab;
@@ -71,8 +26,33 @@ struct bfin_dpmc_platform_data {
 	unsigned short vr_settling_time; /* in us */
 };
 
-#define VRPAIR(vlev, freq) (((vlev) << 16) | ((freq) >> 16))
+#else
+
+#define PM_PUSH(x) \
+	R0 = [P0 + (x - SRAM_BASE_ADDRESS)];\
+	[--SP] =  R0;\
+
+#define PM_POP(x) \
+	R0 = [SP++];\
+	[P0 + (x - SRAM_BASE_ADDRESS)] = R0;\
+
+#define PM_SYS_PUSH(x) \
+	R0 = [P0 + (x - PLL_CTL)];\
+	[--SP] =  R0;\
+
+#define PM_SYS_POP(x) \
+	R0 = [SP++];\
+	[P0 + (x - PLL_CTL)] = R0;\
+
+#define PM_SYS_PUSH16(x) \
+	R0 = w[P0 + (x - PLL_CTL)];\
+	[--SP] =  R0;\
+
+#define PM_SYS_POP16(x) \
+	R0 = [SP++];\
+	w[P0 + (x - PLL_CTL)] = R0;\
 
+#endif
 #endif	/* __KERNEL__ */
 
 #endif	/*_BLACKFIN_DPMC_H_*/
diff --git a/include/asm-blackfin/gpio.h b/include/asm-blackfin/gpio.h
index ff95e9d88342..168f1251eb4d 100644
--- a/include/asm-blackfin/gpio.h
+++ b/include/asm-blackfin/gpio.h
@@ -376,8 +376,12 @@ struct gpio_port_t {
 #endif
 
 #ifdef CONFIG_PM
-unsigned int bfin_pm_setup(void);
-void bfin_pm_restore(void);
+
+unsigned int bfin_pm_standby_setup(void);
+void bfin_pm_standby_restore(void);
+
+void bfin_gpio_pm_hibernate_restore(void);
+void bfin_gpio_pm_hibernate_suspend(void);
 
 #ifndef CONFIG_BF54x
 #define PM_WAKE_RISING	0x1
@@ -392,17 +396,8 @@ void gpio_pm_wakeup_free(unsigned gpio);
 
 struct gpio_port_s {
 	unsigned short data;
-	unsigned short data_clear;
-	unsigned short data_set;
-	unsigned short toggle;
 	unsigned short maska;
-	unsigned short maska_clear;
-	unsigned short maska_set;
-	unsigned short maska_toggle;
 	unsigned short maskb;
-	unsigned short maskb_clear;
-	unsigned short maskb_set;
-	unsigned short maskb_toggle;
 	unsigned short dir;
 	unsigned short polar;
 	unsigned short edge;
@@ -411,10 +406,10 @@ struct gpio_port_s {
 
 	unsigned short fer;
 	unsigned short reserved;
+	unsigned short mux;
 };
 #endif /*CONFIG_BF54x*/
 #endif /*CONFIG_PM*/
-
 /***********************************************************
 *
 * FUNCTIONS: Blackfin GPIO Driver
diff --git a/include/asm-blackfin/mach-bf548/gpio.h b/include/asm-blackfin/mach-bf548/gpio.h
index cb8b0f15c9a6..bba82dc75f16 100644
--- a/include/asm-blackfin/mach-bf548/gpio.h
+++ b/include/asm-blackfin/mach-bf548/gpio.h
@@ -209,3 +209,11 @@ struct gpio_port_t {
 	unsigned short dummy7;
 	unsigned int port_mux;
 };
+
+struct gpio_port_s {
+	unsigned short fer;
+	unsigned short data;
+	unsigned short dir;
+	unsigned short inen;
+	unsigned int mux;
+};
-- 
cgit v1.2.3


From 262c3825a9f3eb0f4f30ebb4b1ee57397bcb3ffc Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Sat, 19 Jul 2008 15:42:41 +0800
Subject: Blackfin arch:  Extend sram malloc to handle L2 SRAM.

Extend system call to alloc L2 SRAM in application.
Automatically move following sections to L2 SRAM:
1. kernel built-in l2 attribute section
2. kernel module l2 attribute section
3. elf-fdpic application l2 attribute section

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
---
 arch/blackfin/kernel/module.c      |  74 ++++++++++++----
 arch/blackfin/kernel/setup.c       |  10 +++
 arch/blackfin/kernel/vmlinux.lds.S |  40 +++++++--
 arch/blackfin/mm/blackfin_sram.c   | 170 ++++++++++++++++++++++++++++++-------
 include/asm-blackfin/bfin-global.h |   8 +-
 include/asm-blackfin/elf.h         |   2 +
 include/asm-blackfin/module.h      |   5 +-
 7 files changed, 253 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c
index 14a42848f37f..e1bebc80a5bf 100644
--- a/arch/blackfin/kernel/module.c
+++ b/arch/blackfin/kernel/module.c
@@ -173,7 +173,7 @@ module_frob_arch_sections(Elf_Ehdr * hdr, Elf_Shdr * sechdrs,
 	for (s = sechdrs; s < sechdrs_end; ++s) {
 		if ((strcmp(".l1.text", secstrings + s->sh_name) == 0) ||
 		    ((strcmp(".text", secstrings + s->sh_name) == 0) &&
-		     (hdr->e_flags & FLG_CODE_IN_L1) && (s->sh_size > 0))) {
+		     (hdr->e_flags & EF_BFIN_CODE_IN_L1) && (s->sh_size > 0))) {
 			dest = l1_inst_sram_alloc(s->sh_size);
 			mod->arch.text_l1 = dest;
 			if (dest == NULL) {
@@ -188,7 +188,7 @@ module_frob_arch_sections(Elf_Ehdr * hdr, Elf_Shdr * sechdrs,
 		}
 		if ((strcmp(".l1.data", secstrings + s->sh_name) == 0) ||
 		    ((strcmp(".data", secstrings + s->sh_name) == 0) &&
-		     (hdr->e_flags & FLG_DATA_IN_L1) && (s->sh_size > 0))) {
+		     (hdr->e_flags & EF_BFIN_DATA_IN_L1) && (s->sh_size > 0))) {
 			dest = l1_data_sram_alloc(s->sh_size);
 			mod->arch.data_a_l1 = dest;
 			if (dest == NULL) {
@@ -203,7 +203,7 @@ module_frob_arch_sections(Elf_Ehdr * hdr, Elf_Shdr * sechdrs,
 		}
 		if (strcmp(".l1.bss", secstrings + s->sh_name) == 0 ||
 		    ((strcmp(".bss", secstrings + s->sh_name) == 0) &&
-		     (hdr->e_flags & FLG_DATA_IN_L1) && (s->sh_size > 0))) {
+		     (hdr->e_flags & EF_BFIN_DATA_IN_L1) && (s->sh_size > 0))) {
 			dest = l1_data_sram_alloc(s->sh_size);
 			mod->arch.bss_a_l1 = dest;
 			if (dest == NULL) {
@@ -242,6 +242,51 @@ module_frob_arch_sections(Elf_Ehdr * hdr, Elf_Shdr * sechdrs,
 			s->sh_flags &= ~SHF_ALLOC;
 			s->sh_addr = (unsigned long)dest;
 		}
+		if ((strcmp(".l2.text", secstrings + s->sh_name) == 0) ||
+		    ((strcmp(".text", secstrings + s->sh_name) == 0) &&
+		     (hdr->e_flags & EF_BFIN_CODE_IN_L2) && (s->sh_size > 0))) {
+			dest = l2_sram_alloc(s->sh_size);
+			mod->arch.text_l2 = dest;
+			if (dest == NULL) {
+				printk(KERN_ERR
+				       "module %s: L2 SRAM allocation failed\n",
+				       mod->name);
+				return -1;
+			}
+			memcpy(dest, (void *)s->sh_addr, s->sh_size);
+			s->sh_flags &= ~SHF_ALLOC;
+			s->sh_addr = (unsigned long)dest;
+		}
+		if ((strcmp(".l2.data", secstrings + s->sh_name) == 0) ||
+		    ((strcmp(".data", secstrings + s->sh_name) == 0) &&
+		     (hdr->e_flags & EF_BFIN_DATA_IN_L2) && (s->sh_size > 0))) {
+			dest = l2_sram_alloc(s->sh_size);
+			mod->arch.data_l2 = dest;
+			if (dest == NULL) {
+				printk(KERN_ERR
+					"module %s: L2 SRAM allocation failed\n",
+					mod->name);
+				return -1;
+			}
+			memcpy(dest, (void *)s->sh_addr, s->sh_size);
+			s->sh_flags &= ~SHF_ALLOC;
+			s->sh_addr = (unsigned long)dest;
+		}
+		if (strcmp(".l2.bss", secstrings + s->sh_name) == 0 ||
+		    ((strcmp(".bss", secstrings + s->sh_name) == 0) &&
+		     (hdr->e_flags & EF_BFIN_DATA_IN_L2) && (s->sh_size > 0))) {
+			dest = l2_sram_alloc(s->sh_size);
+			mod->arch.bss_l2 = dest;
+			if (dest == NULL) {
+				printk(KERN_ERR
+					"module %s: L2 SRAM allocation failed\n",
+					mod->name);
+				return -1;
+			}
+			memset(dest, 0, s->sh_size);
+			s->sh_flags &= ~SHF_ALLOC;
+			s->sh_addr = (unsigned long)dest;
+		}
 	}
 	return 0;
 }
@@ -411,9 +456,10 @@ module_finalize(const Elf_Ehdr * hdr,
 			continue;
 
 		if ((sechdrs[i].sh_type == SHT_RELA) &&
-		    ((strcmp(".rela.l1.text", secstrings + sechdrs[i].sh_name) == 0) ||
+		    ((strcmp(".rela.l2.text", secstrings + sechdrs[i].sh_name) == 0) ||
+		    (strcmp(".rela.l1.text", secstrings + sechdrs[i].sh_name) == 0) ||
 		    ((strcmp(".rela.text", secstrings + sechdrs[i].sh_name) == 0) &&
-			 (hdr->e_flags & FLG_CODE_IN_L1)))) {
+			(hdr->e_flags & (EF_BFIN_CODE_IN_L1|EF_BFIN_CODE_IN_L2))))) {
 			apply_relocate_add((Elf_Shdr *) sechdrs, strtab,
 					   symindex, i, mod);
 		}
@@ -423,14 +469,12 @@ module_finalize(const Elf_Ehdr * hdr,
 
 void module_arch_cleanup(struct module *mod)
 {
-	if (mod->arch.text_l1)
-		l1_inst_sram_free((void *)mod->arch.text_l1);
-	if (mod->arch.data_a_l1)
-		l1_data_sram_free((void *)mod->arch.data_a_l1);
-	if (mod->arch.bss_a_l1)
-		l1_data_sram_free((void *)mod->arch.bss_a_l1);
-	if (mod->arch.data_b_l1)
-		l1_data_B_sram_free((void *)mod->arch.data_b_l1);
-	if (mod->arch.bss_b_l1)
-		l1_data_B_sram_free((void *)mod->arch.bss_b_l1);
+	l1_inst_sram_free(mod->arch.text_l1);
+	l1_data_A_sram_free(mod->arch.data_a_l1);
+	l1_data_A_sram_free(mod->arch.bss_a_l1);
+	l1_data_B_sram_free(mod->arch.data_b_l1);
+	l1_data_B_sram_free(mod->arch.bss_b_l1);
+	l2_sram_free(mod->arch.text_l2);
+	l2_sram_free(mod->arch.data_l2);
+	l2_sram_free(mod->arch.bss_l2);
 }
diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c
index 861a1db74df8..8671d1db1f99 100644
--- a/arch/blackfin/kernel/setup.c
+++ b/arch/blackfin/kernel/setup.c
@@ -104,6 +104,7 @@ void __init bf53x_relocate_l1_mem(void)
 	unsigned long l1_code_length;
 	unsigned long l1_data_a_length;
 	unsigned long l1_data_b_length;
+	unsigned long l2_length;
 
 	l1_code_length = _etext_l1 - _stext_l1;
 	if (l1_code_length > L1_CODE_LENGTH)
@@ -129,6 +130,15 @@ void __init bf53x_relocate_l1_mem(void)
 	/* Copy _sdata_b_l1 to _ebss_b_l1 to L1 data bank B SRAM */
 	dma_memcpy(_sdata_b_l1, _l1_lma_start + l1_code_length +
 			l1_data_a_length, l1_data_b_length);
+
+#ifdef L2_LENGTH
+	l2_length = _ebss_l2 - _stext_l2;
+	if (l2_length > L2_LENGTH)
+		panic("L2 SRAM Overflow\n");
+
+	/* Copy _stext_l2 to _edata_l2 to L2 SRAM */
+	dma_memcpy(_stext_l2, _l2_lma_start, l2_length);
+#endif
 }
 
 /* add_memory_region to memmap */
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index 3ecc64cab3be..0896e38d6108 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -101,6 +101,11 @@ SECTIONS
 #if !L1_DATA_B_LENGTH
 		*(.l1.data.B)
 #endif
+#ifndef L2_LENGTH
+		. = ALIGN(32);
+		*(.data_l2.cacheline_aligned)
+		*(.l2.data)
+#endif
 
 		DATA_DATA
 		*(.data.*)
@@ -182,13 +187,12 @@ SECTIONS
 		*(.l1.data)
 		__edata_l1 = .;
 
-		. = ALIGN(4);
-		__sbss_l1 = .;
-		*(.l1.bss)
-
 		. = ALIGN(32);
 		*(.data_l1.cacheline_aligned)
 
+		. = ALIGN(4);
+		__sbss_l1 = .;
+		*(.l1.bss)
 		. = ALIGN(4);
 		__ebss_l1 = .;
 	}
@@ -203,11 +207,37 @@ SECTIONS
 		. = ALIGN(4);
 		__sbss_b_l1 = .;
 		*(.l1.bss.B)
-
 		. = ALIGN(4);
 		__ebss_b_l1 = .;
 	}
 
+#ifdef L2_LENGTH
+	__l2_lma_start = .;
+
+	.text_data_l2 L2_START : AT(LOADADDR(.data_b_l1) + SIZEOF(.data_b_l1))
+	{
+		. = ALIGN(4);
+		__stext_l2 = .;
+		*(.l1.text)
+		. = ALIGN(4);
+		__etext_l2 = .;
+
+		. = ALIGN(4);
+		__sdata_l2 = .;
+		*(.l1.data)
+		__edata_l2 = .;
+
+		. = ALIGN(32);
+		*(.data_l2.cacheline_aligned)
+
+		. = ALIGN(4);
+		__sbss_l2 = .;
+		*(.l1.bss)
+		. = ALIGN(4);
+		__ebss_l2 = .;
+	}
+#endif
+
 	/* Force trailing alignment of our init section so that when we
 	 * free our init memory, we don't leave behind a partial page.
 	 */
diff --git a/arch/blackfin/mm/blackfin_sram.c b/arch/blackfin/mm/blackfin_sram.c
index b58cf196d7cc..5af3c31c9365 100644
--- a/arch/blackfin/mm/blackfin_sram.c
+++ b/arch/blackfin/mm/blackfin_sram.c
@@ -42,6 +42,7 @@
 #include "blackfin_sram.h"
 
 static spinlock_t l1sram_lock, l1_data_sram_lock, l1_inst_sram_lock;
+static spinlock_t l2_sram_lock;
 
 /* the data structure for L1 scratchpad and DATA SRAM */
 struct sram_piece {
@@ -65,6 +66,10 @@ static struct sram_piece free_l1_data_B_sram_head, used_l1_data_B_sram_head;
 static struct sram_piece free_l1_inst_sram_head, used_l1_inst_sram_head;
 #endif
 
+#ifdef L2_LENGTH
+static struct sram_piece free_l2_sram_head, used_l2_sram_head;
+#endif
+
 static struct kmem_cache *sram_piece_cache;
 
 /* L1 Scratchpad SRAM initialization function */
@@ -97,7 +102,7 @@ static void __init l1_data_sram_init(void)
 	free_l1_data_A_sram_head.next =
 		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
 	if (!free_l1_data_A_sram_head.next) {
-		printk(KERN_INFO"Fail to initialize Data A SRAM.\n");
+		printk(KERN_INFO"Fail to initialize L1 Data A SRAM.\n");
 		return;
 	}
 
@@ -110,7 +115,7 @@ static void __init l1_data_sram_init(void)
 
 	used_l1_data_A_sram_head.next = NULL;
 
-	printk(KERN_INFO "Blackfin Data A SRAM: %d KB (%d KB free)\n",
+	printk(KERN_INFO "Blackfin L1 Data A SRAM: %d KB (%d KB free)\n",
 		L1_DATA_A_LENGTH >> 10,
 		free_l1_data_A_sram_head.next->size >> 10);
 #endif
@@ -118,7 +123,7 @@ static void __init l1_data_sram_init(void)
 	free_l1_data_B_sram_head.next =
 		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
 	if (!free_l1_data_B_sram_head.next) {
-		printk(KERN_INFO"Fail to initialize Data B SRAM.\n");
+		printk(KERN_INFO"Fail to initialize L1 Data B SRAM.\n");
 		return;
 	}
 
@@ -131,7 +136,7 @@ static void __init l1_data_sram_init(void)
 
 	used_l1_data_B_sram_head.next = NULL;
 
-	printk(KERN_INFO "Blackfin Data B SRAM: %d KB (%d KB free)\n",
+	printk(KERN_INFO "Blackfin L1 Data B SRAM: %d KB (%d KB free)\n",
 		L1_DATA_B_LENGTH >> 10,
 		free_l1_data_B_sram_head.next->size >> 10);
 #endif
@@ -146,7 +151,7 @@ static void __init l1_inst_sram_init(void)
 	free_l1_inst_sram_head.next =
 		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
 	if (!free_l1_inst_sram_head.next) {
-		printk(KERN_INFO"Fail to initialize Instruction SRAM.\n");
+		printk(KERN_INFO"Fail to initialize L1 Instruction SRAM.\n");
 		return;
 	}
 
@@ -159,7 +164,7 @@ static void __init l1_inst_sram_init(void)
 
 	used_l1_inst_sram_head.next = NULL;
 
-	printk(KERN_INFO "Blackfin Instruction SRAM: %d KB (%d KB free)\n",
+	printk(KERN_INFO "Blackfin L1 Instruction SRAM: %d KB (%d KB free)\n",
 		L1_CODE_LENGTH >> 10,
 		free_l1_inst_sram_head.next->size >> 10);
 #endif
@@ -168,6 +173,33 @@ static void __init l1_inst_sram_init(void)
 	spin_lock_init(&l1_inst_sram_lock);
 }
 
+static void __init l2_sram_init(void)
+{
+#ifdef L2_LENGTH
+	free_l2_sram_head.next =
+		kmem_cache_alloc(sram_piece_cache, GFP_KERNEL);
+	if (!free_l2_sram_head.next) {
+		printk(KERN_INFO"Fail to initialize L2 SRAM.\n");
+		return;
+	}
+
+	free_l2_sram_head.next->paddr = (void *)L2_START +
+		(_etext_l2 - _stext_l2) + (_edata_l2 - _sdata_l2);
+	free_l2_sram_head.next->size = L2_LENGTH -
+		(_etext_l2 - _stext_l2) + (_edata_l2 - _sdata_l2);
+	free_l2_sram_head.next->pid = 0;
+	free_l2_sram_head.next->next = NULL;
+
+	used_l2_sram_head.next = NULL;
+
+	printk(KERN_INFO "Blackfin L2 SRAM: %d KB (%d KB free)\n",
+		L2_LENGTH >> 10,
+		free_l2_sram_head.next->size >> 10);
+#endif
+
+	/* mutex initialize */
+	spin_lock_init(&l2_sram_lock);
+}
 void __init bfin_sram_init(void)
 {
 	sram_piece_cache = kmem_cache_create("sram_piece_cache",
@@ -177,10 +209,11 @@ void __init bfin_sram_init(void)
 	l1sram_init();
 	l1_data_sram_init();
 	l1_inst_sram_init();
+	l2_sram_init();
 }
 
-/* L1 memory allocate function */
-static void *_l1_sram_alloc(size_t size, struct sram_piece *pfree_head,
+/* SRAM allocate function */
+static void *_sram_alloc(size_t size, struct sram_piece *pfree_head,
 		struct sram_piece *pused_head)
 {
 	struct sram_piece *pslot, *plast, *pavail;
@@ -236,7 +269,7 @@ static void *_l1_sram_alloc(size_t size, struct sram_piece *pfree_head,
 }
 
 /* Allocate the largest available block.  */
-static void *_l1_sram_alloc_max(struct sram_piece *pfree_head,
+static void *_sram_alloc_max(struct sram_piece *pfree_head,
 				struct sram_piece *pused_head,
 				unsigned long *psize)
 {
@@ -259,11 +292,11 @@ static void *_l1_sram_alloc_max(struct sram_piece *pfree_head,
 
 	*psize = pmax->size;
 
-	return _l1_sram_alloc(*psize, pfree_head, pused_head);
+	return _sram_alloc(*psize, pfree_head, pused_head);
 }
 
-/* L1 memory free function */
-static int _l1_sram_free(const void *addr,
+/* SRAM free function */
+static int _sram_free(const void *addr,
 			struct sram_piece *pfree_head,
 			struct sram_piece *pused_head)
 {
@@ -333,6 +366,11 @@ int sram_free(const void *addr)
 	else if (addr >= (void *)L1_DATA_B_START
 		 && addr < (void *)(L1_DATA_B_START + L1_DATA_B_LENGTH))
 		return l1_data_B_sram_free(addr);
+#endif
+#ifdef L2_LENGTH
+	else if (addr >= (void *)L2_START
+		 && addr < (void *)(L2_START + L2_LENGTH))
+		return l2_sram_free(addr);
 #endif
 	else
 		return -1;
@@ -348,7 +386,7 @@ void *l1_data_A_sram_alloc(size_t size)
 	spin_lock_irqsave(&l1_data_sram_lock, flags);
 
 #if L1_DATA_A_LENGTH != 0
-	addr = _l1_sram_alloc(size, &free_l1_data_A_sram_head,
+	addr = _sram_alloc(size, &free_l1_data_A_sram_head,
 			&used_l1_data_A_sram_head);
 #endif
 
@@ -371,7 +409,7 @@ int l1_data_A_sram_free(const void *addr)
 	spin_lock_irqsave(&l1_data_sram_lock, flags);
 
 #if L1_DATA_A_LENGTH != 0
-	ret = _l1_sram_free(addr, &free_l1_data_A_sram_head,
+	ret = _sram_free(addr, &free_l1_data_A_sram_head,
 			&used_l1_data_A_sram_head);
 #else
 	ret = -1;
@@ -393,7 +431,7 @@ void *l1_data_B_sram_alloc(size_t size)
 	/* add mutex operation */
 	spin_lock_irqsave(&l1_data_sram_lock, flags);
 
-	addr = _l1_sram_alloc(size, &free_l1_data_B_sram_head,
+	addr = _sram_alloc(size, &free_l1_data_B_sram_head,
 			&used_l1_data_B_sram_head);
 
 	/* add mutex operation */
@@ -418,7 +456,7 @@ int l1_data_B_sram_free(const void *addr)
 	/* add mutex operation */
 	spin_lock_irqsave(&l1_data_sram_lock, flags);
 
-	ret = _l1_sram_free(addr, &free_l1_data_B_sram_head,
+	ret = _sram_free(addr, &free_l1_data_B_sram_head,
 			&used_l1_data_B_sram_head);
 
 	/* add mutex operation */
@@ -472,7 +510,7 @@ void *l1_inst_sram_alloc(size_t size)
 	/* add mutex operation */
 	spin_lock_irqsave(&l1_inst_sram_lock, flags);
 
-	addr = _l1_sram_alloc(size, &free_l1_inst_sram_head,
+	addr = _sram_alloc(size, &free_l1_inst_sram_head,
 			&used_l1_inst_sram_head);
 
 	/* add mutex operation */
@@ -497,7 +535,7 @@ int l1_inst_sram_free(const void *addr)
 	/* add mutex operation */
 	spin_lock_irqsave(&l1_inst_sram_lock, flags);
 
-	ret = _l1_sram_free(addr, &free_l1_inst_sram_head,
+	ret = _sram_free(addr, &free_l1_inst_sram_head,
 			&used_l1_inst_sram_head);
 
 	/* add mutex operation */
@@ -519,7 +557,7 @@ void *l1sram_alloc(size_t size)
 	/* add mutex operation */
 	spin_lock_irqsave(&l1sram_lock, flags);
 
-	addr = _l1_sram_alloc(size, &free_l1_ssram_head,
+	addr = _sram_alloc(size, &free_l1_ssram_head,
 			&used_l1_ssram_head);
 
 	/* add mutex operation */
@@ -537,7 +575,7 @@ void *l1sram_alloc_max(size_t *psize)
 	/* add mutex operation */
 	spin_lock_irqsave(&l1sram_lock, flags);
 
-	addr = _l1_sram_alloc_max(&free_l1_ssram_head,
+	addr = _sram_alloc_max(&free_l1_ssram_head,
 			&used_l1_ssram_head, psize);
 
 	/* add mutex operation */
@@ -555,7 +593,7 @@ int l1sram_free(const void *addr)
 	/* add mutex operation */
 	spin_lock_irqsave(&l1sram_lock, flags);
 
-	ret = _l1_sram_free(addr, &free_l1_ssram_head,
+	ret = _sram_free(addr, &free_l1_ssram_head,
 			&used_l1_ssram_head);
 
 	/* add mutex operation */
@@ -564,6 +602,64 @@ int l1sram_free(const void *addr)
 	return ret;
 }
 
+void *l2_sram_alloc(size_t size)
+{
+#ifdef L2_LENGTH
+	unsigned flags;
+	void *addr;
+
+	/* add mutex operation */
+	spin_lock_irqsave(&l2_sram_lock, flags);
+
+	addr = _sram_alloc(size, &free_l2_sram_head,
+			&used_l2_sram_head);
+
+	/* add mutex operation */
+	spin_unlock_irqrestore(&l2_sram_lock, flags);
+
+	pr_debug("Allocated address in l2_sram_alloc is 0x%lx+0x%lx\n",
+		 (long unsigned int)addr, size);
+
+	return addr;
+#else
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL(l2_sram_alloc);
+
+void *l2_sram_zalloc(size_t size)
+{
+	void *addr = l2_sram_alloc(size);
+
+	if (addr)
+		memset(addr, 0x00, size);
+
+	return addr;
+}
+EXPORT_SYMBOL(l2_sram_zalloc);
+
+int l2_sram_free(const void *addr)
+{
+#ifdef L2_LENGTH
+	unsigned flags;
+	int ret;
+
+	/* add mutex operation */
+	spin_lock_irqsave(&l2_sram_lock, flags);
+
+	ret = _sram_free(addr, &free_l2_sram_head,
+			&used_l2_sram_head);
+
+	/* add mutex operation */
+	spin_unlock_irqrestore(&l2_sram_lock, flags);
+
+	return ret;
+#else
+	return -1;
+#endif
+}
+EXPORT_SYMBOL(l2_sram_free);
+
 int sram_free_with_lsl(const void *addr)
 {
 	struct sram_list_struct *lsl, **tmp;
@@ -602,6 +698,9 @@ void *sram_alloc_with_lsl(size_t size, unsigned long flags)
 	if (addr == NULL && (flags & L1_DATA_B_SRAM))
 		addr = l1_data_B_sram_alloc(size);
 
+	if (addr == NULL && (flags & L2_SRAM))
+		addr = l2_sram_alloc(size);
+
 	if (addr == NULL) {
 		kfree(lsl);
 		return NULL;
@@ -621,7 +720,7 @@ EXPORT_SYMBOL(sram_alloc_with_lsl);
 /* Need to keep line of output the same.  Currently, that is 44 bytes
  * (including newline).
  */
-static int _l1sram_proc_read(char *buf, int *len, int count, const char *desc,
+static int _sram_proc_read(char *buf, int *len, int count, const char *desc,
 		struct sram_piece *pfree_head,
 		struct sram_piece *pused_head)
 {
@@ -630,13 +729,13 @@ static int _l1sram_proc_read(char *buf, int *len, int count, const char *desc,
 	if (!pfree_head || !pused_head)
 		return -1;
 
-	*len += sprintf(&buf[*len], "--- L1 %-14s Size   PID State     \n", desc);
+	*len += sprintf(&buf[*len], "--- SRAM %-14s Size   PID State     \n", desc);
 
 	/* search the relevant memory slot */
 	pslot = pused_head->next;
 
 	while (pslot != NULL) {
-		*len += sprintf(&buf[*len], "%p-%p %8i %5i %-10s\n",
+		*len += sprintf(&buf[*len], "%p-%p %10i %5i %-10s\n",
 			pslot->paddr, pslot->paddr + pslot->size,
 			pslot->size, pslot->pid, "ALLOCATED");
 
@@ -646,7 +745,7 @@ static int _l1sram_proc_read(char *buf, int *len, int count, const char *desc,
 	pslot = pfree_head->next;
 
 	while (pslot != NULL) {
-		*len += sprintf(&buf[*len], "%p-%p %8i %5i %-10s\n",
+		*len += sprintf(&buf[*len], "%p-%p %10i %5i %-10s\n",
 			pslot->paddr, pslot->paddr + pslot->size,
 			pslot->size, pslot->pid, "FREE");
 
@@ -655,38 +754,43 @@ static int _l1sram_proc_read(char *buf, int *len, int count, const char *desc,
 
 	return 0;
 }
-static int l1sram_proc_read(char *buf, char **start, off_t offset, int count,
+static int sram_proc_read(char *buf, char **start, off_t offset, int count,
 		int *eof, void *data)
 {
 	int len = 0;
 
-	if (_l1sram_proc_read(buf, &len, count, "Scratchpad",
+	if (_sram_proc_read(buf, &len, count, "Scratchpad",
 			&free_l1_ssram_head, &used_l1_ssram_head))
 		goto not_done;
 #if L1_DATA_A_LENGTH != 0
-	if (_l1sram_proc_read(buf, &len, count, "Data A",
+	if (_sram_proc_read(buf, &len, count, "L1 Data A",
 			&free_l1_data_A_sram_head,
 			&used_l1_data_A_sram_head))
 		goto not_done;
 #endif
 #if L1_DATA_B_LENGTH != 0
-	if (_l1sram_proc_read(buf, &len, count, "Data B",
+	if (_sram_proc_read(buf, &len, count, "L1 Data B",
 			&free_l1_data_B_sram_head,
 			&used_l1_data_B_sram_head))
 		goto not_done;
 #endif
 #if L1_CODE_LENGTH != 0
-	if (_l1sram_proc_read(buf, &len, count, "Instruction",
+	if (_sram_proc_read(buf, &len, count, "L1 Instruction",
 			&free_l1_inst_sram_head, &used_l1_inst_sram_head))
 		goto not_done;
 #endif
+#ifdef L2_LENGTH
+	if (_sram_proc_read(buf, &len, count, "L2",
+			&free_l2_sram_head, &used_l2_sram_head))
+		goto not_done;
+#endif
 
 	*eof = 1;
  not_done:
 	return len;
 }
 
-static int __init l1sram_proc_init(void)
+static int __init sram_proc_init(void)
 {
 	struct proc_dir_entry *ptr;
 	ptr = create_proc_entry("sram", S_IFREG | S_IRUGO, NULL);
@@ -695,8 +799,8 @@ static int __init l1sram_proc_init(void)
 		return -1;
 	}
 	ptr->owner = THIS_MODULE;
-	ptr->read_proc = l1sram_proc_read;
+	ptr->read_proc = sram_proc_read;
 	return 0;
 }
-late_initcall(l1sram_proc_init);
+late_initcall(sram_proc_init);
 #endif
diff --git a/include/asm-blackfin/bfin-global.h b/include/asm-blackfin/bfin-global.h
index 76033831eb35..320aa5e167e9 100644
--- a/include/asm-blackfin/bfin-global.h
+++ b/include/asm-blackfin/bfin-global.h
@@ -92,16 +92,20 @@ extern void *l1_data_B_sram_alloc(size_t);
 extern void *l1_inst_sram_alloc(size_t);
 extern void *l1_data_sram_alloc(size_t);
 extern void *l1_data_sram_zalloc(size_t);
+extern void *l2_sram_alloc(size_t);
+extern void *l2_sram_zalloc(size_t);
 extern int l1_data_A_sram_free(const void*);
 extern int l1_data_B_sram_free(const void*);
 extern int l1_inst_sram_free(const void*);
 extern int l1_data_sram_free(const void*);
+extern int l2_sram_free(const void *);
 extern int sram_free(const void*);
 
 #define L1_INST_SRAM		0x00000001
 #define L1_DATA_A_SRAM		0x00000002
 #define L1_DATA_B_SRAM		0x00000004
 #define L1_DATA_SRAM		0x00000006
+#define L2_SRAM			0x00000008
 extern void *sram_alloc_with_lsl(size_t, unsigned long);
 extern int sram_free_with_lsl(const void*);
 
@@ -114,7 +118,9 @@ extern struct file_operations dpmc_fops;
 extern unsigned long _ramstart, _ramend, _rambase;
 extern unsigned long memory_start, memory_end, physical_mem_end;
 extern char _stext_l1[], _etext_l1[], _sdata_l1[], _edata_l1[], _sbss_l1[],
-    _ebss_l1[], _l1_lma_start[], _sdata_b_l1[], _ebss_b_l1[];
+	_ebss_l1[], _l1_lma_start[], _sdata_b_l1[], _ebss_b_l1[],
+	_stext_l2[], _etext_l2[], _sdata_l2[], _edata_l2[], _sbss_l2[],
+	_ebss_l2[], _l2_lma_start[];
 
 #ifdef CONFIG_MTD_UCLINUX
 extern unsigned long memory_mtd_start, memory_mtd_end, mtd_size;
diff --git a/include/asm-blackfin/elf.h b/include/asm-blackfin/elf.h
index 30303fc8292c..67a03a8a353e 100644
--- a/include/asm-blackfin/elf.h
+++ b/include/asm-blackfin/elf.h
@@ -15,6 +15,8 @@
 #define EF_BFIN_FDPIC		0x00000002	/* -mfdpic */
 #define EF_BFIN_CODE_IN_L1	0x00000010	/* --code-in-l1 */
 #define EF_BFIN_DATA_IN_L1	0x00000020	/* --data-in-l1 */
+#define EF_BFIN_CODE_IN_L2	0x00000040	/* --code-in-l2 */
+#define EF_BFIN_DATA_IN_L2	0x00000080	/* --data-in-l2 */
 
 typedef unsigned long elf_greg_t;
 
diff --git a/include/asm-blackfin/module.h b/include/asm-blackfin/module.h
index 3c7ce1644280..e3128df139d6 100644
--- a/include/asm-blackfin/module.h
+++ b/include/asm-blackfin/module.h
@@ -6,8 +6,6 @@
 #define Elf_Shdr        Elf32_Shdr
 #define Elf_Sym         Elf32_Sym
 #define Elf_Ehdr        Elf32_Ehdr
-#define FLG_CODE_IN_L1	0x10
-#define FLG_DATA_IN_L1	0x20
 
 struct mod_arch_specific {
 	Elf_Shdr	*text_l1;
@@ -15,5 +13,8 @@ struct mod_arch_specific {
 	Elf_Shdr	*bss_a_l1;
 	Elf_Shdr	*data_b_l1;
 	Elf_Shdr	*bss_b_l1;
+	Elf_Shdr	*text_l2;
+	Elf_Shdr	*data_l2;
+	Elf_Shdr	*bss_l2;
 };
 #endif				/* _ASM_BFIN_MODULE_H */
-- 
cgit v1.2.3


From aa3348f461da1df5c583f9916ab80298ddd68eff Mon Sep 17 00:00:00 2001
From: Graf Yang <graf.yang@analog.com>
Date: Sat, 19 Jul 2008 15:54:10 +0800
Subject: Blackfin arch: Add return value check in bfin_sir_probe(), remove
 SSYNC().

Signed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
---
 include/asm-blackfin/mach-bf527/bfin_sir.h | 21 ++++++++++++-----
 include/asm-blackfin/mach-bf533/bfin_sir.h | 13 +++++++----
 include/asm-blackfin/mach-bf537/bfin_sir.h | 21 ++++++++++++-----
 include/asm-blackfin/mach-bf548/bfin_sir.h | 37 ++++++++++++++++++++++--------
 include/asm-blackfin/mach-bf561/bfin_sir.h | 13 +++++++----
 5 files changed, 75 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/asm-blackfin/mach-bf527/bfin_sir.h b/include/asm-blackfin/mach-bf527/bfin_sir.h
index 0612d0c9501c..cfd8ad4f1f2c 100644
--- a/include/asm-blackfin/mach-bf527/bfin_sir.h
+++ b/include/asm-blackfin/mach-bf527/bfin_sir.h
@@ -118,16 +118,25 @@ static inline void SIR_UART_CLEAR_LSR(struct bfin_sir_port *port)
 
 #define DRIVER_NAME "bfin_sir"
 
-static void bfin_sir_hw_init(void)
+static int bfin_sir_hw_init(void)
 {
+	int ret = -ENODEV;
 #ifdef CONFIG_BFIN_SIR0
-	peripheral_request(P_UART0_TX, DRIVER_NAME);
-	peripheral_request(P_UART0_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART0_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART0_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
 
 #ifdef CONFIG_BFIN_SIR1
-	peripheral_request(P_UART1_TX, DRIVER_NAME);
-	peripheral_request(P_UART1_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART1_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART1_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
-	SSYNC();
+	return ret;
 }
diff --git a/include/asm-blackfin/mach-bf533/bfin_sir.h b/include/asm-blackfin/mach-bf533/bfin_sir.h
index cefcf8bb505b..9bb87e9e2e9b 100644
--- a/include/asm-blackfin/mach-bf533/bfin_sir.h
+++ b/include/asm-blackfin/mach-bf533/bfin_sir.h
@@ -110,11 +110,16 @@ static inline void SIR_UART_CLEAR_LSR(struct bfin_sir_port *port)
 
 #define DRIVER_NAME "bfin_sir"
 
-static void bfin_sir_hw_init(void)
+static int bfin_sir_hw_init(void)
 {
+	int ret = -ENODEV;
 #ifdef CONFIG_BFIN_SIR0
-	peripheral_request(P_UART0_TX, DRIVER_NAME);
-	peripheral_request(P_UART0_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART0_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART0_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
-	SSYNC();
+	return ret;
 }
diff --git a/include/asm-blackfin/mach-bf537/bfin_sir.h b/include/asm-blackfin/mach-bf537/bfin_sir.h
index 0612d0c9501c..cfd8ad4f1f2c 100644
--- a/include/asm-blackfin/mach-bf537/bfin_sir.h
+++ b/include/asm-blackfin/mach-bf537/bfin_sir.h
@@ -118,16 +118,25 @@ static inline void SIR_UART_CLEAR_LSR(struct bfin_sir_port *port)
 
 #define DRIVER_NAME "bfin_sir"
 
-static void bfin_sir_hw_init(void)
+static int bfin_sir_hw_init(void)
 {
+	int ret = -ENODEV;
 #ifdef CONFIG_BFIN_SIR0
-	peripheral_request(P_UART0_TX, DRIVER_NAME);
-	peripheral_request(P_UART0_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART0_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART0_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
 
 #ifdef CONFIG_BFIN_SIR1
-	peripheral_request(P_UART1_TX, DRIVER_NAME);
-	peripheral_request(P_UART1_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART1_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART1_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
-	SSYNC();
+	return ret;
 }
diff --git a/include/asm-blackfin/mach-bf548/bfin_sir.h b/include/asm-blackfin/mach-bf548/bfin_sir.h
index 5e94271c7e3b..c41f9cf00268 100644
--- a/include/asm-blackfin/mach-bf548/bfin_sir.h
+++ b/include/asm-blackfin/mach-bf548/bfin_sir.h
@@ -124,26 +124,43 @@ struct bfin_sir_self {
 
 #define DRIVER_NAME "bfin_sir"
 
-static void bfin_sir_hw_init(void)
+static int bfin_sir_hw_init(void)
 {
+	int ret = -ENODEV;
 #ifdef CONFIG_BFIN_SIR0
-	peripheral_request(P_UART0_TX, DRIVER_NAME);
-	peripheral_request(P_UART0_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART0_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART0_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
 
 #ifdef CONFIG_BFIN_SIR1
-	peripheral_request(P_UART1_TX, DRIVER_NAME);
-	peripheral_request(P_UART1_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART1_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART1_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
 
 #ifdef CONFIG_BFIN_SIR2
-	peripheral_request(P_UART2_TX, DRIVER_NAME);
-	peripheral_request(P_UART2_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART2_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART2_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
 
 #ifdef CONFIG_BFIN_SIR3
-	peripheral_request(P_UART3_TX, DRIVER_NAME);
-	peripheral_request(P_UART3_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART3_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART3_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
-	SSYNC();
+	return ret;
 }
diff --git a/include/asm-blackfin/mach-bf561/bfin_sir.h b/include/asm-blackfin/mach-bf561/bfin_sir.h
index cefcf8bb505b..9bb87e9e2e9b 100644
--- a/include/asm-blackfin/mach-bf561/bfin_sir.h
+++ b/include/asm-blackfin/mach-bf561/bfin_sir.h
@@ -110,11 +110,16 @@ static inline void SIR_UART_CLEAR_LSR(struct bfin_sir_port *port)
 
 #define DRIVER_NAME "bfin_sir"
 
-static void bfin_sir_hw_init(void)
+static int bfin_sir_hw_init(void)
 {
+	int ret = -ENODEV;
 #ifdef CONFIG_BFIN_SIR0
-	peripheral_request(P_UART0_TX, DRIVER_NAME);
-	peripheral_request(P_UART0_RX, DRIVER_NAME);
+	ret = peripheral_request(P_UART0_TX, DRIVER_NAME);
+	if (ret)
+		return ret;
+	ret = peripheral_request(P_UART0_RX, DRIVER_NAME);
+	if (ret)
+		return ret;
 #endif
-	SSYNC();
+	return ret;
 }
-- 
cgit v1.2.3


From 0138da6101fa3cdfea7f470d014c2f13cc03e7a9 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Sat, 19 Jul 2008 16:56:53 +0800
Subject: Blackfin arch: fix bug - detect 0.1 silicon revision BF527-EZKIT as
 0.0 version

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
---
 include/asm-blackfin/processor.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-blackfin/processor.h b/include/asm-blackfin/processor.h
index 1c0040724612..6f3995b119d8 100644
--- a/include/asm-blackfin/processor.h
+++ b/include/asm-blackfin/processor.h
@@ -112,7 +112,26 @@ unsigned long get_wchan(struct task_struct *p);
 static inline uint32_t __pure bfin_revid(void)
 {
 	/* stored in the upper 4 bits */
-	return bfin_read_CHIPID() >> 28;
+	uint32_t revid = bfin_read_CHIPID() >> 28;
+
+#ifdef CONFIG_BF52x
+	/* ANOMALY_05000357
+	 * Incorrect Revision Number in DSPID Register
+	 */
+	if (revid == 0)
+		switch (bfin_read16(_BOOTROM_GET_DXE_ADDRESS_TWI)) {
+		case 0x0010:
+			revid = 0;
+			break;
+		case 0x2796:
+			revid = 1;
+			break;
+		default:
+			revid = 0xFFFF;
+			break;
+		}
+#endif
+	return revid;
 }
 
 static inline uint32_t __pure bfin_compiled_revid(void)
-- 
cgit v1.2.3


From 36ac26171afa8dbf29226199699fe955d4a0b6f6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 26 Jul 2008 11:22:33 +0200
Subject: crashdump: fix undefined reference to `elfcorehdr_addr'

fix build bug introduced by 95b68dec0d5 "calgary iommu: use the first
kernels TCE tables in kdump":

arch/x86/kernel/built-in.o: In function `calgary_iommu_init':
(.init.text+0x8399): undefined reference to `elfcorehdr_addr'
arch/x86/kernel/built-in.o: In function `calgary_iommu_init':
(.init.text+0x856c): undefined reference to `elfcorehdr_addr'
arch/x86/kernel/built-in.o: In function `detect_calgary':
(.init.text+0x8c68): undefined reference to `elfcorehdr_addr'
arch/x86/kernel/built-in.o: In function `detect_calgary':
(.init.text+0x8d0c): undefined reference to `elfcorehdr_addr'

make elfcorehdr_addr a generally available symbol.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/crash_dump.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 6cd39a927e1f..025e4f575103 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -8,7 +8,13 @@
 #include <linux/proc_fs.h>
 
 #define ELFCORE_ADDR_MAX	(-1ULL)
+
+#ifdef CONFIG_PROC_VMCORE
 extern unsigned long long elfcorehdr_addr;
+#else
+static const unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+#endif
+
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
 extern const struct file_operations proc_vmcore_operations;
-- 
cgit v1.2.3


From 071375bc76ee86e58592f4682030c81d410ddfd9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 26 Jul 2008 14:52:26 +0200
Subject: x86, RDC321x: remove gpio.h complications

Remove the include/asm-x86/gpio.h specials, just use the generic
version.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/gpio.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86/gpio.h b/include/asm-x86/gpio.h
index 116e9147fe66..c4c91b37c104 100644
--- a/include/asm-x86/gpio.h
+++ b/include/asm-x86/gpio.h
@@ -16,10 +16,6 @@
 #ifndef _ASM_I386_GPIO_H
 #define _ASM_I386_GPIO_H
 
-#ifdef CONFIG_X86_RDC321X
-#include <gpio.h>
-#else /* CONFIG_X86_RDC321X */
-
 #include <asm-generic/gpio.h>
 
 #ifdef CONFIG_GPIOLIB
@@ -57,6 +53,4 @@ static inline int irq_to_gpio(unsigned int irq)
 
 #endif /* CONFIG_GPIOLIB */
 
-#endif /* CONFIG_X86_RDC321X */
-
 #endif /* _ASM_I386_GPIO_H */
-- 
cgit v1.2.3


From 1ca9fda4b2f3413e4bc748b983f5585629ca0560 Mon Sep 17 00:00:00 2001
From: Chris McDermott <lcm@linux.vnet.ibm.com>
Date: Thu, 24 Jul 2008 19:06:09 -0700
Subject: x86: fix IBM Summit based systems' phys_cpu_present_map on 32-bit
 kernels

x86 kernels on IBM Summit based systems will only online 1 CPU because the
phys_cpu_present_map is not set up correctly. Patch below applied to
2.6.26-git10.

Signed-off-by: Chris McDermott <lcm@linux.vnet.ibm.com>
Tested-by: Tim Pepper <lnxninga@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/mach-summit/mach_apic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-x86/mach-summit/mach_apic.h b/include/asm-x86/mach-summit/mach_apic.h
index 75d2c95005d7..c47e2ab5c5ca 100644
--- a/include/asm-x86/mach-summit/mach_apic.h
+++ b/include/asm-x86/mach-summit/mach_apic.h
@@ -122,7 +122,7 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map)
 
 static inline physid_mask_t apicid_to_cpu_present(int apicid)
 {
-	return physid_mask_of_physid(0);
+	return physid_mask_of_physid(apicid);
 }
 
 static inline void setup_portio_remap(void)
-- 
cgit v1.2.3


From 16d69265b930f7e2fa9eea381715696f780718f4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 25 Jul 2008 19:44:36 -0700
Subject: uninline arch_pick_mmap_layout()

Fix this, on avr32:

  include/linux/utsname.h:35,
                   from init/main.c:20:
  include/linux/sched.h: In function 'arch_pick_mmap_layout':
  include/linux/sched.h:2149: error: implicit declaration of function 'PAGE_ALIGN'

Reported-by: Adrian Bunk <bunk@kernel.org>
Cc: Haavard Skinnemoen <hskinnemoen@atmel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  9 ---------
 mm/util.c             | 10 ++++++++++
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 42036ffe6b00..3260a5c42b91 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2139,16 +2139,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
-#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 extern void arch_pick_mmap_layout(struct mm_struct *mm);
-#else
-static inline void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-	mm->mmap_base = TASK_UNMAPPED_BASE;
-	mm->get_unmapped_area = arch_get_unmapped_area;
-	mm->unmap_area = arch_unmap_area;
-}
-#endif
 
 #ifdef CONFIG_TRACING
 extern void
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..0efd83097ecf 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,3 +1,4 @@
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
@@ -136,3 +137,12 @@ char *strndup_user(const char __user *s, long n)
 	return p;
 }
 EXPORT_SYMBOL(strndup_user);
+
+#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	mm->mmap_base = TASK_UNMAPPED_BASE;
+	mm->get_unmapped_area = arch_get_unmapped_area;
+	mm->unmap_area = arch_unmap_area;
+}
+#endif
-- 
cgit v1.2.3


From 44ccac13c7f4728cf2992d49384671a176db74dd Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 19:44:38 -0700
Subject: include/video/atmel_lcdc.h must #include <linux/workqueue.h>

This patch fixes the following compile error caused by commit
d22579b837358cbef12ccca5adaf7e93ae09ab7a ("atmel_lcdfb: FIFO underflow
management"):

  In file included from arch/avr32/boards/atstk1000/atstk1004.c:21:
  include/video/atmel_lcdc.h:40: error: field 'task' has incomplete type

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/video/atmel_lcdc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/video/atmel_lcdc.h b/include/video/atmel_lcdc.h
index 1ccf462b433a..613173b5db69 100644
--- a/include/video/atmel_lcdc.h
+++ b/include/video/atmel_lcdc.h
@@ -22,6 +22,7 @@
 #ifndef __ATMEL_LCDC_H__
 #define __ATMEL_LCDC_H__
 
+#include <linux/workqueue.h>
 
 /* Way LCD wires are connected to the chip:
  * Some Atmel chips use BGR color mode (instead of standard RGB)
-- 
cgit v1.2.3


From 8d8bb39b9eba32dd70e87fd5ad5c5dd4ba118e06 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 25 Jul 2008 19:44:49 -0700
Subject: dma-mapping: add the device argument to dma_mapping_error()

Add per-device dma_mapping_ops support for CONFIG_X86_64 as POWER
architecture does:

This enables us to cleanly fix the Calgary IOMMU issue that some devices
are not behind the IOMMU (http://lkml.org/lkml/2008/5/8/423).

I think that per-device dma_mapping_ops support would be also helpful for
KVM people to support PCI passthrough but Andi thinks that this makes it
difficult to support the PCI passthrough (see the above thread).  So I
CC'ed this to KVM camp.  Comments are appreciated.

A pointer to dma_mapping_ops to struct dev_archdata is added.  If the
pointer is non NULL, DMA operations in asm/dma-mapping.h use it.  If it's
NULL, the system-wide dma_ops pointer is used as before.

If it's useful for KVM people, I plan to implement a mechanism to register
a hook called when a new pci (or dma capable) device is created (it works
with hot plugging).  It enables IOMMUs to set up an appropriate
dma_mapping_ops per device.

The major obstacle is that dma_mapping_error doesn't take a pointer to the
device unlike other DMA operations.  So x86 can't have dma_mapping_ops per
device.  Note all the POWER IOMMUs use the same dma_mapping_error function
so this is not a problem for POWER but x86 IOMMUs use different
dma_mapping_error functions.

The first patch adds the device argument to dma_mapping_error.  The patch
is trivial but large since it touches lots of drivers and dma-mapping.h in
all the architecture.

This patch:

dma_mapping_error() doesn't take a pointer to the device unlike other DMA
operations.  So we can't have dma_mapping_ops per device.

Note that POWER already has dma_mapping_ops per device but all the POWER
IOMMUs use the same dma_mapping_error function.  x86 IOMMUs use device
argument.

[akpm@linux-foundation.org: fix sge]
[akpm@linux-foundation.org: fix svc_rdma]
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: fix bnx2x]
[akpm@linux-foundation.org: fix s2io]
[akpm@linux-foundation.org: fix pasemi_mac]
[akpm@linux-foundation.org: fix sdhci]
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: fix sparc]
[akpm@linux-foundation.org: fix ibmvscsi]
Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Avi Kivity <avi@qumranet.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DMA-API.txt                      |  4 +-
 arch/arm/common/dmabounce.c                    |  2 +-
 arch/ia64/hp/common/hwsw_iommu.c               |  5 +-
 arch/ia64/hp/common/sba_iommu.c                |  2 +-
 arch/ia64/sn/pci/pci_dma.c                     |  2 +-
 arch/mips/mm/dma-default.c                     |  2 +-
 arch/powerpc/platforms/cell/celleb_scc_pciex.c |  2 +-
 arch/powerpc/platforms/cell/spider-pci.c       |  2 +-
 arch/powerpc/platforms/iseries/mf.c            |  2 +-
 arch/x86/kernel/pci-calgary_64.c               |  2 +-
 arch/x86/kernel/pci-dma.c                      | 27 ++++---
 arch/x86/kernel/pci-gart_64.c                  |  3 +-
 arch/x86/kernel/pci-nommu.c                    | 14 +---
 arch/x86/kernel/pci-swiotlb_64.c               |  2 +-
 drivers/firewire/fw-iso.c                      |  2 +-
 drivers/firewire/fw-ohci.c                     |  2 +-
 drivers/firewire/fw-sbp2.c                     |  8 +--
 drivers/infiniband/hw/ipath/ipath_sdma.c       |  2 +-
 drivers/infiniband/hw/ipath/ipath_user_sdma.c  |  6 +-
 drivers/infiniband/hw/mthca/mthca_eq.c         |  2 +-
 drivers/media/dvb/pluto2/pluto2.c              |  2 +-
 drivers/mmc/host/sdhci.c                       |  4 +-
 drivers/net/arm/ep93xx_eth.c                   |  4 +-
 drivers/net/bnx2x_main.c                       |  4 +-
 drivers/net/cxgb3/sge.c                        |  2 +-
 drivers/net/e100.c                             |  2 +-
 drivers/net/e1000e/ethtool.c                   |  4 +-
 drivers/net/e1000e/netdev.c                    | 11 +--
 drivers/net/ibmveth.c                          | 38 +++++-----
 drivers/net/iseries_veth.c                     |  4 +-
 drivers/net/mlx4/eq.c                          |  2 +-
 drivers/net/pasemi_mac.c                       |  6 +-
 drivers/net/qla3xxx.c                          | 12 ++--
 drivers/net/s2io.c                             | 48 +++++++------
 drivers/net/sfc/rx.c                           |  4 +-
 drivers/net/sfc/tx.c                           |  7 +-
 drivers/net/spider_net.c                       |  4 +-
 drivers/net/tc35815.c                          |  4 +-
 drivers/net/wireless/ath5k/base.c              |  4 +-
 drivers/scsi/ibmvscsi/ibmvfc.c                 |  4 +-
 drivers/scsi/ibmvscsi/ibmvscsi.c               |  4 +-
 drivers/scsi/ibmvscsi/ibmvstgt.c               |  2 +-
 drivers/scsi/ibmvscsi/rpa_vscsi.c              |  2 +-
 drivers/spi/atmel_spi.c                        |  4 +-
 drivers/spi/au1550_spi.c                       |  6 +-
 drivers/spi/omap2_mcspi.c                      |  4 +-
 drivers/spi/pxa2xx_spi.c                       |  4 +-
 drivers/spi/spi_imx.c                          |  6 +-
 include/asm-alpha/dma-mapping.h                |  6 +-
 include/asm-alpha/pci.h                        |  2 +-
 include/asm-arm/dma-mapping.h                  |  2 +-
 include/asm-avr32/dma-mapping.h                |  2 +-
 include/asm-cris/dma-mapping.h                 |  2 +-
 include/asm-frv/dma-mapping.h                  |  2 +-
 include/asm-generic/dma-mapping-broken.h       |  2 +-
 include/asm-generic/dma-mapping.h              |  4 +-
 include/asm-generic/pci-dma-compat.h           |  4 +-
 include/asm-ia64/machvec.h                     |  2 +-
 include/asm-m68k/dma-mapping.h                 |  2 +-
 include/asm-mips/dma-mapping.h                 |  2 +-
 include/asm-mn10300/dma-mapping.h              |  2 +-
 include/asm-parisc/dma-mapping.h               |  2 +-
 include/asm-powerpc/dma-mapping.h              |  2 +-
 include/asm-sh/dma-mapping.h                   |  2 +-
 include/asm-sparc/dma-mapping_64.h             |  2 +-
 include/asm-sparc/pci_32.h                     |  3 +-
 include/asm-sparc/pci_64.h                     |  5 +-
 include/asm-x86/device.h                       |  3 +
 include/asm-x86/dma-mapping.h                  | 99 ++++++++++++++++++--------
 include/asm-x86/swiotlb.h                      |  2 +-
 include/asm-xtensa/dma-mapping.h               |  2 +-
 include/linux/i2o.h                            |  2 +-
 include/linux/ssb/ssb.h                        |  4 +-
 include/rdma/ib_verbs.h                        |  2 +-
 lib/swiotlb.c                                  |  4 +-
 net/sunrpc/xprtrdma/svc_rdma_sendto.c          |  3 +-
 76 files changed, 256 insertions(+), 210 deletions(-)

(limited to 'include')

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 80d150458c80..d8b63d164e41 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -298,10 +298,10 @@ recommended that you never use these unless you really know what the
 cache width is.
 
 int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 
 int
-pci_dma_mapping_error(dma_addr_t dma_addr)
+pci_dma_mapping_error(struct pci_dev *hwdev, dma_addr_t dma_addr)
 
 In some circumstances dma_map_single and dma_map_page will fail to create
 a mapping. A driver can check for these errors by testing the returned
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index dd2947342604..69130f365904 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -280,7 +280,7 @@ unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 	/*
 	 * Trying to unmap an invalid mapping
 	 */
-	if (dma_mapping_error(dma_addr)) {
+	if (dma_mapping_error(dev, dma_addr)) {
 		dev_err(dev, "Trying to unmap invalid mapping\n");
 		return;
 	}
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index 1c44ec2a1d58..88b6e6f3fd88 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -186,9 +186,10 @@ hwsw_dma_supported (struct device *dev, u64 mask)
 }
 
 int
-hwsw_dma_mapping_error (dma_addr_t dma_addr)
+hwsw_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
-	return hwiommu_dma_mapping_error (dma_addr) || swiotlb_dma_mapping_error(dma_addr);
+	return hwiommu_dma_mapping_error(dev, dma_addr) ||
+		swiotlb_dma_mapping_error(dev, dma_addr);
 }
 
 EXPORT_SYMBOL(hwsw_dma_mapping_error);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 34421aed1e2a..4956be40d7b5 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2147,7 +2147,7 @@ sba_dma_supported (struct device *dev, u64 mask)
 }
 
 int
-sba_dma_mapping_error (dma_addr_t dma_addr)
+sba_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index 52175af299a0..53ebb6484495 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -350,7 +350,7 @@ void sn_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(sn_dma_sync_sg_for_device);
 
-int sn_dma_mapping_error(dma_addr_t dma_addr)
+int sn_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index ae39dd88b9aa..891312f8e5a6 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -348,7 +348,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nele
 
 EXPORT_SYMBOL(dma_sync_sg_for_device);
 
-int dma_mapping_error(dma_addr_t dma_addr)
+int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/arch/powerpc/platforms/cell/celleb_scc_pciex.c b/arch/powerpc/platforms/cell/celleb_scc_pciex.c
index 0e04f8fb152a..3e7e0f1568ef 100644
--- a/arch/powerpc/platforms/cell/celleb_scc_pciex.c
+++ b/arch/powerpc/platforms/cell/celleb_scc_pciex.c
@@ -281,7 +281,7 @@ static int __init scc_pciex_iowa_init(struct iowa_bus *bus, void *data)
 
 	dummy_page_da = dma_map_single(bus->phb->parent, dummy_page_va,
 				       PAGE_SIZE, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dummy_page_da)) {
+	if (dma_mapping_error(bus->phb->parent, dummy_page_da)) {
 		pr_err("PCIEX:Map dummy page failed.\n");
 		kfree(dummy_page_va);
 		return -1;
diff --git a/arch/powerpc/platforms/cell/spider-pci.c b/arch/powerpc/platforms/cell/spider-pci.c
index 418b605ac35a..5122ec145271 100644
--- a/arch/powerpc/platforms/cell/spider-pci.c
+++ b/arch/powerpc/platforms/cell/spider-pci.c
@@ -111,7 +111,7 @@ static int __init spiderpci_pci_setup_chip(struct pci_controller *phb,
 
 	dummy_page_da = dma_map_single(phb->parent, dummy_page_va,
 				       PAGE_SIZE, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dummy_page_da)) {
+	if (dma_mapping_error(phb->parent, dummy_page_da)) {
 		pr_err("SPIDER-IOWA:Map dummy page filed.\n");
 		kfree(dummy_page_va);
 		return -1;
diff --git a/arch/powerpc/platforms/iseries/mf.c b/arch/powerpc/platforms/iseries/mf.c
index 1dc7295746da..731d7b157749 100644
--- a/arch/powerpc/platforms/iseries/mf.c
+++ b/arch/powerpc/platforms/iseries/mf.c
@@ -871,7 +871,7 @@ static int proc_mf_dump_cmdline(char *page, char **start, off_t off,
 		count = 256 - off;
 
 	dma_addr = iseries_hv_map(page, off + count, DMA_FROM_DEVICE);
-	if (dma_mapping_error(dma_addr))
+	if (dma_mapping_error(NULL, dma_addr))
 		return -ENOMEM;
 	memset(page, 0, off + count);
 	memset(&vsp_cmd, 0, sizeof(vsp_cmd));
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 19e7fc7c2c4f..1eb86be93d7a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -544,7 +544,7 @@ error:
 	return ret;
 }
 
-static const struct dma_mapping_ops calgary_dma_ops = {
+static struct dma_mapping_ops calgary_dma_ops = {
 	.alloc_coherent = calgary_alloc_coherent,
 	.map_single = calgary_map_single,
 	.unmap_single = calgary_unmap_single,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cbecb05551bb..37544123896d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,7 +11,7 @@
 
 static int forbid_dac __read_mostly;
 
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -312,6 +312,8 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr)
 
 int dma_supported(struct device *dev, u64 mask)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 #ifdef CONFIG_PCI
 	if (mask > 0xffffffff && forbid_dac > 0) {
 		dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -319,8 +321,8 @@ int dma_supported(struct device *dev, u64 mask)
 	}
 #endif
 
-	if (dma_ops->dma_supported)
-		return dma_ops->dma_supported(dev, mask);
+	if (ops->dma_supported)
+		return ops->dma_supported(dev, mask);
 
 	/* Copied from i386. Doesn't make much sense, because it will
 	   only work for pci_alloc_coherent.
@@ -367,6 +369,7 @@ void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		   gfp_t gfp)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
 	void *memory = NULL;
 	struct page *page;
 	unsigned long dma_mask = 0;
@@ -435,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 			/* Let low level make its own zone decisions */
 			gfp &= ~(GFP_DMA32|GFP_DMA);
 
-			if (dma_ops->alloc_coherent)
-				return dma_ops->alloc_coherent(dev, size,
+			if (ops->alloc_coherent)
+				return ops->alloc_coherent(dev, size,
 							   dma_handle, gfp);
 			return NULL;
 		}
@@ -448,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		}
 	}
 
-	if (dma_ops->alloc_coherent) {
+	if (ops->alloc_coherent) {
 		free_pages((unsigned long)memory, get_order(size));
 		gfp &= ~(GFP_DMA|GFP_DMA32);
-		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+		return ops->alloc_coherent(dev, size, dma_handle, gfp);
 	}
 
-	if (dma_ops->map_simple) {
-		*dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
+	if (ops->map_simple) {
+		*dma_handle = ops->map_simple(dev, virt_to_phys(memory),
 					      size,
 					      PCI_DMA_BIDIRECTIONAL);
 		if (*dma_handle != bad_dma_address)
@@ -477,12 +480,14 @@ EXPORT_SYMBOL(dma_alloc_coherent);
 void dma_free_coherent(struct device *dev, size_t size,
 			 void *vaddr, dma_addr_t bus)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 	int order = get_order(size);
 	WARN_ON(irqs_disabled());	/* for portability */
 	if (dma_release_coherent(dev, order, vaddr))
 		return;
-	if (dma_ops->unmap_single)
-		dma_ops->unmap_single(dev, bus, size, 0);
+	if (ops->unmap_single)
+		ops->unmap_single(dev, bus, size, 0);
 	free_pages((unsigned long)vaddr, order);
 }
 EXPORT_SYMBOL(dma_free_coherent);
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index df5f142657d2..744126e64950 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -692,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 extern int agp_amd64_init(void);
 
-static const struct dma_mapping_ops gart_dma_ops = {
-	.mapping_error			= NULL,
+static struct dma_mapping_ops gart_dma_ops = {
 	.map_single			= gart_map_single,
 	.map_simple			= gart_map_simple,
 	.unmap_single			= gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 792b9179eff3..3f91f71cdc3e 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
 	return nents;
 }
 
-/* Make sure we keep the same behaviour */
-static int nommu_mapping_error(dma_addr_t dma_addr)
-{
-#ifdef CONFIG_X86_32
-	return 0;
-#else
-	return (dma_addr == bad_dma_address);
-#endif
-}
-
-
-const struct dma_mapping_ops nommu_dma_ops = {
+struct dma_mapping_ops nommu_dma_ops = {
 	.map_single = nommu_map_single,
 	.map_sg = nommu_map_sg,
-	.mapping_error = nommu_mapping_error,
 	.is_phys = 1,
 };
 
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 20df839b9c20..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
 	return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
 }
 
-const struct dma_mapping_ops swiotlb_dma_ops = {
+struct dma_mapping_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
diff --git a/drivers/firewire/fw-iso.c b/drivers/firewire/fw-iso.c
index bcbe794a3ea5..e14c03dc0065 100644
--- a/drivers/firewire/fw-iso.c
+++ b/drivers/firewire/fw-iso.c
@@ -50,7 +50,7 @@ fw_iso_buffer_init(struct fw_iso_buffer *buffer, struct fw_card *card,
 
 		address = dma_map_page(card->device, buffer->pages[i],
 				       0, PAGE_SIZE, direction);
-		if (dma_mapping_error(address)) {
+		if (dma_mapping_error(card->device, address)) {
 			__free_page(buffer->pages[i]);
 			goto out_pages;
 		}
diff --git a/drivers/firewire/fw-ohci.c b/drivers/firewire/fw-ohci.c
index 333b12544dd1..566672e0bcff 100644
--- a/drivers/firewire/fw-ohci.c
+++ b/drivers/firewire/fw-ohci.c
@@ -953,7 +953,7 @@ at_context_queue_packet(struct context *ctx, struct fw_packet *packet)
 		payload_bus =
 			dma_map_single(ohci->card.device, packet->payload,
 				       packet->payload_length, DMA_TO_DEVICE);
-		if (dma_mapping_error(payload_bus)) {
+		if (dma_mapping_error(ohci->card.device, payload_bus)) {
 			packet->ack = RCODE_SEND_ERROR;
 			return -1;
 		}
diff --git a/drivers/firewire/fw-sbp2.c b/drivers/firewire/fw-sbp2.c
index 53fc5a641e6d..aaff50ebba1d 100644
--- a/drivers/firewire/fw-sbp2.c
+++ b/drivers/firewire/fw-sbp2.c
@@ -543,7 +543,7 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id,
 	orb->response_bus =
 		dma_map_single(device->card->device, &orb->response,
 			       sizeof(orb->response), DMA_FROM_DEVICE);
-	if (dma_mapping_error(orb->response_bus))
+	if (dma_mapping_error(device->card->device, orb->response_bus))
 		goto fail_mapping_response;
 
 	orb->request.response.high = 0;
@@ -577,7 +577,7 @@ sbp2_send_management_orb(struct sbp2_logical_unit *lu, int node_id,
 	orb->base.request_bus =
 		dma_map_single(device->card->device, &orb->request,
 			       sizeof(orb->request), DMA_TO_DEVICE);
-	if (dma_mapping_error(orb->base.request_bus))
+	if (dma_mapping_error(device->card->device, orb->base.request_bus))
 		goto fail_mapping_request;
 
 	sbp2_send_orb(&orb->base, lu, node_id, generation,
@@ -1424,7 +1424,7 @@ sbp2_map_scatterlist(struct sbp2_command_orb *orb, struct fw_device *device,
 	orb->page_table_bus =
 		dma_map_single(device->card->device, orb->page_table,
 			       sizeof(orb->page_table), DMA_TO_DEVICE);
-	if (dma_mapping_error(orb->page_table_bus))
+	if (dma_mapping_error(device->card->device, orb->page_table_bus))
 		goto fail_page_table;
 
 	/*
@@ -1509,7 +1509,7 @@ static int sbp2_scsi_queuecommand(struct scsi_cmnd *cmd, scsi_done_fn_t done)
 	orb->base.request_bus =
 		dma_map_single(device->card->device, &orb->request,
 			       sizeof(orb->request), DMA_TO_DEVICE);
-	if (dma_mapping_error(orb->base.request_bus))
+	if (dma_mapping_error(device->card->device, orb->base.request_bus))
 		goto out;
 
 	sbp2_send_orb(&orb->base, lu, lu->tgt->node_id, lu->generation,
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
index eaba03273e4f..284c9bca517e 100644
--- a/drivers/infiniband/hw/ipath/ipath_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_sdma.c
@@ -698,7 +698,7 @@ retry:
 
 	addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
 			      tx->map_len, DMA_TO_DEVICE);
-	if (dma_mapping_error(addr)) {
+	if (dma_mapping_error(&dd->pcidev->dev, addr)) {
 		ret = -EIO;
 		goto unlock;
 	}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
index 86e016916cd1..82d9a0b5ca2f 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -206,7 +206,7 @@ static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
 
 	dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
 				DMA_TO_DEVICE);
-	if (dma_mapping_error(dma_addr)) {
+	if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
 		ret = -ENOMEM;
 		goto free_unmap;
 	}
@@ -301,7 +301,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
 				     pages[j], 0, flen, DMA_TO_DEVICE);
 		unsigned long fofs = addr & ~PAGE_MASK;
 
-		if (dma_mapping_error(dma_addr)) {
+		if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
 			ret = -ENOMEM;
 			goto done;
 		}
@@ -508,7 +508,7 @@ static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
 		if (page) {
 			dma_addr = dma_map_page(&dd->pcidev->dev,
 						page, 0, len, DMA_TO_DEVICE);
-			if (dma_mapping_error(dma_addr)) {
+			if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
 				ret = -ENOMEM;
 				goto free_pbc;
 			}
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
index 4e36aa7cb3d2..cc6858f0b65b 100644
--- a/drivers/infiniband/hw/mthca/mthca_eq.c
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -780,7 +780,7 @@ int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
 		return -ENOMEM;
 	dev->eq_table.icm_dma  = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
 					      PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
-	if (pci_dma_mapping_error(dev->eq_table.icm_dma)) {
+	if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) {
 		__free_page(dev->eq_table.icm_page);
 		return -ENOMEM;
 	}
diff --git a/drivers/media/dvb/pluto2/pluto2.c b/drivers/media/dvb/pluto2/pluto2.c
index 1360403b88b6..a9653c63f4db 100644
--- a/drivers/media/dvb/pluto2/pluto2.c
+++ b/drivers/media/dvb/pluto2/pluto2.c
@@ -242,7 +242,7 @@ static int __devinit pluto_dma_map(struct pluto *pluto)
 	pluto->dma_addr = pci_map_single(pluto->pdev, pluto->dma_buf,
 			TS_DMA_BYTES, PCI_DMA_FROMDEVICE);
 
-	return pci_dma_mapping_error(pluto->dma_addr);
+	return pci_dma_mapping_error(pluto->pdev, pluto->dma_addr);
 }
 
 static void pluto_dma_unmap(struct pluto *pluto)
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index c3a5db72ddd7..5f95e10229b5 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -337,7 +337,7 @@ static int sdhci_adma_table_pre(struct sdhci_host *host,
 
 	host->align_addr = dma_map_single(mmc_dev(host->mmc),
 		host->align_buffer, 128 * 4, direction);
-	if (dma_mapping_error(host->align_addr))
+	if (dma_mapping_error(mmc_dev(host->mmc), host->align_addr))
 		goto fail;
 	BUG_ON(host->align_addr & 0x3);
 
@@ -439,7 +439,7 @@ static int sdhci_adma_table_pre(struct sdhci_host *host,
 
 	host->adma_addr = dma_map_single(mmc_dev(host->mmc),
 		host->adma_desc, (128 * 2 + 1) * 4, DMA_TO_DEVICE);
-	if (dma_mapping_error(host->align_addr))
+	if (dma_mapping_error(mmc_dev(host->mmc), host->align_addr))
 		goto unmap_entries;
 	BUG_ON(host->adma_addr & 0x3);
 
diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 7a14980f3472..18d3eeb7eab2 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -482,7 +482,7 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 			goto err;
 
 		d = dma_map_single(NULL, page, PAGE_SIZE, DMA_FROM_DEVICE);
-		if (dma_mapping_error(d)) {
+		if (dma_mapping_error(NULL, d)) {
 			free_page((unsigned long)page);
 			goto err;
 		}
@@ -505,7 +505,7 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 			goto err;
 
 		d = dma_map_single(NULL, page, PAGE_SIZE, DMA_TO_DEVICE);
-		if (dma_mapping_error(d)) {
+		if (dma_mapping_error(NULL, d)) {
 			free_page((unsigned long)page);
 			goto err;
 		}
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 0263bef9cc6d..c7cc760a1777 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -1020,7 +1020,7 @@ static inline int bnx2x_alloc_rx_sge(struct bnx2x *bp,
 
 	mapping = pci_map_page(bp->pdev, page, 0, BCM_PAGE_SIZE*PAGES_PER_SGE,
 			       PCI_DMA_FROMDEVICE);
-	if (unlikely(dma_mapping_error(mapping))) {
+	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
 		__free_pages(page, PAGES_PER_SGE_SHIFT);
 		return -ENOMEM;
 	}
@@ -1048,7 +1048,7 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
 
 	mapping = pci_map_single(bp->pdev, skb->data, bp->rx_buf_use_size,
 				 PCI_DMA_FROMDEVICE);
-	if (unlikely(dma_mapping_error(mapping))) {
+	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
 		dev_kfree_skb(skb);
 		return -ENOMEM;
 	}
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index a96331c875e6..1b0861d73ab7 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -386,7 +386,7 @@ static inline int add_one_rx_buf(void *va, unsigned int len,
 	dma_addr_t mapping;
 
 	mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE);
-	if (unlikely(pci_dma_mapping_error(mapping)))
+	if (unlikely(pci_dma_mapping_error(pdev, mapping)))
 		return -ENOMEM;
 
 	pci_unmap_addr_set(sd, dma_addr, mapping);
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 1037b1332312..19d32a227be1 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -1790,7 +1790,7 @@ static int e100_rx_alloc_skb(struct nic *nic, struct rx *rx)
 	rx->dma_addr = pci_map_single(nic->pdev, rx->skb->data,
 		RFD_BUF_LEN, PCI_DMA_BIDIRECTIONAL);
 
-	if (pci_dma_mapping_error(rx->dma_addr)) {
+	if (pci_dma_mapping_error(nic->pdev, rx->dma_addr)) {
 		dev_kfree_skb_any(rx->skb);
 		rx->skb = NULL;
 		rx->dma_addr = 0;
diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c
index a14561f40db0..9350564065e7 100644
--- a/drivers/net/e1000e/ethtool.c
+++ b/drivers/net/e1000e/ethtool.c
@@ -1090,7 +1090,7 @@ static int e1000_setup_desc_rings(struct e1000_adapter *adapter)
 		tx_ring->buffer_info[i].dma =
 			pci_map_single(pdev, skb->data, skb->len,
 				       PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(tx_ring->buffer_info[i].dma)) {
+		if (pci_dma_mapping_error(pdev, tx_ring->buffer_info[i].dma)) {
 			ret_val = 4;
 			goto err_nomem;
 		}
@@ -1153,7 +1153,7 @@ static int e1000_setup_desc_rings(struct e1000_adapter *adapter)
 		rx_ring->buffer_info[i].dma =
 			pci_map_single(pdev, skb->data, 2048,
 				       PCI_DMA_FROMDEVICE);
-		if (pci_dma_mapping_error(rx_ring->buffer_info[i].dma)) {
+		if (pci_dma_mapping_error(pdev, rx_ring->buffer_info[i].dma)) {
 			ret_val = 8;
 			goto err_nomem;
 		}
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 9c0f56b3c518..d13677899767 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -195,7 +195,7 @@ map_skb:
 		buffer_info->dma = pci_map_single(pdev, skb->data,
 						  adapter->rx_buffer_len,
 						  PCI_DMA_FROMDEVICE);
-		if (pci_dma_mapping_error(buffer_info->dma)) {
+		if (pci_dma_mapping_error(pdev, buffer_info->dma)) {
 			dev_err(&pdev->dev, "RX DMA map failed\n");
 			adapter->rx_dma_failed++;
 			break;
@@ -265,7 +265,7 @@ static void e1000_alloc_rx_buffers_ps(struct e1000_adapter *adapter,
 						   ps_page->page,
 						   0, PAGE_SIZE,
 						   PCI_DMA_FROMDEVICE);
-				if (pci_dma_mapping_error(ps_page->dma)) {
+				if (pci_dma_mapping_error(pdev, ps_page->dma)) {
 					dev_err(&adapter->pdev->dev,
 					  "RX DMA page map failed\n");
 					adapter->rx_dma_failed++;
@@ -300,7 +300,7 @@ static void e1000_alloc_rx_buffers_ps(struct e1000_adapter *adapter,
 		buffer_info->dma = pci_map_single(pdev, skb->data,
 						  adapter->rx_ps_bsize0,
 						  PCI_DMA_FROMDEVICE);
-		if (pci_dma_mapping_error(buffer_info->dma)) {
+		if (pci_dma_mapping_error(pdev, buffer_info->dma)) {
 			dev_err(&pdev->dev, "RX DMA map failed\n");
 			adapter->rx_dma_failed++;
 			/* cleanup skb */
@@ -3344,7 +3344,7 @@ static int e1000_tx_map(struct e1000_adapter *adapter,
 				skb->data + offset,
 				size,
 				PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(buffer_info->dma)) {
+		if (pci_dma_mapping_error(adapter->pdev, buffer_info->dma)) {
 			dev_err(&adapter->pdev->dev, "TX DMA map failed\n");
 			adapter->tx_dma_failed++;
 			return -1;
@@ -3382,7 +3382,8 @@ static int e1000_tx_map(struct e1000_adapter *adapter,
 					offset,
 					size,
 					PCI_DMA_TODEVICE);
-			if (pci_dma_mapping_error(buffer_info->dma)) {
+			if (pci_dma_mapping_error(adapter->pdev,
+						  buffer_info->dma)) {
 				dev_err(&adapter->pdev->dev,
 					"TX DMA page map failed\n");
 				adapter->tx_dma_failed++;
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index e5a6e2e84540..91ec9fdc7184 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -260,7 +260,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
 		dma_addr = dma_map_single(&adapter->vdev->dev, skb->data,
 				pool->buff_size, DMA_FROM_DEVICE);
 
-		if (dma_mapping_error(dma_addr))
+		if (dma_mapping_error((&adapter->vdev->dev, dma_addr))
 			goto failure;
 
 		pool->free_map[free_index] = IBM_VETH_INVALID_MAP;
@@ -294,7 +294,7 @@ failure:
 		pool->consumer_index = pool->size - 1;
 	else
 		pool->consumer_index--;
-	if (!dma_mapping_error(dma_addr))
+	if (!dma_mapping_error((&adapter->vdev->dev, dma_addr))
 		dma_unmap_single(&adapter->vdev->dev,
 		                 pool->dma_addr[index], pool->buff_size,
 		                 DMA_FROM_DEVICE);
@@ -448,11 +448,11 @@ static void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter)
 static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 {
 	int i;
+	struct device *dev = &adapter->vdev->dev;
 
 	if(adapter->buffer_list_addr != NULL) {
-		if(!dma_mapping_error(adapter->buffer_list_dma)) {
-			dma_unmap_single(&adapter->vdev->dev,
-					adapter->buffer_list_dma, 4096,
+		if (!dma_mapping_error(dev, adapter->buffer_list_dma)) {
+			dma_unmap_single(dev, adapter->buffer_list_dma, 4096,
 					DMA_BIDIRECTIONAL);
 			adapter->buffer_list_dma = DMA_ERROR_CODE;
 		}
@@ -461,9 +461,8 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 	}
 
 	if(adapter->filter_list_addr != NULL) {
-		if(!dma_mapping_error(adapter->filter_list_dma)) {
-			dma_unmap_single(&adapter->vdev->dev,
-					adapter->filter_list_dma, 4096,
+		if (!dma_mapping_error(dev, adapter->filter_list_dma)) {
+			dma_unmap_single(dev, adapter->filter_list_dma, 4096,
 					DMA_BIDIRECTIONAL);
 			adapter->filter_list_dma = DMA_ERROR_CODE;
 		}
@@ -472,8 +471,8 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 	}
 
 	if(adapter->rx_queue.queue_addr != NULL) {
-		if(!dma_mapping_error(adapter->rx_queue.queue_dma)) {
-			dma_unmap_single(&adapter->vdev->dev,
+		if (!dma_mapping_error(dev, adapter->rx_queue.queue_dma)) {
+			dma_unmap_single(dev,
 					adapter->rx_queue.queue_dma,
 					adapter->rx_queue.queue_len,
 					DMA_BIDIRECTIONAL);
@@ -535,6 +534,7 @@ static int ibmveth_open(struct net_device *netdev)
 	int rc;
 	union ibmveth_buf_desc rxq_desc;
 	int i;
+	struct device *dev;
 
 	ibmveth_debug_printk("open starting\n");
 
@@ -563,17 +563,19 @@ static int ibmveth_open(struct net_device *netdev)
 		return -ENOMEM;
 	}
 
-	adapter->buffer_list_dma = dma_map_single(&adapter->vdev->dev,
+	dev = &adapter->vdev->dev;
+
+	adapter->buffer_list_dma = dma_map_single(dev,
 			adapter->buffer_list_addr, 4096, DMA_BIDIRECTIONAL);
-	adapter->filter_list_dma = dma_map_single(&adapter->vdev->dev,
+	adapter->filter_list_dma = dma_map_single(dev,
 			adapter->filter_list_addr, 4096, DMA_BIDIRECTIONAL);
-	adapter->rx_queue.queue_dma = dma_map_single(&adapter->vdev->dev,
+	adapter->rx_queue.queue_dma = dma_map_single(dev,
 			adapter->rx_queue.queue_addr,
 			adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL);
 
-	if((dma_mapping_error(adapter->buffer_list_dma) ) ||
-	   (dma_mapping_error(adapter->filter_list_dma)) ||
-	   (dma_mapping_error(adapter->rx_queue.queue_dma))) {
+	if ((dma_mapping_error(dev, adapter->buffer_list_dma)) ||
+	    (dma_mapping_error(dev, adapter->filter_list_dma)) ||
+	    (dma_mapping_error(dev, adapter->rx_queue.queue_dma))) {
 		ibmveth_error_printk("unable to map filter or buffer list pages\n");
 		ibmveth_cleanup(adapter);
 		napi_disable(&adapter->napi);
@@ -645,7 +647,7 @@ static int ibmveth_open(struct net_device *netdev)
 	adapter->bounce_buffer_dma =
 	    dma_map_single(&adapter->vdev->dev, adapter->bounce_buffer,
 			   netdev->mtu + IBMVETH_BUFF_OH, DMA_BIDIRECTIONAL);
-	if (dma_mapping_error(adapter->bounce_buffer_dma)) {
+	if (dma_mapping_error(dev, adapter->bounce_buffer_dma)) {
 		ibmveth_error_printk("unable to map bounce buffer\n");
 		ibmveth_cleanup(adapter);
 		napi_disable(&adapter->napi);
@@ -922,7 +924,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 		buf[1] = 0;
 	}
 
-	if (dma_mapping_error(data_dma_addr)) {
+	if (dma_mapping_error((&adapter->vdev->dev, data_dma_addr)) {
 		if (!firmware_has_feature(FW_FEATURE_CMO))
 			ibmveth_error_printk("tx: unable to map xmit buffer\n");
 		skb_copy_from_linear_data(skb, adapter->bounce_buffer,
diff --git a/drivers/net/iseries_veth.c b/drivers/net/iseries_veth.c
index b8d0639c1cdf..c46864d626b2 100644
--- a/drivers/net/iseries_veth.c
+++ b/drivers/net/iseries_veth.c
@@ -1128,7 +1128,7 @@ static int veth_transmit_to_one(struct sk_buff *skb, HvLpIndex rlp,
 	msg->data.addr[0] = dma_map_single(port->dev, skb->data,
 				skb->len, DMA_TO_DEVICE);
 
-	if (dma_mapping_error(msg->data.addr[0]))
+	if (dma_mapping_error(port->dev, msg->data.addr[0]))
 		goto recycle_and_drop;
 
 	msg->dev = port->dev;
@@ -1226,7 +1226,7 @@ static void veth_recycle_msg(struct veth_lpar_connection *cnx,
 		dma_address = msg->data.addr[0];
 		dma_length = msg->data.len[0];
 
-		if (!dma_mapping_error(dma_address))
+		if (!dma_mapping_error(msg->dev, dma_address))
 			dma_unmap_single(msg->dev, dma_address, dma_length,
 					DMA_TO_DEVICE);
 
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index ea3a09aaa844..7df928d3a3d8 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -526,7 +526,7 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt)
 		return -ENOMEM;
 	priv->eq_table.icm_dma  = pci_map_page(dev->pdev, priv->eq_table.icm_page, 0,
 					       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
-	if (pci_dma_mapping_error(priv->eq_table.icm_dma)) {
+	if (pci_dma_mapping_error(dev->pdev, priv->eq_table.icm_dma)) {
 		__free_page(priv->eq_table.icm_page);
 		return -ENOMEM;
 	}
diff --git a/drivers/net/pasemi_mac.c b/drivers/net/pasemi_mac.c
index 993d87c9296f..edc0fd588985 100644
--- a/drivers/net/pasemi_mac.c
+++ b/drivers/net/pasemi_mac.c
@@ -650,7 +650,7 @@ static void pasemi_mac_replenish_rx_ring(const struct net_device *dev,
 				     mac->bufsz - LOCAL_SKB_ALIGN,
 				     PCI_DMA_FROMDEVICE);
 
-		if (unlikely(dma_mapping_error(dma))) {
+		if (unlikely(pci_dma_mapping_error(mac->dma_pdev, dma))) {
 			dev_kfree_skb_irq(info->skb);
 			break;
 		}
@@ -1519,7 +1519,7 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev)
 	map[0] = pci_map_single(mac->dma_pdev, skb->data, skb_headlen(skb),
 				PCI_DMA_TODEVICE);
 	map_size[0] = skb_headlen(skb);
-	if (dma_mapping_error(map[0]))
+	if (pci_dma_mapping_error(mac->dma_pdev, map[0]))
 		goto out_err_nolock;
 
 	for (i = 0; i < nfrags; i++) {
@@ -1529,7 +1529,7 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev)
 					frag->page_offset, frag->size,
 					PCI_DMA_TODEVICE);
 		map_size[i+1] = frag->size;
-		if (dma_mapping_error(map[i+1])) {
+		if (pci_dma_mapping_error(mac->dma_pdev, map[i+1])) {
 			nfrags = i;
 			goto out_err_nolock;
 		}
diff --git a/drivers/net/qla3xxx.c b/drivers/net/qla3xxx.c
index e7d48a352beb..e82b37bbd6c3 100644
--- a/drivers/net/qla3xxx.c
+++ b/drivers/net/qla3xxx.c
@@ -328,7 +328,7 @@ static void ql_release_to_lrg_buf_free_list(struct ql3_adapter *qdev,
 					     qdev->lrg_buffer_len -
 					     QL_HEADER_SPACE,
 					     PCI_DMA_FROMDEVICE);
-			err = pci_dma_mapping_error(map);
+			err = pci_dma_mapping_error(qdev->pdev, map);
 			if(err) {
 				printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 				       qdev->ndev->name, err);
@@ -1919,7 +1919,7 @@ static int ql_populate_free_queue(struct ql3_adapter *qdev)
 						     QL_HEADER_SPACE,
 						     PCI_DMA_FROMDEVICE);
 
-				err = pci_dma_mapping_error(map);
+				err = pci_dma_mapping_error(qdev->pdev, map);
 				if(err) {
 					printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 					       qdev->ndev->name, err);
@@ -2454,7 +2454,7 @@ static int ql_send_map(struct ql3_adapter *qdev,
 	 */
 	map = pci_map_single(qdev->pdev, skb->data, len, PCI_DMA_TODEVICE);
 
-	err = pci_dma_mapping_error(map);
+	err = pci_dma_mapping_error(qdev->pdev, map);
 	if(err) {
 		printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 		       qdev->ndev->name, err);
@@ -2487,7 +2487,7 @@ static int ql_send_map(struct ql3_adapter *qdev,
 						     sizeof(struct oal),
 						     PCI_DMA_TODEVICE);
 
-				err = pci_dma_mapping_error(map);
+				err = pci_dma_mapping_error(qdev->pdev, map);
 				if(err) {
 
 					printk(KERN_ERR "%s: PCI mapping outbound address list with error: %d\n",
@@ -2514,7 +2514,7 @@ static int ql_send_map(struct ql3_adapter *qdev,
 					 frag->page_offset, frag->size,
 					 PCI_DMA_TODEVICE);
 
-			err = pci_dma_mapping_error(map);
+			err = pci_dma_mapping_error(qdev->pdev, map);
 			if(err) {
 				printk(KERN_ERR "%s: PCI mapping frags failed with error: %d\n",
 				       qdev->ndev->name, err);
@@ -2916,7 +2916,7 @@ static int ql_alloc_large_buffers(struct ql3_adapter *qdev)
 					     QL_HEADER_SPACE,
 					     PCI_DMA_FROMDEVICE);
 
-			err = pci_dma_mapping_error(map);
+			err = pci_dma_mapping_error(qdev->pdev, map);
 			if(err) {
 				printk(KERN_ERR "%s: PCI mapping failed with error: %d\n",
 				       qdev->ndev->name, err);
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index 9dae40ccf048..86d77d05190a 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -2512,8 +2512,8 @@ static void stop_nic(struct s2io_nic *nic)
  *   Return Value:
  *  SUCCESS on success or an appropriate -ve value on failure.
  */
-
-static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
+static int fill_rx_buffers(struct s2io_nic *nic, struct ring_info *ring,
+				int from_card_up)
 {
 	struct sk_buff *skb;
 	struct RxD_t *rxdp;
@@ -2602,7 +2602,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 			rxdp1->Buffer0_ptr = pci_map_single
 			    (ring->pdev, skb->data, size - NET_IP_ALIGN,
 				PCI_DMA_FROMDEVICE);
-			if(pci_dma_mapping_error(rxdp1->Buffer0_ptr))
+			if (pci_dma_mapping_error(nic->pdev,
+						rxdp1->Buffer0_ptr))
 				goto pci_map_failed;
 
 			rxdp->Control_2 =
@@ -2636,7 +2637,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 				rxdp3->Buffer0_ptr =
 				   pci_map_single(ring->pdev, ba->ba_0,
 					BUF0_LEN, PCI_DMA_FROMDEVICE);
-				if (pci_dma_mapping_error(rxdp3->Buffer0_ptr))
+			if (pci_dma_mapping_error(nic->pdev,
+						rxdp3->Buffer0_ptr))
 					goto pci_map_failed;
 			} else
 				pci_dma_sync_single_for_device(ring->pdev,
@@ -2655,7 +2657,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 				(ring->pdev, skb->data, ring->mtu + 4,
 						PCI_DMA_FROMDEVICE);
 
-				if (pci_dma_mapping_error(rxdp3->Buffer2_ptr))
+				if (pci_dma_mapping_error(nic->pdev,
+							rxdp3->Buffer2_ptr))
 					goto pci_map_failed;
 
 				if (from_card_up) {
@@ -2664,8 +2667,8 @@ static int fill_rx_buffers(struct ring_info *ring, int from_card_up)
 						ba->ba_1, BUF1_LEN,
 						PCI_DMA_FROMDEVICE);
 
-					if (pci_dma_mapping_error
-						(rxdp3->Buffer1_ptr)) {
+					if (pci_dma_mapping_error(nic->pdev,
+						rxdp3->Buffer1_ptr)) {
 						pci_unmap_single
 							(ring->pdev,
 						    (dma_addr_t)(unsigned long)
@@ -2806,9 +2809,9 @@ static void free_rx_buffers(struct s2io_nic *sp)
 	}
 }
 
-static int s2io_chk_rx_buffers(struct ring_info *ring)
+static int s2io_chk_rx_buffers(struct s2io_nic *nic, struct ring_info *ring)
 {
-	if (fill_rx_buffers(ring, 0) == -ENOMEM) {
+	if (fill_rx_buffers(nic, ring, 0) == -ENOMEM) {
 		DBG_PRINT(INFO_DBG, "%s:Out of memory", ring->dev->name);
 		DBG_PRINT(INFO_DBG, " in Rx Intr!!\n");
 	}
@@ -2848,7 +2851,7 @@ static int s2io_poll_msix(struct napi_struct *napi, int budget)
 		return 0;
 
 	pkts_processed = rx_intr_handler(ring, budget);
-	s2io_chk_rx_buffers(ring);
+	s2io_chk_rx_buffers(nic, ring);
 
 	if (pkts_processed < budget_org) {
 		netif_rx_complete(dev, napi);
@@ -2882,7 +2885,7 @@ static int s2io_poll_inta(struct napi_struct *napi, int budget)
 	for (i = 0; i < config->rx_ring_num; i++) {
 		ring = &mac_control->rings[i];
 		ring_pkts_processed = rx_intr_handler(ring, budget);
-		s2io_chk_rx_buffers(ring);
+		s2io_chk_rx_buffers(nic, ring);
 		pkts_processed += ring_pkts_processed;
 		budget -= ring_pkts_processed;
 		if (budget <= 0)
@@ -2939,7 +2942,8 @@ static void s2io_netpoll(struct net_device *dev)
 		rx_intr_handler(&mac_control->rings[i], 0);
 
 	for (i = 0; i < config->rx_ring_num; i++) {
-		if (fill_rx_buffers(&mac_control->rings[i], 0) == -ENOMEM) {
+		if (fill_rx_buffers(nic, &mac_control->rings[i], 0) ==
+				-ENOMEM) {
 			DBG_PRINT(INFO_DBG, "%s:Out of memory", dev->name);
 			DBG_PRINT(INFO_DBG, " in Rx Netpoll!!\n");
 			break;
@@ -4235,14 +4239,14 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 		txdp->Buffer_Pointer = pci_map_single(sp->pdev,
 					fifo->ufo_in_band_v,
 					sizeof(u64), PCI_DMA_TODEVICE);
-		if (pci_dma_mapping_error(txdp->Buffer_Pointer))
+		if (pci_dma_mapping_error(sp->pdev, txdp->Buffer_Pointer))
 			goto pci_map_failed;
 		txdp++;
 	}
 
 	txdp->Buffer_Pointer = pci_map_single
 	    (sp->pdev, skb->data, frg_len, PCI_DMA_TODEVICE);
-	if (pci_dma_mapping_error(txdp->Buffer_Pointer))
+	if (pci_dma_mapping_error(sp->pdev, txdp->Buffer_Pointer))
 		goto pci_map_failed;
 
 	txdp->Host_Control = (unsigned long) skb;
@@ -4345,7 +4349,7 @@ static irqreturn_t s2io_msix_ring_handle(int irq, void *dev_id)
 		netif_rx_schedule(dev, &ring->napi);
 	} else {
 		rx_intr_handler(ring, 0);
-		s2io_chk_rx_buffers(ring);
+		s2io_chk_rx_buffers(sp, ring);
 	}
 
 	return IRQ_HANDLED;
@@ -4826,7 +4830,7 @@ static irqreturn_t s2io_isr(int irq, void *dev_id)
 		 */
 		if (!config->napi) {
 			for (i = 0; i < config->rx_ring_num; i++)
-				s2io_chk_rx_buffers(&mac_control->rings[i]);
+				s2io_chk_rx_buffers(sp, &mac_control->rings[i]);
 		}
 		writeq(sp->general_int_mask, &bar0->general_int_mask);
 		readl(&bar0->general_int_status);
@@ -6859,7 +6863,7 @@ static int set_rxd_buffer_pointer(struct s2io_nic *sp, struct RxD_t *rxdp,
 				pci_map_single( sp->pdev, (*skb)->data,
 					size - NET_IP_ALIGN,
 					PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp1->Buffer0_ptr))
+			if (pci_dma_mapping_error(sp->pdev, rxdp1->Buffer0_ptr))
 				goto memalloc_failed;
 			rxdp->Host_Control = (unsigned long) (*skb);
 		}
@@ -6886,12 +6890,13 @@ static int set_rxd_buffer_pointer(struct s2io_nic *sp, struct RxD_t *rxdp,
 				pci_map_single(sp->pdev, (*skb)->data,
 					       dev->mtu + 4,
 					       PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp3->Buffer2_ptr))
+			if (pci_dma_mapping_error(sp->pdev, rxdp3->Buffer2_ptr))
 				goto memalloc_failed;
 			rxdp3->Buffer0_ptr = *temp0 =
 				pci_map_single( sp->pdev, ba->ba_0, BUF0_LEN,
 						PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp3->Buffer0_ptr)) {
+			if (pci_dma_mapping_error(sp->pdev,
+						rxdp3->Buffer0_ptr)) {
 				pci_unmap_single (sp->pdev,
 					(dma_addr_t)rxdp3->Buffer2_ptr,
 					dev->mtu + 4, PCI_DMA_FROMDEVICE);
@@ -6903,7 +6908,8 @@ static int set_rxd_buffer_pointer(struct s2io_nic *sp, struct RxD_t *rxdp,
 			rxdp3->Buffer1_ptr = *temp1 =
 				pci_map_single(sp->pdev, ba->ba_1, BUF1_LEN,
 						PCI_DMA_FROMDEVICE);
-			if (pci_dma_mapping_error(rxdp3->Buffer1_ptr)) {
+			if (pci_dma_mapping_error(sp->pdev,
+						rxdp3->Buffer1_ptr)) {
 				pci_unmap_single (sp->pdev,
 					(dma_addr_t)rxdp3->Buffer0_ptr,
 					BUF0_LEN, PCI_DMA_FROMDEVICE);
@@ -7187,7 +7193,7 @@ static int s2io_card_up(struct s2io_nic * sp)
 
 	for (i = 0; i < config->rx_ring_num; i++) {
 		mac_control->rings[i].mtu = dev->mtu;
-		ret = fill_rx_buffers(&mac_control->rings[i], 1);
+		ret = fill_rx_buffers(sp, &mac_control->rings[i], 1);
 		if (ret) {
 			DBG_PRINT(ERR_DBG, "%s: Out of memory in Open\n",
 				  dev->name);
diff --git a/drivers/net/sfc/rx.c b/drivers/net/sfc/rx.c
index 601b001437c0..0d27dd39bc09 100644
--- a/drivers/net/sfc/rx.c
+++ b/drivers/net/sfc/rx.c
@@ -233,7 +233,7 @@ static inline int efx_init_rx_buffer_skb(struct efx_rx_queue *rx_queue,
 					  rx_buf->data, rx_buf->len,
 					  PCI_DMA_FROMDEVICE);
 
-	if (unlikely(pci_dma_mapping_error(rx_buf->dma_addr))) {
+	if (unlikely(pci_dma_mapping_error(efx->pci_dev, rx_buf->dma_addr))) {
 		dev_kfree_skb_any(rx_buf->skb);
 		rx_buf->skb = NULL;
 		return -EIO;
@@ -275,7 +275,7 @@ static inline int efx_init_rx_buffer_page(struct efx_rx_queue *rx_queue,
 					0, efx_rx_buf_size(efx),
 					PCI_DMA_FROMDEVICE);
 
-		if (unlikely(pci_dma_mapping_error(dma_addr))) {
+		if (unlikely(pci_dma_mapping_error(efx->pci_dev, dma_addr))) {
 			__free_pages(rx_buf->page, efx->rx_buffer_order);
 			rx_buf->page = NULL;
 			return -EIO;
diff --git a/drivers/net/sfc/tx.c b/drivers/net/sfc/tx.c
index 5cdd082ab8f6..5e8374ab28ee 100644
--- a/drivers/net/sfc/tx.c
+++ b/drivers/net/sfc/tx.c
@@ -172,7 +172,7 @@ static inline int efx_enqueue_skb(struct efx_tx_queue *tx_queue,
 
 	/* Process all fragments */
 	while (1) {
-		if (unlikely(pci_dma_mapping_error(dma_addr)))
+		if (unlikely(pci_dma_mapping_error(pci_dev, dma_addr)))
 			goto pci_err;
 
 		/* Store fields for marking in the per-fragment final
@@ -661,7 +661,8 @@ efx_tsoh_heap_alloc(struct efx_tx_queue *tx_queue, size_t header_len)
 	tsoh->dma_addr = pci_map_single(tx_queue->efx->pci_dev,
 					TSOH_BUFFER(tsoh), header_len,
 					PCI_DMA_TODEVICE);
-	if (unlikely(pci_dma_mapping_error(tsoh->dma_addr))) {
+	if (unlikely(pci_dma_mapping_error(tx_queue->efx->pci_dev,
+					   tsoh->dma_addr))) {
 		kfree(tsoh);
 		return NULL;
 	}
@@ -863,7 +864,7 @@ static inline int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
 
 	st->ifc.unmap_addr = pci_map_page(efx->pci_dev, page, page_off,
 					  len, PCI_DMA_TODEVICE);
-	if (likely(!pci_dma_mapping_error(st->ifc.unmap_addr))) {
+	if (likely(!pci_dma_mapping_error(efx->pci_dev, st->ifc.unmap_addr))) {
 		st->ifc.unmap_len = len;
 		st->ifc.len = len;
 		st->ifc.dma_addr = st->ifc.unmap_addr;
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
index 00aa0b108cb9..b6435d0d71f9 100644
--- a/drivers/net/spider_net.c
+++ b/drivers/net/spider_net.c
@@ -452,7 +452,7 @@ spider_net_prepare_rx_descr(struct spider_net_card *card,
 	/* iommu-map the skb */
 	buf = pci_map_single(card->pdev, descr->skb->data,
 			SPIDER_NET_MAX_FRAME, PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(buf)) {
+	if (pci_dma_mapping_error(card->pdev, buf)) {
 		dev_kfree_skb_any(descr->skb);
 		descr->skb = NULL;
 		if (netif_msg_rx_err(card) && net_ratelimit())
@@ -691,7 +691,7 @@ spider_net_prepare_tx_descr(struct spider_net_card *card,
 	unsigned long flags;
 
 	buf = pci_map_single(card->pdev, skb->data, skb->len, PCI_DMA_TODEVICE);
-	if (pci_dma_mapping_error(buf)) {
+	if (pci_dma_mapping_error(card->pdev, buf)) {
 		if (netif_msg_tx_err(card) && net_ratelimit())
 			dev_err(&card->netdev->dev, "could not iommu-map packet (%p, %i). "
 				  "Dropping packet\n", skb->data, skb->len);
diff --git a/drivers/net/tc35815.c b/drivers/net/tc35815.c
index a645e5028c14..8487ace9d2e3 100644
--- a/drivers/net/tc35815.c
+++ b/drivers/net/tc35815.c
@@ -506,7 +506,7 @@ static void *alloc_rxbuf_page(struct pci_dev *hwdev, dma_addr_t *dma_handle)
 		return NULL;
 	*dma_handle = pci_map_single(hwdev, buf, PAGE_SIZE,
 				     PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(*dma_handle)) {
+	if (pci_dma_mapping_error(hwdev, *dma_handle)) {
 		free_page((unsigned long)buf);
 		return NULL;
 	}
@@ -536,7 +536,7 @@ static struct sk_buff *alloc_rxbuf_skb(struct net_device *dev,
 		return NULL;
 	*dma_handle = pci_map_single(hwdev, skb->data, RX_BUF_SIZE,
 				     PCI_DMA_FROMDEVICE);
-	if (pci_dma_mapping_error(*dma_handle)) {
+	if (pci_dma_mapping_error(hwdev, *dma_handle)) {
 		dev_kfree_skb_any(skb);
 		return NULL;
 	}
diff --git a/drivers/net/wireless/ath5k/base.c b/drivers/net/wireless/ath5k/base.c
index 217d506527a9..d9769c527346 100644
--- a/drivers/net/wireless/ath5k/base.c
+++ b/drivers/net/wireless/ath5k/base.c
@@ -1166,7 +1166,7 @@ ath5k_rxbuf_setup(struct ath5k_softc *sc, struct ath5k_buf *bf)
 		bf->skb = skb;
 		bf->skbaddr = pci_map_single(sc->pdev,
 			skb->data, sc->rxbufsize, PCI_DMA_FROMDEVICE);
-		if (unlikely(pci_dma_mapping_error(bf->skbaddr))) {
+		if (unlikely(pci_dma_mapping_error(sc->pdev, bf->skbaddr))) {
 			ATH5K_ERR(sc, "%s: DMA mapping failed\n", __func__);
 			dev_kfree_skb(skb);
 			bf->skb = NULL;
@@ -1918,7 +1918,7 @@ ath5k_beacon_setup(struct ath5k_softc *sc, struct ath5k_buf *bf)
 	ATH5K_DBG(sc, ATH5K_DEBUG_BEACON, "skb %p [data %p len %u] "
 			"skbaddr %llx\n", skb, skb->data, skb->len,
 			(unsigned long long)bf->skbaddr);
-	if (pci_dma_mapping_error(bf->skbaddr)) {
+	if (pci_dma_mapping_error(sc->pdev, bf->skbaddr)) {
 		ATH5K_ERR(sc, "beacon DMA mapping failed\n");
 		return -EIO;
 	}
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index c4a7c06793c5..61f8fdea2d96 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3525,7 +3525,7 @@ static int ibmvfc_init_crq(struct ibmvfc_host *vhost)
 	crq->msg_token = dma_map_single(dev, crq->msgs,
 					PAGE_SIZE, DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(crq->msg_token))
+	if (dma_mapping_error(dev, crq->msg_token))
 		goto map_failed;
 
 	retrc = rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address,
@@ -3618,7 +3618,7 @@ static int ibmvfc_alloc_mem(struct ibmvfc_host *vhost)
 					    async_q->size * sizeof(*async_q->msgs),
 					    DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(async_q->msg_token)) {
+	if (dma_mapping_error(dev, async_q->msg_token)) {
 		dev_err(dev, "Failed to map async queue\n");
 		goto free_async_crq;
 	}
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 20000ec79b04..6b24b9cdb04c 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -859,7 +859,7 @@ static void send_mad_adapter_info(struct ibmvscsi_host_data *hostdata)
 					    sizeof(hostdata->madapter_info),
 					    DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(req->buffer)) {
+	if (dma_mapping_error(hostdata->dev, req->buffer)) {
 		if (!firmware_has_feature(FW_FEATURE_CMO))
 			dev_err(hostdata->dev,
 			        "Unable to map request_buffer for "
@@ -1407,7 +1407,7 @@ static int ibmvscsi_do_host_config(struct ibmvscsi_host_data *hostdata,
 						    length,
 						    DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(host_config->buffer)) {
+	if (dma_mapping_error(hostdata->dev, host_config->buffer)) {
 		if (!firmware_has_feature(FW_FEATURE_CMO))
 			dev_err(hostdata->dev,
 			        "dma_mapping error getting host config\n");
diff --git a/drivers/scsi/ibmvscsi/ibmvstgt.c b/drivers/scsi/ibmvscsi/ibmvstgt.c
index 3b9514c8f1f1..2e13ec00172a 100644
--- a/drivers/scsi/ibmvscsi/ibmvstgt.c
+++ b/drivers/scsi/ibmvscsi/ibmvstgt.c
@@ -564,7 +564,7 @@ static int crq_queue_create(struct crq_queue *queue, struct srp_target *target)
 					  queue->size * sizeof(*queue->msgs),
 					  DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(queue->msg_token))
+	if (dma_mapping_error(target->dev, queue->msg_token))
 		goto map_failed;
 
 	err = h_reg_crq(vport->dma_dev->unit_address, queue->msg_token,
diff --git a/drivers/scsi/ibmvscsi/rpa_vscsi.c b/drivers/scsi/ibmvscsi/rpa_vscsi.c
index 182146100dc1..462a8574dad9 100644
--- a/drivers/scsi/ibmvscsi/rpa_vscsi.c
+++ b/drivers/scsi/ibmvscsi/rpa_vscsi.c
@@ -253,7 +253,7 @@ static int rpavscsi_init_crq_queue(struct crq_queue *queue,
 					  queue->size * sizeof(*queue->msgs),
 					  DMA_BIDIRECTIONAL);
 
-	if (dma_mapping_error(queue->msg_token))
+	if (dma_mapping_error(hostdata->dev, queue->msg_token))
 		goto map_failed;
 
 	gather_partition_info();
diff --git a/drivers/spi/atmel_spi.c b/drivers/spi/atmel_spi.c
index e81d59d78910..0c7165660853 100644
--- a/drivers/spi/atmel_spi.c
+++ b/drivers/spi/atmel_spi.c
@@ -313,14 +313,14 @@ atmel_spi_dma_map_xfer(struct atmel_spi *as, struct spi_transfer *xfer)
 		xfer->tx_dma = dma_map_single(dev,
 				(void *) xfer->tx_buf, xfer->len,
 				DMA_TO_DEVICE);
-		if (dma_mapping_error(xfer->tx_dma))
+		if (dma_mapping_error(dev, xfer->tx_dma))
 			return -ENOMEM;
 	}
 	if (xfer->rx_buf) {
 		xfer->rx_dma = dma_map_single(dev,
 				xfer->rx_buf, xfer->len,
 				DMA_FROM_DEVICE);
-		if (dma_mapping_error(xfer->rx_dma)) {
+		if (dma_mapping_error(dev, xfer->rx_dma)) {
 			if (xfer->tx_buf)
 				dma_unmap_single(dev,
 						xfer->tx_dma, xfer->len,
diff --git a/drivers/spi/au1550_spi.c b/drivers/spi/au1550_spi.c
index 9149689c79d9..87b73e0169c5 100644
--- a/drivers/spi/au1550_spi.c
+++ b/drivers/spi/au1550_spi.c
@@ -334,7 +334,7 @@ static int au1550_spi_dma_rxtmp_alloc(struct au1550_spi *hw, unsigned size)
 	hw->dma_rx_tmpbuf_size = size;
 	hw->dma_rx_tmpbuf_addr = dma_map_single(hw->dev, hw->dma_rx_tmpbuf,
 			size, DMA_FROM_DEVICE);
-	if (dma_mapping_error(hw->dma_rx_tmpbuf_addr)) {
+	if (dma_mapping_error(hw->dev, hw->dma_rx_tmpbuf_addr)) {
 		kfree(hw->dma_rx_tmpbuf);
 		hw->dma_rx_tmpbuf = 0;
 		hw->dma_rx_tmpbuf_size = 0;
@@ -378,7 +378,7 @@ static int au1550_spi_dma_txrxb(struct spi_device *spi, struct spi_transfer *t)
 			dma_rx_addr = dma_map_single(hw->dev,
 					(void *)t->rx_buf,
 					t->len, DMA_FROM_DEVICE);
-			if (dma_mapping_error(dma_rx_addr))
+			if (dma_mapping_error(hw->dev, dma_rx_addr))
 				dev_err(hw->dev, "rx dma map error\n");
 		}
 	} else {
@@ -401,7 +401,7 @@ static int au1550_spi_dma_txrxb(struct spi_device *spi, struct spi_transfer *t)
 			dma_tx_addr = dma_map_single(hw->dev,
 					(void *)t->tx_buf,
 					t->len, DMA_TO_DEVICE);
-			if (dma_mapping_error(dma_tx_addr))
+			if (dma_mapping_error(hw->dev, dma_tx_addr))
 				dev_err(hw->dev, "tx dma map error\n");
 		}
 	} else {
diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c
index b1cc148036c1..f6f987bb71ca 100644
--- a/drivers/spi/omap2_mcspi.c
+++ b/drivers/spi/omap2_mcspi.c
@@ -836,7 +836,7 @@ static int omap2_mcspi_transfer(struct spi_device *spi, struct spi_message *m)
 		if (tx_buf != NULL) {
 			t->tx_dma = dma_map_single(&spi->dev, (void *) tx_buf,
 					len, DMA_TO_DEVICE);
-			if (dma_mapping_error(t->tx_dma)) {
+			if (dma_mapping_error(&spi->dev, t->tx_dma)) {
 				dev_dbg(&spi->dev, "dma %cX %d bytes error\n",
 						'T', len);
 				return -EINVAL;
@@ -845,7 +845,7 @@ static int omap2_mcspi_transfer(struct spi_device *spi, struct spi_message *m)
 		if (rx_buf != NULL) {
 			t->rx_dma = dma_map_single(&spi->dev, rx_buf, t->len,
 					DMA_FROM_DEVICE);
-			if (dma_mapping_error(t->rx_dma)) {
+			if (dma_mapping_error(&spi->dev, t->rx_dma)) {
 				dev_dbg(&spi->dev, "dma %cX %d bytes error\n",
 						'R', len);
 				if (tx_buf != NULL)
diff --git a/drivers/spi/pxa2xx_spi.c b/drivers/spi/pxa2xx_spi.c
index 0c452c46ab07..067299d6d192 100644
--- a/drivers/spi/pxa2xx_spi.c
+++ b/drivers/spi/pxa2xx_spi.c
@@ -353,7 +353,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 	drv_data->rx_dma = dma_map_single(dev, drv_data->rx,
 						drv_data->rx_map_len,
 						DMA_FROM_DEVICE);
-	if (dma_mapping_error(drv_data->rx_dma))
+	if (dma_mapping_error(dev, drv_data->rx_dma))
 		return 0;
 
 	/* Stream map the tx buffer */
@@ -361,7 +361,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 						drv_data->tx_map_len,
 						DMA_TO_DEVICE);
 
-	if (dma_mapping_error(drv_data->tx_dma)) {
+	if (dma_mapping_error(dev, drv_data->tx_dma)) {
 		dma_unmap_single(dev, drv_data->rx_dma,
 					drv_data->rx_map_len, DMA_FROM_DEVICE);
 		return 0;
diff --git a/drivers/spi/spi_imx.c b/drivers/spi/spi_imx.c
index 54ac7bea5f8c..6fb77fcc4971 100644
--- a/drivers/spi/spi_imx.c
+++ b/drivers/spi/spi_imx.c
@@ -491,7 +491,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 							buf,
 							drv_data->tx_map_len,
 							DMA_TO_DEVICE);
-			if (dma_mapping_error(drv_data->tx_dma))
+			if (dma_mapping_error(dev, drv_data->tx_dma))
 				return -1;
 
 			drv_data->tx_dma_needs_unmap = 1;
@@ -516,7 +516,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 					buf,
 					drv_data->len,
 					DMA_FROM_DEVICE);
-		if (dma_mapping_error(drv_data->rx_dma))
+		if (dma_mapping_error(dev, drv_data->rx_dma))
 			return -1;
 		drv_data->rx_dma_needs_unmap = 1;
 	}
@@ -534,7 +534,7 @@ static int map_dma_buffers(struct driver_data *drv_data)
 					buf,
 					drv_data->tx_map_len,
 					DMA_TO_DEVICE);
-	if (dma_mapping_error(drv_data->tx_dma)) {
+	if (dma_mapping_error(dev, drv_data->tx_dma)) {
 		if (drv_data->rx_dma) {
 			dma_unmap_single(dev,
 					drv_data->rx_dma,
diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h
index db351d1296f4..a5801ae02e4b 100644
--- a/include/asm-alpha/dma-mapping.h
+++ b/include/asm-alpha/dma-mapping.h
@@ -24,8 +24,8 @@
 		pci_unmap_sg(alpha_gendev_to_pci(dev), sg, nents, dir)
 #define dma_supported(dev, mask)			\
 		pci_dma_supported(alpha_gendev_to_pci(dev), mask)
-#define dma_mapping_error(addr)				\
-		pci_dma_mapping_error(addr)
+#define dma_mapping_error(dev, addr)				\
+		pci_dma_mapping_error(alpha_gendev_to_pci(dev), addr)
 
 #else	/* no PCI - no IOMMU. */
 
@@ -45,7 +45,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 #define dma_unmap_page(dev, addr, size, dir)	((void)0)
 #define dma_unmap_sg(dev, sg, nents, dir)	((void)0)
 
-#define dma_mapping_error(addr)  (0)
+#define dma_mapping_error(dev, addr)  (0)
 
 #endif	/* !CONFIG_PCI */
 
diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h
index d31fd49ff79a..2a14302c17a3 100644
--- a/include/asm-alpha/pci.h
+++ b/include/asm-alpha/pci.h
@@ -106,7 +106,7 @@ extern dma_addr_t pci_map_page(struct pci_dev *, struct page *,
 /* Test for pci_map_single or pci_map_page having generated an error.  */
 
 static inline int
-pci_dma_mapping_error(dma_addr_t dma_addr)
+pci_dma_mapping_error(struct pci_dev *pdev, dma_addr_t dma_addr)
 {
 	return dma_addr == 0;
 }
diff --git a/include/asm-arm/dma-mapping.h b/include/asm-arm/dma-mapping.h
index e99406a7bece..f41335ba6337 100644
--- a/include/asm-arm/dma-mapping.h
+++ b/include/asm-arm/dma-mapping.h
@@ -56,7 +56,7 @@ static inline int dma_is_consistent(struct device *dev, dma_addr_t handle)
 /*
  * DMA errors are defined by all-bits-set in the DMA address.
  */
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return dma_addr == ~0;
 }
diff --git a/include/asm-avr32/dma-mapping.h b/include/asm-avr32/dma-mapping.h
index 57dc672bab8e..0399359ab5d8 100644
--- a/include/asm-avr32/dma-mapping.h
+++ b/include/asm-avr32/dma-mapping.h
@@ -35,7 +35,7 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask)
 /*
  * dma_map_single can't fail as it is implemented now.
  */
-static inline int dma_mapping_error(dma_addr_t addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t addr)
 {
 	return 0;
 }
diff --git a/include/asm-cris/dma-mapping.h b/include/asm-cris/dma-mapping.h
index edc8d1bfaae2..cb2fb25ff8d9 100644
--- a/include/asm-cris/dma-mapping.h
+++ b/include/asm-cris/dma-mapping.h
@@ -120,7 +120,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
 }
 
 static inline int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/asm-frv/dma-mapping.h b/include/asm-frv/dma-mapping.h
index 2e8966ca030d..b2898877c07b 100644
--- a/include/asm-frv/dma-mapping.h
+++ b/include/asm-frv/dma-mapping.h
@@ -126,7 +126,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nele
 }
 
 static inline
-int dma_mapping_error(dma_addr_t dma_addr)
+int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/asm-generic/dma-mapping-broken.h b/include/asm-generic/dma-mapping-broken.h
index e2468f894d2a..82cd0cb1c3fe 100644
--- a/include/asm-generic/dma-mapping-broken.h
+++ b/include/asm-generic/dma-mapping-broken.h
@@ -61,7 +61,7 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
 #define dma_sync_sg_for_device dma_sync_sg_for_cpu
 
 extern int
-dma_mapping_error(dma_addr_t dma_addr);
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 
 extern int
 dma_supported(struct device *dev, u64 mask);
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 783ab9944d70..189486c3f92e 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -144,9 +144,9 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
 }
 
 static inline int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
-	return pci_dma_mapping_error(dma_addr);
+	return pci_dma_mapping_error(to_pci_dev(dev), dma_addr);
 }
 
 
diff --git a/include/asm-generic/pci-dma-compat.h b/include/asm-generic/pci-dma-compat.h
index 25c10e96b2b7..37b3706226e7 100644
--- a/include/asm-generic/pci-dma-compat.h
+++ b/include/asm-generic/pci-dma-compat.h
@@ -99,9 +99,9 @@ pci_dma_sync_sg_for_device(struct pci_dev *hwdev, struct scatterlist *sg,
 }
 
 static inline int
-pci_dma_mapping_error(dma_addr_t dma_addr)
+pci_dma_mapping_error(struct pci_dev *pdev, dma_addr_t dma_addr)
 {
-	return dma_mapping_error(dma_addr);
+	return dma_mapping_error(&pdev->dev, dma_addr);
 }
 
 #endif
diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
index 0721a5e8271e..a6d50c77b6bf 100644
--- a/include/asm-ia64/machvec.h
+++ b/include/asm-ia64/machvec.h
@@ -54,7 +54,7 @@ typedef void ia64_mv_dma_sync_single_for_cpu (struct device *, dma_addr_t, size_
 typedef void ia64_mv_dma_sync_sg_for_cpu (struct device *, struct scatterlist *, int, int);
 typedef void ia64_mv_dma_sync_single_for_device (struct device *, dma_addr_t, size_t, int);
 typedef void ia64_mv_dma_sync_sg_for_device (struct device *, struct scatterlist *, int, int);
-typedef int ia64_mv_dma_mapping_error (dma_addr_t dma_addr);
+typedef int ia64_mv_dma_mapping_error(struct device *, dma_addr_t dma_addr);
 typedef int ia64_mv_dma_supported (struct device *, u64);
 
 typedef dma_addr_t ia64_mv_dma_map_single_attrs (struct device *, void *, size_t, int, struct dma_attrs *);
diff --git a/include/asm-m68k/dma-mapping.h b/include/asm-m68k/dma-mapping.h
index a26cdeb46a57..91f7944333d4 100644
--- a/include/asm-m68k/dma-mapping.h
+++ b/include/asm-m68k/dma-mapping.h
@@ -84,7 +84,7 @@ static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *s
 {
 }
 
-static inline int dma_mapping_error(dma_addr_t handle)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t handle)
 {
 	return 0;
 }
diff --git a/include/asm-mips/dma-mapping.h b/include/asm-mips/dma-mapping.h
index 230b3f1b69b1..c64afb40cd06 100644
--- a/include/asm-mips/dma-mapping.h
+++ b/include/asm-mips/dma-mapping.h
@@ -42,7 +42,7 @@ extern void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 	int nelems, enum dma_data_direction direction);
 extern void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 	int nelems, enum dma_data_direction direction);
-extern int dma_mapping_error(dma_addr_t dma_addr);
+extern int dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
 extern int dma_supported(struct device *dev, u64 mask);
 
 static inline int
diff --git a/include/asm-mn10300/dma-mapping.h b/include/asm-mn10300/dma-mapping.h
index 7c882fca9ec8..ccae8f6c6326 100644
--- a/include/asm-mn10300/dma-mapping.h
+++ b/include/asm-mn10300/dma-mapping.h
@@ -182,7 +182,7 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 }
 
 static inline
-int dma_mapping_error(dma_addr_t dma_addr)
+int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/asm-parisc/dma-mapping.h b/include/asm-parisc/dma-mapping.h
index c6c0e9ff6bde..53af696f23d2 100644
--- a/include/asm-parisc/dma-mapping.h
+++ b/include/asm-parisc/dma-mapping.h
@@ -248,6 +248,6 @@ void * sba_get_iommu(struct parisc_device *dev);
 #endif
 
 /* At the moment, we panic on error for IOMMU resource exaustion */
-#define dma_mapping_error(x)	0
+#define dma_mapping_error(dev, x)	0
 
 #endif
diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h
index 74c549780987..c7ca45f97dd2 100644
--- a/include/asm-powerpc/dma-mapping.h
+++ b/include/asm-powerpc/dma-mapping.h
@@ -415,7 +415,7 @@ static inline void dma_sync_sg_for_device(struct device *dev,
 		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
 }
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 #ifdef CONFIG_PPC64
 	return (dma_addr == DMA_ERROR_CODE);
diff --git a/include/asm-sh/dma-mapping.h b/include/asm-sh/dma-mapping.h
index 22cc419389fe..6c0b8a2de143 100644
--- a/include/asm-sh/dma-mapping.h
+++ b/include/asm-sh/dma-mapping.h
@@ -171,7 +171,7 @@ static inline int dma_get_cache_alignment(void)
 	return L1_CACHE_BYTES;
 }
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return dma_addr == 0;
 }
diff --git a/include/asm-sparc/dma-mapping_64.h b/include/asm-sparc/dma-mapping_64.h
index 38cbec76a33f..bfa64f9702d5 100644
--- a/include/asm-sparc/dma-mapping_64.h
+++ b/include/asm-sparc/dma-mapping_64.h
@@ -135,7 +135,7 @@ static inline void dma_sync_sg_for_device(struct device *dev,
 	/* No flushing needed to sync cpu writes to the device.  */
 }
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return (dma_addr == DMA_ERROR_CODE);
 }
diff --git a/include/asm-sparc/pci_32.h b/include/asm-sparc/pci_32.h
index b93b6c79e08f..0ee949d220c0 100644
--- a/include/asm-sparc/pci_32.h
+++ b/include/asm-sparc/pci_32.h
@@ -154,7 +154,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 
 #define PCI_DMA_ERROR_CODE      (~(dma_addr_t)0x0)
 
-static inline int pci_dma_mapping_error(dma_addr_t dma_addr)
+static inline int pci_dma_mapping_error(struct pci_dev *pdev,
+					dma_addr_t dma_addr)
 {
         return (dma_addr == PCI_DMA_ERROR_CODE);
 }
diff --git a/include/asm-sparc/pci_64.h b/include/asm-sparc/pci_64.h
index f59f2571295b..4f79a54948f6 100644
--- a/include/asm-sparc/pci_64.h
+++ b/include/asm-sparc/pci_64.h
@@ -140,9 +140,10 @@ extern int pci_dma_supported(struct pci_dev *hwdev, u64 mask);
 #define PCI64_REQUIRED_MASK	(~(dma64_addr_t)0)
 #define PCI64_ADDR_BASE		0xfffc000000000000UL
 
-static inline int pci_dma_mapping_error(dma_addr_t dma_addr)
+static inline int pci_dma_mapping_error(struct pci_dev *pdev,
+					dma_addr_t dma_addr)
 {
-	return dma_mapping_error(dma_addr);
+	return dma_mapping_error(&pdev->dev, dma_addr);
 }
 
 #ifdef CONFIG_PCI
diff --git a/include/asm-x86/device.h b/include/asm-x86/device.h
index 87a715367a1b..3c034f48fdb0 100644
--- a/include/asm-x86/device.h
+++ b/include/asm-x86/device.h
@@ -5,6 +5,9 @@ struct dev_archdata {
 #ifdef CONFIG_ACPI
 	void	*acpi_handle;
 #endif
+#ifdef CONFIG_X86_64
+struct dma_mapping_ops *dma_ops;
+#endif
 #ifdef CONFIG_DMAR
 	void *iommu; /* hook for IOMMU specific extension */
 #endif
diff --git a/include/asm-x86/dma-mapping.h b/include/asm-x86/dma-mapping.h
index c2ddd3d1b883..0eaa9bf6011f 100644
--- a/include/asm-x86/dma-mapping.h
+++ b/include/asm-x86/dma-mapping.h
@@ -17,7 +17,8 @@ extern int panic_on_overflow;
 extern int force_iommu;
 
 struct dma_mapping_ops {
-	int             (*mapping_error)(dma_addr_t dma_addr);
+	int             (*mapping_error)(struct device *dev,
+					 dma_addr_t dma_addr);
 	void*           (*alloc_coherent)(struct device *dev, size_t size,
 				dma_addr_t *dma_handle, gfp_t gfp);
 	void            (*free_coherent)(struct device *dev, size_t size,
@@ -56,14 +57,32 @@ struct dma_mapping_ops {
 	int		is_phys;
 };
 
-extern const struct dma_mapping_ops *dma_ops;
+extern struct dma_mapping_ops *dma_ops;
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
 {
-	if (dma_ops->mapping_error)
-		return dma_ops->mapping_error(dma_addr);
+#ifdef CONFIG_X86_32
+	return dma_ops;
+#else
+	if (unlikely(!dev) || !dev->archdata.dma_ops)
+		return dma_ops;
+	else
+		return dev->archdata.dma_ops;
+#endif
+}
+
+/* Make sure we keep the same behaviour */
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+#ifdef CONFIG_X86_32
+	return 0;
+#else
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+	if (ops->mapping_error)
+		return ops->mapping_error(dev, dma_addr);
 
 	return (dma_addr == bad_dma_address);
+#endif
 }
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -83,44 +102,53 @@ static inline dma_addr_t
 dma_map_single(struct device *hwdev, void *ptr, size_t size,
 	       int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	return dma_ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
+	return ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
 }
 
 static inline void
 dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
 		 int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->unmap_single)
-		dma_ops->unmap_single(dev, addr, size, direction);
+	if (ops->unmap_single)
+		ops->unmap_single(dev, addr, size, direction);
 }
 
 static inline int
 dma_map_sg(struct device *hwdev, struct scatterlist *sg,
 	   int nents, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	return dma_ops->map_sg(hwdev, sg, nents, direction);
+	return ops->map_sg(hwdev, sg, nents, direction);
 }
 
 static inline void
 dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
 	     int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->unmap_sg)
-		dma_ops->unmap_sg(hwdev, sg, nents, direction);
+	if (ops->unmap_sg)
+		ops->unmap_sg(hwdev, sg, nents, direction);
 }
 
 static inline void
 dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
 			size_t size, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_for_cpu)
-		dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
-					     direction);
+	if (ops->sync_single_for_cpu)
+		ops->sync_single_for_cpu(hwdev, dma_handle, size, direction);
 	flush_write_buffers();
 }
 
@@ -128,10 +156,11 @@ static inline void
 dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
 			   size_t size, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_for_device)
-		dma_ops->sync_single_for_device(hwdev, dma_handle, size,
-						direction);
+	if (ops->sync_single_for_device)
+		ops->sync_single_for_device(hwdev, dma_handle, size, direction);
 	flush_write_buffers();
 }
 
@@ -139,11 +168,12 @@ static inline void
 dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
 			      unsigned long offset, size_t size, int direction)
 {
-	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_range_for_cpu)
-		dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
-						   size, direction);
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
 
+	BUG_ON(!valid_dma_direction(direction));
+	if (ops->sync_single_range_for_cpu)
+		ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
+					       size, direction);
 	flush_write_buffers();
 }
 
@@ -152,11 +182,12 @@ dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
 				 unsigned long offset, size_t size,
 				 int direction)
 {
-	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_single_range_for_device)
-		dma_ops->sync_single_range_for_device(hwdev, dma_handle,
-						      offset, size, direction);
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
 
+	BUG_ON(!valid_dma_direction(direction));
+	if (ops->sync_single_range_for_device)
+		ops->sync_single_range_for_device(hwdev, dma_handle,
+						  offset, size, direction);
 	flush_write_buffers();
 }
 
@@ -164,9 +195,11 @@ static inline void
 dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
 		    int nelems, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_sg_for_cpu)
-		dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
+	if (ops->sync_sg_for_cpu)
+		ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
 	flush_write_buffers();
 }
 
@@ -174,9 +207,11 @@ static inline void
 dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 		       int nelems, int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(hwdev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	if (dma_ops->sync_sg_for_device)
-		dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+	if (ops->sync_sg_for_device)
+		ops->sync_sg_for_device(hwdev, sg, nelems, direction);
 
 	flush_write_buffers();
 }
@@ -185,9 +220,11 @@ static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
 				      size_t offset, size_t size,
 				      int direction)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 	BUG_ON(!valid_dma_direction(direction));
-	return dma_ops->map_single(dev, page_to_phys(page)+offset,
-				   size, direction);
+	return ops->map_single(dev, page_to_phys(page) + offset,
+			       size, direction);
 }
 
 static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
diff --git a/include/asm-x86/swiotlb.h b/include/asm-x86/swiotlb.h
index c706a7442633..2730b351afcf 100644
--- a/include/asm-x86/swiotlb.h
+++ b/include/asm-x86/swiotlb.h
@@ -35,7 +35,7 @@ extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
 			  int nents, int direction);
 extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
 			     int nents, int direction);
-extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
+extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
 extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
 				  void *vaddr, dma_addr_t dma_handle);
 extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
diff --git a/include/asm-xtensa/dma-mapping.h b/include/asm-xtensa/dma-mapping.h
index 3c7d537dd15d..51882ae3db4d 100644
--- a/include/asm-xtensa/dma-mapping.h
+++ b/include/asm-xtensa/dma-mapping.h
@@ -139,7 +139,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
 		consistent_sync(sg_virt(sg), sg->length, dir);
 }
 static inline int
-dma_mapping_error(dma_addr_t dma_addr)
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return 0;
 }
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 7d51cbca49ab..75ae6d8aba4f 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -758,7 +758,7 @@ static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
 	}
 
 	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
-	if (!dma_mapping_error(dma_addr)) {
+	if (!dma_mapping_error(&c->pdev->dev, dma_addr)) {
 #ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
 		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
 			*mptr++ = cpu_to_le32(0x7C020002);
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 4bf8cade9dbc..e530026eedf7 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -427,9 +427,9 @@ static inline int ssb_dma_mapping_error(struct ssb_device *dev, dma_addr_t addr)
 {
 	switch (dev->bus->bustype) {
 	case SSB_BUSTYPE_PCI:
-		return pci_dma_mapping_error(addr);
+		return pci_dma_mapping_error(dev->bus->host_pci, addr);
 	case SSB_BUSTYPE_SSB:
-		return dma_mapping_error(addr);
+		return dma_mapping_error(dev->dev, addr);
 	default:
 		__ssb_dma_not_implemented(dev);
 	}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 90b529f7a154..936e333e7ce5 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1590,7 +1590,7 @@ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
 {
 	if (dev->dma_ops)
 		return dev->dma_ops->mapping_error(dev, dma_addr);
-	return dma_mapping_error(dma_addr);
+	return dma_mapping_error(dev->dma_device, dma_addr);
 }
 
 /**
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index d568894df8cc..977edbdbc1de 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -492,7 +492,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		 */
 		dma_addr_t handle;
 		handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE);
-		if (swiotlb_dma_mapping_error(handle))
+		if (swiotlb_dma_mapping_error(hwdev, handle))
 			return NULL;
 
 		ret = bus_to_virt(handle);
@@ -824,7 +824,7 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 }
 
 int
-swiotlb_dma_mapping_error(dma_addr_t dma_addr)
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
 	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a19b22b452a3..84d328329d98 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -169,7 +169,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 					  (void *)
 					  vec->sge[xdr_sge_no].iov_base + sge_off,
 					  sge_bytes, DMA_TO_DEVICE);
-		if (dma_mapping_error(sge[sge_no].addr))
+		if (dma_mapping_error(xprt->sc_cm_id->device->dma_device,
+					sge[sge_no].addr))
 			goto err;
 		sge_off = 0;
 		sge_no++;
-- 
cgit v1.2.3


From 1956a96de488feb05e95c08c9d5e80f63a4be2b1 Mon Sep 17 00:00:00 2001
From: Alexis Bruemmer <alexisb@us.ibm.com>
Date: Fri, 25 Jul 2008 19:44:51 -0700
Subject: x86 calgary: fix handling of devices that aren't behind the Calgary

The calgary code can give drivers addresses above 4GB which is very bad
for hardware that is only 32bit DMA addressable.

With this patch, the calgary code sets the global dma_ops to swiotlb or
nommu properly, and the dma_ops of devices behind the Calgary/CalIOC2
to calgary_dma_ops.  So the calgary code can handle devices safely that
aren't behind the Calgary/CalIOC2.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexis Bruemmer <alexisb@us.ibm.com>
Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/pci-calgary_64.c | 71 +++++++++++++++-------------------------
 include/asm-x86/iommu.h          |  1 +
 2 files changed, 27 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 1eb86be93d7a..b67a4b1d4eae 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
+
 #include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/tce.h>
@@ -413,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
 	}
 }
 
-static int calgary_nontranslate_map_sg(struct device* dev,
-	struct scatterlist *sg, int nelems, int direction)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sg, s, nelems, i) {
-		struct page *p = sg_page(s);
-
-		BUG_ON(!p);
-		s->dma_address = virt_to_bus(sg_virt(s));
-		s->dma_length = s->length;
-	}
-	return nelems;
-}
-
 static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 	int nelems, int direction)
 {
@@ -439,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 	unsigned long entry;
 	int i;
 
-	if (!translation_enabled(tbl))
-		return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
-
 	for_each_sg(sg, s, nelems, i) {
 		BUG_ON(!sg_page(s));
 
@@ -477,7 +459,6 @@ error:
 static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
 	size_t size, int direction)
 {
-	dma_addr_t dma_handle = bad_dma_address;
 	void *vaddr = phys_to_virt(paddr);
 	unsigned long uaddr;
 	unsigned int npages;
@@ -486,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
 	uaddr = (unsigned long)vaddr;
 	npages = num_dma_pages(uaddr, size);
 
-	if (translation_enabled(tbl))
-		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
-	else
-		dma_handle = virt_to_bus(vaddr);
-
-	return dma_handle;
+	return iommu_alloc(dev, tbl, vaddr, npages, direction);
 }
 
 static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -500,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
 	struct iommu_table *tbl = find_iommu_table(dev);
 	unsigned int npages;
 
-	if (!translation_enabled(tbl))
-		return;
-
 	npages = num_dma_pages(dma_handle, size);
 	iommu_free(tbl, dma_handle, npages);
 }
@@ -525,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
 		goto error;
 	memset(ret, 0, size);
 
-	if (translation_enabled(tbl)) {
-		/* set up tces to cover the allocated range */
-		mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
-		if (mapping == bad_dma_address)
-			goto free;
-
-		*dma_handle = mapping;
-	} else /* non translated slot */
-		*dma_handle = virt_to_bus(ret);
-
+	/* set up tces to cover the allocated range */
+	mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+	if (mapping == bad_dma_address)
+		goto free;
+	*dma_handle = mapping;
 	return ret;
-
 free:
 	free_pages((unsigned long)ret, get_order(size));
 	ret = NULL;
@@ -1241,6 +1208,16 @@ static int __init calgary_init(void)
 			goto error;
 	} while (1);
 
+	dev = NULL;
+	for_each_pci_dev(dev) {
+		struct iommu_table *tbl;
+
+		tbl = find_iommu_table(&dev->dev);
+
+		if (translation_enabled(tbl))
+			dev->dev.archdata.dma_ops = &calgary_dma_ops;
+	}
+
 	return ret;
 
 error:
@@ -1262,6 +1239,7 @@ error:
 		calgary_disable_translation(dev);
 		calgary_free_bus(dev);
 		pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+		dev->dev.archdata.dma_ops = NULL;
 	} while (1);
 
 	return ret;
@@ -1503,6 +1481,10 @@ void __init detect_calgary(void)
 		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
 		       "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
 		       debugging ? "enabled" : "disabled");
+
+		/* swiotlb for devices that aren't behind the Calgary. */
+		if (max_pfn > MAX_DMA32_PFN)
+			swiotlb = 1;
 	}
 	return;
 
@@ -1519,7 +1501,7 @@ int __init calgary_iommu_init(void)
 {
 	int ret;
 
-	if (no_iommu || swiotlb)
+	if (no_iommu || (swiotlb && !calgary_detected))
 		return -ENODEV;
 
 	if (!calgary_detected)
@@ -1532,15 +1514,14 @@ int __init calgary_iommu_init(void)
 	if (ret) {
 		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
 		       "falling back to no_iommu\n", ret);
-		if (max_pfn > MAX_DMA32_PFN)
-			printk(KERN_ERR "WARNING more than 4GB of memory, "
-					"32bit PCI may malfunction.\n");
 		return ret;
 	}
 
 	force_iommu = 1;
 	bad_dma_address = 0x0;
-	dma_ops = &calgary_dma_ops;
+	/* dma_ops is set to swiotlb or nommu */
+	if (!dma_ops)
+		dma_ops = &nommu_dma_ops;
 
 	return 0;
 }
diff --git a/include/asm-x86/iommu.h b/include/asm-x86/iommu.h
index d63166fb3ab7..ecc8061904a9 100644
--- a/include/asm-x86/iommu.h
+++ b/include/asm-x86/iommu.h
@@ -3,6 +3,7 @@
 
 extern void pci_iommu_shutdown(void);
 extern void no_iommu_init(void);
+extern struct dma_mapping_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 
-- 
cgit v1.2.3


From 929dfb24fbcd60e2544b2de7bfb4a68da4dfc747 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 19:44:54 -0700
Subject: parport/share.c: proper externs

This patch adds proper externs for parport_default_timeslice and
parport_default_spintime in include/linux/parport.h

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/procfs.c | 3 ---
 include/linux/parport.h  | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index d950fc34320a..554e11f9e1ce 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -429,9 +429,6 @@ struct parport_default_sysctl_table
 	ctl_table dev_dir[2];
 };
 
-extern unsigned long parport_default_timeslice;
-extern int parport_default_spintime;
-
 static struct parport_default_sysctl_table
 parport_default_sysctl_table = {
 	.sysctl_header	= NULL,
diff --git a/include/linux/parport.h b/include/linux/parport.h
index dcb9e01a69ca..6a0d7cdb5774 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -560,5 +560,8 @@ extern int parport_device_proc_unregister(struct pardevice *device);
 
 #endif /*  !CONFIG_PARPORT_NOT_PC  */
 
+extern unsigned long parport_default_timeslice;
+extern int parport_default_spintime;
+
 #endif /* __KERNEL__ */
 #endif /* _PARPORT_H_ */
-- 
cgit v1.2.3


From b77899985bdfd85a8e5a6e485033a9b4713d2471 Mon Sep 17 00:00:00 2001
From: Alex Dubov <oakad@yahoo.com>
Date: Fri, 25 Jul 2008 19:45:00 -0700
Subject: memstick: allow "set_param" method to return an error code

Some controllers (Jmicron, for instance) can report temporal failure
condition during power-on.  It is desirable to account for this using a
return value of "set_param" device method.  The return value can also be
handy to distinguish between supported and unsupported device parameters
in run time.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alex Dubov <oakad@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/memstick/core/memstick.c  | 18 ++++++++---
 drivers/memstick/host/jmb38x_ms.c | 67 ++++++++++++++++++++++++++++-----------
 drivers/memstick/host/tifm_ms.c   | 17 +++++-----
 include/linux/memstick.h          |  2 +-
 4 files changed, 71 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index 61b98c333cb0..3c7d9a79c1ea 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -415,10 +415,14 @@ err_out:
 	return NULL;
 }
 
-static void memstick_power_on(struct memstick_host *host)
+static int memstick_power_on(struct memstick_host *host)
 {
-	host->set_param(host, MEMSTICK_POWER, MEMSTICK_POWER_ON);
-	host->set_param(host, MEMSTICK_INTERFACE, MEMSTICK_SERIAL);
+	int rc = host->set_param(host, MEMSTICK_POWER, MEMSTICK_POWER_ON);
+
+	if (!rc)
+		rc = host->set_param(host, MEMSTICK_INTERFACE, MEMSTICK_SERIAL);
+
+	return rc;
 }
 
 static void memstick_check(struct work_struct *work)
@@ -573,11 +577,15 @@ EXPORT_SYMBOL(memstick_suspend_host);
  */
 void memstick_resume_host(struct memstick_host *host)
 {
+	int rc = 0;
+
 	mutex_lock(&host->lock);
 	if (host->card)
-		memstick_power_on(host);
+		rc = memstick_power_on(host);
 	mutex_unlock(&host->lock);
-	memstick_detect_change(host);
+
+	if (!rc)
+		memstick_detect_change(host);
 }
 EXPORT_SYMBOL(memstick_resume_host);
 
diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c
index 4e3bfbcdf155..9d82e67737db 100644
--- a/drivers/memstick/host/jmb38x_ms.c
+++ b/drivers/memstick/host/jmb38x_ms.c
@@ -609,36 +609,68 @@ static void jmb38x_ms_request(struct memstick_host *msh)
 	spin_unlock_irqrestore(&host->lock, flags);
 }
 
-static void jmb38x_ms_reset(struct jmb38x_ms_host *host)
+static int jmb38x_ms_reset(struct jmb38x_ms_host *host)
 {
-	unsigned int host_ctl = readl(host->addr + HOST_CONTROL);
+	int cnt;
 
-	writel(HOST_CONTROL_RESET_REQ, host->addr + HOST_CONTROL);
+	writel(HOST_CONTROL_RESET_REQ | HOST_CONTROL_CLOCK_EN
+	       | readl(host->addr + HOST_CONTROL),
+	       host->addr + HOST_CONTROL);
+	mmiowb();
+
+	for (cnt = 0; cnt < 20; ++cnt) {
+		if (!(HOST_CONTROL_RESET_REQ
+		      & readl(host->addr + HOST_CONTROL)))
+			goto reset_next;
 
-	while (HOST_CONTROL_RESET_REQ
-	       & (host_ctl = readl(host->addr + HOST_CONTROL))) {
 		ndelay(20);
-		dev_dbg(&host->chip->pdev->dev, "reset %08x\n", host_ctl);
 	}
+	dev_dbg(&host->chip->pdev->dev, "reset_req timeout\n");
+	return -EIO;
 
-	writel(HOST_CONTROL_RESET, host->addr + HOST_CONTROL);
+reset_next:
+	writel(HOST_CONTROL_RESET | HOST_CONTROL_CLOCK_EN
+	       | readl(host->addr + HOST_CONTROL),
+	       host->addr + HOST_CONTROL);
+	mmiowb();
+
+	for (cnt = 0; cnt < 20; ++cnt) {
+		if (!(HOST_CONTROL_RESET
+		      & readl(host->addr + HOST_CONTROL)))
+			goto reset_ok;
+
+		ndelay(20);
+	}
+	dev_dbg(&host->chip->pdev->dev, "reset timeout\n");
+	return -EIO;
+
+reset_ok:
 	mmiowb();
 	writel(INT_STATUS_ALL, host->addr + INT_SIGNAL_ENABLE);
 	writel(INT_STATUS_ALL, host->addr + INT_STATUS_ENABLE);
+	return 0;
 }
 
-static void jmb38x_ms_set_param(struct memstick_host *msh,
-				enum memstick_param param,
-				int value)
+static int jmb38x_ms_set_param(struct memstick_host *msh,
+			       enum memstick_param param,
+			       int value)
 {
 	struct jmb38x_ms_host *host = memstick_priv(msh);
 	unsigned int host_ctl = readl(host->addr + HOST_CONTROL);
 	unsigned int clock_ctl = CLOCK_CONTROL_40MHZ, clock_delay = 0;
+	int rc = 0;
 
 	switch (param) {
 	case MEMSTICK_POWER:
 		if (value == MEMSTICK_POWER_ON) {
-			jmb38x_ms_reset(host);
+			rc = jmb38x_ms_reset(host);
+			if (rc)
+				return rc;
+
+			host_ctl = 7;
+			host_ctl |= HOST_CONTROL_POWER_EN
+				 | HOST_CONTROL_CLOCK_EN;
+			writel(host_ctl, host->addr + HOST_CONTROL);
 
 			writel(host->id ? PAD_PU_PD_ON_MS_SOCK1
 					: PAD_PU_PD_ON_MS_SOCK0,
@@ -647,11 +679,7 @@ static void jmb38x_ms_set_param(struct memstick_host *msh,
 			writel(PAD_OUTPUT_ENABLE_MS,
 			       host->addr + PAD_OUTPUT_ENABLE);
 
-			host_ctl = 7;
-			host_ctl |= HOST_CONTROL_POWER_EN
-				 | HOST_CONTROL_CLOCK_EN;
-			writel(host_ctl, host->addr + HOST_CONTROL);
-
+			msleep(10);
 			dev_dbg(&host->chip->pdev->dev, "power on\n");
 		} else if (value == MEMSTICK_POWER_OFF) {
 			host_ctl &= ~(HOST_CONTROL_POWER_EN
@@ -660,7 +688,8 @@ static void jmb38x_ms_set_param(struct memstick_host *msh,
 			writel(0, host->addr + PAD_OUTPUT_ENABLE);
 			writel(PAD_PU_PD_OFF, host->addr + PAD_PU_PD);
 			dev_dbg(&host->chip->pdev->dev, "power off\n");
-		}
+		} else
+			return -EINVAL;
 		break;
 	case MEMSTICK_INTERFACE:
 		host_ctl &= ~(3 << HOST_CONTROL_IF_SHIFT);
@@ -686,12 +715,14 @@ static void jmb38x_ms_set_param(struct memstick_host *msh,
 			host_ctl &= ~HOST_CONTROL_REI;
 			clock_ctl = CLOCK_CONTROL_60MHZ;
 			clock_delay = 0;
-		}
+		} else
+			return -EINVAL;
 		writel(host_ctl, host->addr + HOST_CONTROL);
 		writel(clock_ctl, host->addr + CLOCK_CONTROL);
 		writel(clock_delay, host->addr + CLOCK_DELAY);
 		break;
 	};
+	return 0;
 }
 
 #ifdef CONFIG_PM
diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c
index 8577de4ebb0e..14458764588c 100644
--- a/drivers/memstick/host/tifm_ms.c
+++ b/drivers/memstick/host/tifm_ms.c
@@ -489,15 +489,12 @@ static void tifm_ms_request(struct memstick_host *msh)
 	return;
 }
 
-static void tifm_ms_set_param(struct memstick_host *msh,
-			      enum memstick_param param,
-			      int value)
+static int tifm_ms_set_param(struct memstick_host *msh,
+			     enum memstick_param param,
+			     int value)
 {
 	struct tifm_ms *host = memstick_priv(msh);
 	struct tifm_dev *sock = host->dev;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sock->lock, flags);
 
 	switch (param) {
 	case MEMSTICK_POWER:
@@ -512,7 +509,8 @@ static void tifm_ms_set_param(struct memstick_host *msh,
 			writel(TIFM_MS_SYS_FCLR | TIFM_MS_SYS_INTCLR,
 			       sock->addr + SOCK_MS_SYSTEM);
 			writel(0xffffffff, sock->addr + SOCK_MS_STATUS);
-		}
+		} else
+			return -EINVAL;
 		break;
 	case MEMSTICK_INTERFACE:
 		if (value == MEMSTICK_SERIAL) {
@@ -525,11 +523,12 @@ static void tifm_ms_set_param(struct memstick_host *msh,
 			writel(TIFM_CTRL_FAST_CLK
 			       | readl(sock->addr + SOCK_CONTROL),
 			       sock->addr + SOCK_CONTROL);
-		}
+		} else
+			return -EINVAL;
 		break;
 	};
 
-	spin_unlock_irqrestore(&sock->lock, flags);
+	return 0;
 }
 
 static void tifm_ms_abort(unsigned long data)
diff --git a/include/linux/memstick.h b/include/linux/memstick.h
index 37a5cdb03918..2fe599c66d52 100644
--- a/include/linux/memstick.h
+++ b/include/linux/memstick.h
@@ -284,7 +284,7 @@ struct memstick_host {
 	/* Notify the host that some requests are pending. */
 	void                (*request)(struct memstick_host *host);
 	/* Set host IO parameters (power, clock, etc).     */
-	void                (*set_param)(struct memstick_host *host,
+	int                 (*set_param)(struct memstick_host *host,
 					 enum memstick_param param,
 					 int value);
 	unsigned long       private[0] ____cacheline_aligned;
-- 
cgit v1.2.3


From 17017d8d2c005734d7088d8281ce2daab8fcb097 Mon Sep 17 00:00:00 2001
From: Alex Dubov <oakad@yahoo.com>
Date: Fri, 25 Jul 2008 19:45:01 -0700
Subject: memstick: add "start" and "stop" methods to memstick device

In some cases it may be desirable to ensure that associated driver is not
going to access the media in some period of time.  "start" and "stop"
methods are provided therefore to allow it.

Signed-off-by: Alex Dubov <oakad@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/memstick/core/memstick.c    | 11 ++++++++---
 drivers/memstick/core/mspro_block.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/memstick.h            |  4 ++++
 3 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index 3c7d9a79c1ea..7162f772bbfb 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -433,8 +433,11 @@ static void memstick_check(struct work_struct *work)
 
 	dev_dbg(&host->dev, "memstick_check started\n");
 	mutex_lock(&host->lock);
-	if (!host->card)
-		memstick_power_on(host);
+	if (!host->card) {
+		if (memstick_power_on(host))
+			goto out_power_off;
+	} else
+		host->card->stop(host->card);
 
 	card = memstick_alloc_card(host);
 
@@ -452,7 +455,8 @@ static void memstick_check(struct work_struct *work)
 			    || !(host->card->check(host->card))) {
 				device_unregister(&host->card->dev);
 				host->card = NULL;
-			}
+			} else
+				host->card->start(host->card);
 		}
 
 		if (!host->card) {
@@ -465,6 +469,7 @@ static void memstick_check(struct work_struct *work)
 			kfree(card);
 	}
 
+out_power_off:
 	if (!host->card)
 		host->set_param(host, MEMSTICK_POWER, MEMSTICK_POWER_OFF);
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 477d0fb6e588..004ac4d176d9 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -752,6 +752,37 @@ static int mspro_block_has_request(struct mspro_block_data *msb)
 	return rc;
 }
 
+static void mspro_block_stop(struct memstick_dev *card)
+{
+	struct mspro_block_data *msb = memstick_get_drvdata(card);
+	int rc = 0;
+	unsigned long flags;
+
+	while (1) {
+		spin_lock_irqsave(&msb->q_lock, flags);
+		if (!msb->has_request) {
+			blk_stop_queue(msb->queue);
+			rc = 1;
+		}
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+
+		if (rc)
+			break;
+
+		wait_for_completion(&card->mrq_complete);
+	}
+}
+
+static void mspro_block_start(struct memstick_dev *card)
+{
+	struct mspro_block_data *msb = memstick_get_drvdata(card);
+	unsigned long flags;
+
+	spin_lock_irqsave(&msb->q_lock, flags);
+	blk_start_queue(msb->queue);
+	spin_unlock_irqrestore(&msb->q_lock, flags);
+}
+
 static int mspro_block_queue_thread(void *data)
 {
 	struct memstick_dev *card = data;
@@ -1272,6 +1303,8 @@ static int mspro_block_probe(struct memstick_dev *card)
 	rc = mspro_block_init_disk(card);
 	if (!rc) {
 		card->check = mspro_block_check_card;
+		card->stop = mspro_block_stop;
+		card->start = mspro_block_start;
 		return 0;
 	}
 
diff --git a/include/linux/memstick.h b/include/linux/memstick.h
index 2fe599c66d52..a9f998a3f48b 100644
--- a/include/linux/memstick.h
+++ b/include/linux/memstick.h
@@ -263,6 +263,10 @@ struct memstick_dev {
 	/* Get next request from the media driver.                         */
 	int                      (*next_request)(struct memstick_dev *card,
 						 struct memstick_request **mrq);
+	/* Tell the media driver to stop doing things                      */
+	void                     (*stop)(struct memstick_dev *card);
+	/* Allow the media driver to continue                              */
+	void                     (*start)(struct memstick_dev *card);
 
 	struct device            dev;
 };
-- 
cgit v1.2.3


From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 25 Jul 2008 19:45:07 -0700
Subject: kexec jump

This patch provides an enhancement to kexec/kdump.  It implements the
following features:

- Backup/restore memory used by the original kernel before/after
  kexec.

- Save/restore CPU state before/after kexec.

The features of this patch can be used as a general method to call program in
physical mode (paging turning off).  This can be used to call BIOS code under
Linux.

kexec-tools needs to be patched to support kexec jump. The patches and
the precompiled kexec can be download from the following URL:

       source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2
       patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2
       binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10

Usage example of calling some physical mode code and return:

1. Compile and install patched kernel with following options selected:

CONFIG_X86_32=y
CONFIG_KEXEC=y
CONFIG_PM=y
CONFIG_KEXEC_JUMP=y

2. Build patched kexec-tool or download the pre-built one.

3. Build some physical mode executable named such as "phy_mode"

4. Boot kernel compiled in step 1.

5. Load physical mode executable with /sbin/kexec. The shell command
   line can be as follow:

   /sbin/kexec --load-preserve-context --args-none phy_mode

6. Call physical mode executable with following shell command line:

   /sbin/kexec -e

Implementation point:

To support jumping without reserving memory.  One shadow backup page (source
page) is allocated for each page used by kexeced code image (destination
page).  When do kexec_load, the image of kexeced code is loaded into source
pages, and before executing, the destination pages and the source pages are
swapped, so the contents of destination pages are backupped.  Before jumping
to the kexeced code image and after jumping back to the original kernel, the
destination pages and the source pages are swapped too.

C ABI (calling convention) is used as communication protocol between
kernel and called code.

A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to
indicate that the loaded kernel image is used for jumping back.

Now, only the i386 architecture is supported.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/machine_kexec.c  |   2 +-
 arch/sh/kernel/machine_kexec.c       |   2 +-
 arch/x86/Kconfig                     |   7 ++
 arch/x86/kernel/machine_kexec_32.c   |  27 ++++--
 arch/x86/kernel/machine_kexec_64.c   |   2 +-
 arch/x86/kernel/relocate_kernel_32.S | 174 ++++++++++++++++++++++++++++++-----
 include/asm-x86/kexec.h              |  18 ++--
 include/linux/kexec.h                |  17 +++-
 kernel/kexec.c                       |  57 ++++++++++++
 kernel/sys.c                         |  31 ++-----
 10 files changed, 269 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c
index 29a0e039d436..aab76887a842 100644
--- a/arch/powerpc/kernel/machine_kexec.c
+++ b/arch/powerpc/kernel/machine_kexec.c
@@ -48,7 +48,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	if (ppc_md.machine_kexec)
 		ppc_md.machine_kexec(image);
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 5c17de51987e..ec1eadce4aaa 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -70,7 +70,7 @@ static void kexec_info(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 
 	unsigned long page_list;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e3cba0b45600..7ecb679f0130 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1279,6 +1279,13 @@ config CRASH_DUMP
 	  (CONFIG_RELOCATABLE=y).
 	  For more details see Documentation/kdump/kdump.txt
 
+config KEXEC_JUMP
+	bool "kexec jump (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on KEXEC && PM_SLEEP && X86_32
+	help
+	  Invoke code in physical address mode via KEXEC
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
 	default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..2b67609d0a1c 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -22,6 +22,7 @@
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
 #include <asm/system.h>
+#include <asm/cacheflush.h>
 
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
 static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -85,10 +86,12 @@ static void load_segments(void)
  * reboot code buffer to allow us to avoid allocations
  * later.
  *
- * Currently nothing.
+ * Make control page executable.
  */
 int machine_kexec_prepare(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_x(image->control_code_page, 1);
 	return 0;
 }
 
@@ -98,16 +101,24 @@ int machine_kexec_prepare(struct kimage *image)
  */
 void machine_kexec_cleanup(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_nx(image->control_code_page, 1);
 }
 
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	asmlinkage unsigned long
+		(*relocate_kernel_ptr)(unsigned long indirection_page,
+				       unsigned long control_page,
+				       unsigned long start_address,
+				       unsigned int has_pae,
+				       unsigned int preserve_context);
 
 	tracer_disable();
 
@@ -115,10 +126,11 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	local_irq_disable();
 
 	control_page = page_address(image->control_code_page);
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
 
+	relocate_kernel_ptr = control_page;
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
-	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_PGD] = __pa(kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
 #ifdef CONFIG_X86_PAE
@@ -131,6 +143,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
 	page_list[PA_PTE_1] = __pa(kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+	page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
 
 	/* The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -149,8 +162,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	set_idt(phys_to_virt(0),0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start, cpu_has_pae);
+	image->start = relocate_kernel_ptr((unsigned long)image->head,
+					   (unsigned long)page_list,
+					   image->start, cpu_has_pae,
+					   image->preserve_context);
 }
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..703310a99023 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define PAE_PGD_ATTR (_PAGE_PRESENT)
 
+/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
+ * used to save some data for jumping back
+ */
+#define DATA(offset)		(PAGE_SIZE/2+(offset))
+
+/* Minimal CPU state */
+#define ESP			DATA(0x0)
+#define CR0			DATA(0x4)
+#define CR3			DATA(0x8)
+#define CR4			DATA(0xc)
+
+/* other data */
+#define CP_VA_CONTROL_PAGE	DATA(0x10)
+#define CP_PA_PGD		DATA(0x14)
+#define CP_PA_SWAP_PAGE		DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x1c)
+
 	.text
 	.align PAGE_SIZE
 	.globl relocate_kernel
 relocate_kernel:
-	movl	8(%esp), %ebp /* list of pages */
+	/* Save the CPU context, used for jumping back */
+
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushf
+
+	movl	20+8(%esp), %ebp /* list of pages */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%esp, ESP(%edi)
+	movl	%cr0, %eax
+	movl	%eax, CR0(%edi)
+	movl	%cr3, %eax
+	movl	%eax, CR3(%edi)
+	movl	%cr4, %eax
+	movl	%eax, CR4(%edi)
 
 #ifdef CONFIG_X86_PAE
 	/* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
 
 relocate_new_kernel:
 	/* read the arguments and say goodbye to the stack */
-	movl  4(%esp), %ebx /* page_list */
-	movl  8(%esp), %ebp /* list of pages */
-	movl  12(%esp), %edx /* start address */
-	movl  16(%esp), %ecx /* cpu_has_pae */
+	movl  20+4(%esp), %ebx /* page_list */
+	movl  20+8(%esp), %ebp /* list of pages */
+	movl  20+12(%esp), %edx /* start address */
+	movl  20+16(%esp), %ecx /* cpu_has_pae */
+	movl  20+20(%esp), %esi /* preserve_context */
 
 	/* zero out flags, and disable interrupts */
 	pushl $0
 	popfl
 
+	/* save some information for jumping back */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%edi, CP_VA_CONTROL_PAGE(%edi)
+	movl	PTR(PA_PGD)(%ebp), %eax
+	movl	%eax, CP_PA_PGD(%edi)
+	movl	PTR(PA_SWAP_PAGE)(%ebp), %eax
+	movl	%eax, CP_PA_SWAP_PAGE(%edi)
+	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
+
 	/* get physical address of control page now */
 	/* this is impossible after page table switch */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
+	movl	CP_PA_SWAP_PAGE(%edi), %eax
+	pushl	%eax
+	pushl	%ebx
+	call	swap_pages
+	addl	$8, %esp
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB, it's handy, and not processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/* set all of the registers to known values */
+	/* leave %esp alone */
+
+	testl	%esi, %esi
+	jnz 1f
+	xorl	%edi, %edi
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+	xorl    %ecx, %ecx
+	xorl    %edx, %edx
+	xorl    %esi, %esi
+	xorl    %ebp, %ebp
+	ret
+1:
+	popl	%edx
+	movl	CP_PA_SWAP_PAGE(%edi), %esp
+	addl	$PAGE_SIZE, %esp
+2:
+	call	*%edx
+
+	/* get the re-entry point of the peer system */
+	movl	0(%esp), %ebp
+	call	1f
+1:
+	popl	%ebx
+	subl	$(1b - relocate_kernel), %ebx
+	movl	CP_VA_CONTROL_PAGE(%ebx), %edi
+	lea	PAGE_SIZE(%ebx), %esp
+	movl	CP_PA_SWAP_PAGE(%ebx), %eax
+	movl	CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+	pushl	%eax
+	pushl	%edx
+	call	swap_pages
+	addl	$8, %esp
+	movl	CP_PA_PGD(%ebx), %eax
+	movl	%eax, %cr3
+	movl	%cr0, %eax
+	orl	$(1<<31), %eax
+	movl	%eax, %cr0
+	lea	PAGE_SIZE(%edi), %esp
+	movl	%edi, %eax
+	addl	$(virtual_mapped - relocate_kernel), %eax
+	pushl	%eax
+	ret
+
+virtual_mapped:
+	movl	CR4(%edi), %eax
+	movl	%eax, %cr4
+	movl	CR3(%edi), %eax
+	movl	%eax, %cr3
+	movl	CR0(%edi), %eax
+	movl	%eax, %cr0
+	movl	ESP(%edi), %esp
+	movl	%ebp, %eax
+
+	popf
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	ret
+
 	/* Do the copies */
-	movl	%ebx, %ecx
+swap_pages:
+	movl	8(%esp), %edx
+	movl	4(%esp), %ecx
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%edi
+	pushl	%esi
+	movl	%ecx, %ebx
 	jmp	1f
 
 0:	/* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
 	movl    %ecx,   %esi /* For every source page do a copy */
 	andl    $0xfffff000, %esi
 
+	movl	%edi, %eax
+	movl	%esi, %ebp
+
+	movl	%edx, %edi
 	movl    $1024, %ecx
 	rep ; movsl
-	jmp     0b
 
-3:
-
-	/* To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB, it's handy, and not processor dependent.
-	 */
-	xorl	%eax, %eax
-	movl	%eax, %cr3
+	movl	%ebp, %edi
+	movl	%eax, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	movl	%eax, %edi
+	movl	%edx, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	xorl	%eax, %eax
-	xorl	%ebx, %ebx
-	xorl    %ecx, %ecx
-	xorl    %edx, %edx
-	xorl    %esi, %esi
-	xorl    %edi, %edi
-	xorl    %ebp, %ebp
+	lea	PAGE_SIZE(%ebp), %esi
+	jmp     0b
+3:
+	popl	%esi
+	popl	%edi
+	popl	%ebx
+	popl	%ebp
 	ret
diff --git a/include/asm-x86/kexec.h b/include/asm-x86/kexec.h
index 8f855a15f64d..c0e52a14fd4d 100644
--- a/include/asm-x86/kexec.h
+++ b/include/asm-x86/kexec.h
@@ -10,14 +10,15 @@
 # define VA_PTE_0		5
 # define PA_PTE_1		6
 # define VA_PTE_1		7
+# define PA_SWAP_PAGE		8
 # ifdef CONFIG_X86_PAE
-#  define PA_PMD_0		8
-#  define VA_PMD_0		9
-#  define PA_PMD_1		10
-#  define VA_PMD_1		11
-#  define PAGES_NR		12
+#  define PA_PMD_0		9
+#  define VA_PMD_0		10
+#  define PA_PMD_1		11
+#  define VA_PMD_1		12
+#  define PAGES_NR		13
 # else
-#  define PAGES_NR		8
+#  define PAGES_NR		9
 # endif
 #else
 # define PA_CONTROL_PAGE	0
@@ -152,11 +153,12 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
 }
 
 #ifdef CONFIG_X86_32
-asmlinkage NORET_TYPE void
+asmlinkage unsigned long
 relocate_kernel(unsigned long indirection_page,
 		unsigned long control_page,
 		unsigned long start_address,
-		unsigned int has_pae) ATTRIB_NORET;
+		unsigned int has_pae,
+		unsigned int preserve_context);
 #else
 NORET_TYPE void
 relocate_kernel(unsigned long indirection_page,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 3265968cd2cd..82f88a8a827b 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -83,6 +83,7 @@ struct kimage {
 
 	unsigned long start;
 	struct page *control_code_page;
+	struct page *swap_page;
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
@@ -98,18 +99,20 @@ struct kimage {
 	unsigned int type : 1;
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
+	unsigned int preserve_context : 1;
 };
 
 
 /* kexec interface functions */
-extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+extern void machine_kexec(struct kimage *image);
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
 extern asmlinkage long sys_kexec_load(unsigned long entry,
 					unsigned long nr_segments,
 					struct kexec_segment __user *segments,
 					unsigned long flags);
+extern int kernel_kexec(void);
 #ifdef CONFIG_COMPAT
 extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
 				unsigned long nr_segments,
@@ -156,8 +159,9 @@ extern struct kimage *kexec_crash_image;
 #define kexec_flush_icache_page(page)
 #endif
 
-#define KEXEC_ON_CRASH  0x00000001
-#define KEXEC_ARCH_MASK 0xffff0000
+#define KEXEC_ON_CRASH		0x00000001
+#define KEXEC_PRESERVE_CONTEXT	0x00000002
+#define KEXEC_ARCH_MASK		0xffff0000
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.
@@ -174,7 +178,12 @@ extern struct kimage *kexec_crash_image;
 #define KEXEC_ARCH_MIPS_LE (10 << 16)
 #define KEXEC_ARCH_MIPS    ( 8 << 16)
 
-#define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
+/* List of defined/legal kexec flags */
+#ifndef CONFIG_KEXEC_JUMP
+#define KEXEC_FLAGS    KEXEC_ON_CRASH
+#else
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT)
+#endif
 
 #define VMCOREINFO_BYTES           (4096)
 #define VMCOREINFO_NOTE_NAME       "VMCOREINFO"
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6db42ff8d520..a0d920915b38 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -24,6 +24,8 @@
 #include <linux/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -242,6 +244,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 		goto out;
 	}
 
+	image->swap_page = kimage_alloc_control_pages(image, 0);
+	if (!image->swap_page) {
+		printk(KERN_ERR "Could not allocate swap buffer\n");
+		goto out;
+	}
+
 	result = 0;
  out:
 	if (result == 0)
@@ -986,6 +994,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 		if (result)
 			goto out;
 
+		if (flags & KEXEC_PRESERVE_CONTEXT)
+			image->preserve_context = 1;
 		result = machine_kexec_prepare(image);
 		if (result)
 			goto out;
@@ -1411,3 +1421,50 @@ static int __init crash_save_vmcoreinfo_init(void)
 }
 
 module_init(crash_save_vmcoreinfo_init)
+
+/**
+ *	kernel_kexec - reboot the system
+ *
+ *	Move into place and start executing a preloaded standalone
+ *	executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+	int error = 0;
+
+	if (xchg(&kexec_lock, 1))
+		return -EBUSY;
+	if (!kexec_image) {
+		error = -EINVAL;
+		goto Unlock;
+	}
+
+	if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+		local_irq_disable();
+		save_processor_state();
+#endif
+	} else {
+		blocking_notifier_call_chain(&reboot_notifier_list,
+					     SYS_RESTART, NULL);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		sysdev_shutdown();
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_shutdown();
+	}
+
+	machine_kexec(kexec_image);
+
+	if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+		restore_processor_state();
+		local_irq_enable();
+#endif
+	}
+
+ Unlock:
+	xchg(&kexec_lock, 0);
+
+	return error;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 0c9d3fa1f5ff..c01858090a98 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
 
-/**
- *	kernel_kexec - reboot the system
- *
- *	Move into place and start executing a preloaded standalone
- *	executable.  If nothing was preloaded return an error.
- */
-static void kernel_kexec(void)
-{
-#ifdef CONFIG_KEXEC
-	struct kimage *image;
-	image = xchg(&kexec_image, NULL);
-	if (!image)
-		return;
-	kernel_restart_prepare(NULL);
-	printk(KERN_EMERG "Starting new kernel\n");
-	machine_shutdown();
-	machine_kexec(image);
-#endif
-}
-
 static void kernel_shutdown_prepare(enum system_states state)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		kernel_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
 	case LINUX_REBOOT_CMD_KEXEC:
-		kernel_kexec();
-		unlock_kernel();
-		return -EINVAL;
+		{
+			int ret;
+			ret = kernel_kexec();
+			unlock_kernel();
+			return ret;
+		}
+#endif
 
 #ifdef CONFIG_HIBERNATION
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
-- 
cgit v1.2.3


From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 25 Jul 2008 19:45:10 -0700
Subject: kexec jump: save/restore device state

This patch implements devices state save/restore before after kexec.

This patch together with features in kexec_jump patch can be used for
following:

- A simple hibernation implementation without ACPI support.  You can kexec a
  hibernating kernel, save the memory image of original system and shutdown
  the system.  When resuming, you restore the memory image of original system
  via ordinary kexec load then jump back.

- Kernel/system debug through making system snapshot.  You can make system
  snapshot, jump back, do some thing and make another system snapshot.

- Cooperative multi-kernel/system.  With kexec jump, you can switch between
  several kernels/systems quickly without boot process except the first time.
  This appears like swap a whole kernel/system out/in.

- A general method to call program in physical mode (paging turning
  off). This can be used to invoke BIOS code under Linux.

The following user-space tools can be used with kexec jump:

- kexec-tools needs to be patched to support kexec jump. The patches
  and the precompiled kexec can be download from the following URL:
       source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2
       patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2
       binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10

- makedumpfile with patches are used as memory image saving tool, it
  can exclude free pages from original kernel memory image file. The
  patches and the precompiled makedumpfile can be download from the
  following URL:
       source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2
       patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2
       binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10

- An initramfs image can be used as the root file system of kexeced
  kernel. An initramfs image built with "BuildRoot" can be downloaded
  from the following URL:
       initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz
  All user space tools above are included in the initramfs image.

Usage example of simple hibernation:

1. Compile and install patched kernel with following options selected:

CONFIG_X86_32=y
CONFIG_RELOCATABLE=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
CONFIG_PM=y
CONFIG_HIBERNATION=y
CONFIG_KEXEC_JUMP=y

2. Build an initramfs image contains kexec-tool and makedumpfile, or
   download the pre-built initramfs image, called rootfs.gz in
   following text.

3. Prepare a partition to save memory image of original kernel, called
   hibernating partition in following text.

4. Boot kernel compiled in step 1 (kernel A).

5. In the kernel A, load kernel compiled in step 1 (kernel B) with
   /sbin/kexec. The shell command line can be as follow:

   /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000
     --mem-max=0xffffff --initrd=rootfs.gz

6. Boot the kernel B with following shell command line:

   /sbin/kexec -e

7. The kernel B will boot as normal kexec. In kernel B the memory
   image of kernel A can be saved into hibernating partition as
   follow:

   jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='`
   echo $jump_back_entry > kexec_jump_back_entry
   cp /proc/vmcore dump.elf

   Then you can shutdown the machine as normal.

8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as
   root file system.

9. In kernel C, load the memory image of kernel A as follow:

   /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf

10. Jump back to the kernel A as follow:

   /sbin/kexec -e

   Then, kernel A is resumed.

Implementation point:

To support jumping between two kernels, before jumping to (executing)
the new kernel and jumping back to the original kernel, the devices
are put into quiescent state, and the state of devices and CPU is
saved. After jumping back from kexeced kernel and jumping to the new
kernel, the state of devices and CPU are restored accordingly. The
devices/CPU state save/restore code of software suspend is called to
implement corresponding function.

Known issues:

- Because the segment number supported by sys_kexec_load is limited,
  hibernation image with many segments may not be load. This is
  planned to be eliminated by adding a new flag to sys_kexec_load to
  make a image can be loaded with multiple sys_kexec_load invoking.

Now, only the i386 architecture is supported.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                   |  5 +++--
 arch/x86/kernel/machine_kexec_32.c | 12 ++++++++++++
 include/linux/suspend.h            |  2 ++
 kernel/kexec.c                     | 39 ++++++++++++++++++++++++++++++++++++++
 kernel/power/power.h               |  2 --
 5 files changed, 56 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7ecb679f0130..6b2debfabddc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1282,9 +1282,10 @@ config CRASH_DUMP
 config KEXEC_JUMP
 	bool "kexec jump (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
-	depends on KEXEC && PM_SLEEP && X86_32
+	depends on KEXEC && HIBERNATION && X86_32
 	help
-	  Invoke code in physical address mode via KEXEC
+	  Jump between original kernel and kexeced kernel and invoke
+	  code in physical address mode via KEXEC
 
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 2b67609d0a1c..9fe478d98406 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -125,6 +125,18 @@ void machine_kexec(struct kimage *image)
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/* We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or other. kexec jump path also need
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
 	control_page = page_address(image->control_code_page);
 	memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index e8e69159af71..c63435095970 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -278,4 +278,6 @@ static inline void register_nosave_region_late(unsigned long b, unsigned long e)
 }
 #endif
 
+extern struct mutex pm_mutex;
+
 #endif /* _LINUX_SUSPEND_H */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a0d920915b38..c8a4370e2a34 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -26,6 +26,10 @@
 #include <linux/numa.h>
 #include <linux/suspend.h>
 #include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1441,7 +1445,31 @@ int kernel_kexec(void)
 
 	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
+		mutex_lock(&pm_mutex);
+		pm_prepare_console();
+		error = freeze_processes();
+		if (error) {
+			error = -EBUSY;
+			goto Restore_console;
+		}
+		suspend_console();
+		error = device_suspend(PMSG_FREEZE);
+		if (error)
+			goto Resume_console;
+		error = disable_nonboot_cpus();
+		if (error)
+			goto Resume_devices;
 		local_irq_disable();
+		/* At this point, device_suspend() has been called,
+		 * but *not* device_power_down(). We *must*
+		 * device_power_down() now.  Otherwise, drivers for
+		 * some devices (e.g. interrupt controllers) become
+		 * desynchronized with the actual state of the
+		 * hardware at resume time, and evil weirdness ensues.
+		 */
+		error = device_power_down(PMSG_FREEZE);
+		if (error)
+			goto Enable_irqs;
 		save_processor_state();
 #endif
 	} else {
@@ -1459,7 +1487,18 @@ int kernel_kexec(void)
 	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
 		restore_processor_state();
+		device_power_up(PMSG_RESTORE);
+ Enable_irqs:
 		local_irq_enable();
+		enable_nonboot_cpus();
+ Resume_devices:
+		device_resume(PMSG_RESTORE);
+ Resume_console:
+		resume_console();
+		thaw_processes();
+ Restore_console:
+		pm_restore_console();
+		mutex_unlock(&pm_mutex);
 #endif
 	}
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec8406..acc0c101dbd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
 
 extern int pfn_is_nosave(unsigned long);
 
-extern struct mutex pm_mutex;
-
 #define power_attr(_name) \
 static struct kobj_attribute _name##_attr = {	\
 	.attr	= {				\
-- 
cgit v1.2.3


From c2147a5092cfe13dbf3210e54e8a622015edeecc Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Fri, 25 Jul 2008 19:45:11 -0700
Subject: Better interface for hooking early initcalls

Added early initcall (pre-SMP) support, using an identical interface to
that of regular initcalls.  Functions called from do_pre_smp_initcalls()
could be converted to use this cleaner interface.

This is required by CPU hotplug, because early users have to register
notifiers before going SMP.  One such CPU hotplug user is the relay
interface with buffer-only channels, which needs to register such a
notifier, to be usable in early code.  This in turn is used by kmemtrace.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/vmlinux.lds.h |  2 ++
 include/linux/init.h              |  7 +++++++
 init/main.c                       | 13 +++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 729f6b0a60e9..9cd44b162ba1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -359,6 +359,8 @@
 	}
 
 #define INITCALLS							\
+	*(.initcallearly.init)						\
+	__early_initcall_end = .;					\
   	*(.initcall0.init)						\
   	*(.initcall0s.init)						\
   	*(.initcall1.init)						\
diff --git a/include/linux/init.h b/include/linux/init.h
index 42ae95411a93..11b84e106053 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -169,6 +169,13 @@ extern void (*late_time_init)(void);
 	static initcall_t __initcall_##fn##id __used \
 	__attribute__((__section__(".initcall" level ".init"))) = fn
 
+/*
+ * Early initcalls run before initializing SMP.
+ *
+ * Only for built-in code, not modules.
+ */
+#define early_initcall(fn)		__define_initcall("early",fn,early)
+
 /*
  * A "pure" initcall has no dependencies on anything else, and purely
  * initializes variables that couldn't be statically initialized.
diff --git a/init/main.c b/init/main.c
index 0604cbcaf1e4..b6fec08dbbef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -743,13 +743,13 @@ static void __init do_one_initcall(initcall_t fn)
 }
 
 
-extern initcall_t __initcall_start[], __initcall_end[];
+extern initcall_t __initcall_start[], __initcall_end[], __early_initcall_end[];
 
 static void __init do_initcalls(void)
 {
 	initcall_t *call;
 
-	for (call = __initcall_start; call < __initcall_end; call++)
+	for (call = __early_initcall_end; call < __initcall_end; call++)
 		do_one_initcall(*call);
 
 	/* Make sure there is no pending stuff from the initcall sequence */
@@ -783,6 +783,14 @@ static int __init nosoftlockup_setup(char *str)
 }
 __setup("nosoftlockup", nosoftlockup_setup);
 
+static void __init __do_pre_smp_initcalls(void)
+{
+	initcall_t *call;
+
+	for (call = __initcall_start; call < __early_initcall_end; call++)
+		do_one_initcall(*call);
+}
+
 static void __init do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
@@ -865,6 +873,7 @@ static int __init kernel_init(void * unused)
 
 	smp_prepare_cpus(setup_max_cpus);
 
+	__do_pre_smp_initcalls();
 	do_pre_smp_initcalls();
 
 	smp_init();
-- 
cgit v1.2.3


From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Fri, 25 Jul 2008 19:45:11 -0700
Subject: Full conversion to early_initcall() interface, remove old interface

A previous patch added the early_initcall(), to allow a cleaner hooking of
pre-SMP initcalls.  Now we remove the older interface, converting all
existing users to the new one.

[akpm@linux-foundation.org: cleanups]
[akpm@linux-foundation.org: build fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  9 ---------
 include/linux/smp.h   |  5 -----
 init/main.c           | 23 +----------------------
 kernel/sched.c        |  5 ++++-
 kernel/smp.c          |  4 +++-
 kernel/softirq.c      |  3 ++-
 kernel/softlockup.c   | 25 ++++++++++++++++++++++---
 7 files changed, 32 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3260a5c42b91..adb8077dc463 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -292,7 +292,6 @@ extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
-extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
@@ -2222,14 +2221,6 @@ static inline void inc_syscw(struct task_struct *tsk)
 }
 #endif
 
-#ifdef CONFIG_SMP
-void migration_init(void);
-#else
-static inline void migration_init(void)
-{
-}
-#endif
-
 #ifndef TASK_SIZE_OF
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 48262f86c969..66484d4a8459 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -74,15 +74,10 @@ void __smp_call_function_single(int cpuid, struct call_single_data *data);
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void generic_smp_call_function_single_interrupt(void);
 void generic_smp_call_function_interrupt(void);
-void init_call_single_data(void);
 void ipi_call_lock(void);
 void ipi_call_unlock(void);
 void ipi_call_lock_irq(void);
 void ipi_call_unlock_irq(void);
-#else
-static inline void init_call_single_data(void)
-{
-}
 #endif
 
 /*
diff --git a/init/main.c b/init/main.c
index b6fec08dbbef..20fdc9884b77 100644
--- a/init/main.c
+++ b/init/main.c
@@ -774,16 +774,7 @@ static void __init do_basic_setup(void)
 	do_initcalls();
 }
 
-static int __initdata nosoftlockup;
-
-static int __init nosoftlockup_setup(char *str)
-{
-	nosoftlockup = 1;
-	return 1;
-}
-__setup("nosoftlockup", nosoftlockup_setup);
-
-static void __init __do_pre_smp_initcalls(void)
+static void __init do_pre_smp_initcalls(void)
 {
 	initcall_t *call;
 
@@ -791,17 +782,6 @@ static void __init __do_pre_smp_initcalls(void)
 		do_one_initcall(*call);
 }
 
-static void __init do_pre_smp_initcalls(void)
-{
-	extern int spawn_ksoftirqd(void);
-
-	init_call_single_data();
-	migration_init();
-	spawn_ksoftirqd();
-	if (!nosoftlockup)
-		spawn_softlockup_task();
-}
-
 static void run_init_process(char *init_filename)
 {
 	argv_init[0] = init_filename;
@@ -873,7 +853,6 @@ static int __init kernel_init(void * unused)
 
 	smp_prepare_cpus(setup_max_cpus);
 
-	__do_pre_smp_initcalls();
 	do_pre_smp_initcalls();
 
 	smp_init();
diff --git a/kernel/sched.c b/kernel/sched.c
index 0047bd9b96aa..fde1a1026359 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-void __init migration_init(void)
+static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -6399,7 +6399,10 @@ void __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
+
+	return err;
 }
+early_initcall(migration_init);
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1ee..96fc7c0edc59 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
 	spinlock_t lock;
 };
 
-void __cpuinit init_call_single_data(void)
+static int __cpuinit init_call_single_data(void)
 {
 	int i;
 
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
 		spin_lock_init(&q->lock);
 		INIT_LIST_HEAD(&q->list);
 	}
+	return 0;
 }
+early_initcall(init_call_single_data);
 
 static void csd_flag_wait(struct call_single_data *data)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f6b03d56c2bf..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+early_initcall(spawn_ksoftirqd);
 
 #ifdef CONFIG_SMP
 /*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7bd8d1aadd5d..b75b492fbfcf 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+
+static int __init nosoftlockup_setup(char *str)
+{
+	nosoftlockup = 1;
+	return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+static int __init spawn_softlockup_task(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	int err;
 
-	BUG_ON(err == NOTIFY_BAD);
+	if (nosoftlockup)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	if (err == NOTIFY_BAD) {
+		BUG();
+		return 1;
+	}
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+	return 0;
 }
+early_initcall(spawn_softlockup_task);
-- 
cgit v1.2.3


From 20d8b67c06fa5e74f44e80b0a0fd68c8327f7c6a Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Fri, 25 Jul 2008 19:45:12 -0700
Subject: relay: add buffer-only channels; useful for early logging

Allows one to create and use a channel with no associated files.  Files
can be initialized later.  This is useful in scenarios such as logging in
early code, before VFS is up.  Therefore, such channels can be created and
used as soon as kmem_cache_init() completed.

This is needed by kmemtrace to do tracing in early kernel code.

[kosaki.motohiro@jp.fujitsu.com: build fix]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/relay.txt |  10 +++
 include/linux/relay.h               |   5 ++
 kernel/relay.c                      | 170 ++++++++++++++++++++++++++++++------
 3 files changed, 156 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
index 094f2d2f38b1..510b722667ac 100644
--- a/Documentation/filesystems/relay.txt
+++ b/Documentation/filesystems/relay.txt
@@ -294,6 +294,16 @@ user-defined data with a channel, and is immediately available
 (including in create_buf_file()) via chan->private_data or
 buf->chan->private_data.
 
+Buffer-only channels
+--------------------
+
+These channels have no files associated and can be created with
+relay_open(NULL, NULL, ...). Such channels are useful in scenarios such
+as when doing early tracing in the kernel, before the VFS is up. In these
+cases, one may open a buffer-only channel and then call
+relay_late_setup_files() when the kernel is ready to handle files,
+to expose the buffered data to the userspace.
+
 Channel 'modes'
 ---------------
 
diff --git a/include/linux/relay.h b/include/linux/relay.h
index 6cd8c4425fc7..953fc055e875 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -48,6 +48,7 @@ struct rchan_buf
 	size_t *padding;		/* padding counts per sub-buffer */
 	size_t prev_padding;		/* temporary variable */
 	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
+	size_t early_bytes;		/* bytes consumed before VFS inited */
 	unsigned int cpu;		/* this buf's cpu */
 } ____cacheline_aligned;
 
@@ -68,6 +69,7 @@ struct rchan
 	int is_global;			/* One global buffer ? */
 	struct list_head list;		/* for channel list */
 	struct dentry *parent;		/* parent dentry passed to open */
+	int has_base_filename;		/* has a filename associated? */
 	char base_filename[NAME_MAX];	/* saved base filename */
 };
 
@@ -169,6 +171,9 @@ struct rchan *relay_open(const char *base_filename,
 			 size_t n_subbufs,
 			 struct rchan_callbacks *cb,
 			 void *private_data);
+extern int relay_late_setup_files(struct rchan *chan,
+				  const char *base_filename,
+				  struct dentry *parent);
 extern void relay_close(struct rchan *chan);
 extern void relay_flush(struct rchan *chan);
 extern void relay_subbufs_consumed(struct rchan *chan,
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..04006ef970b8 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_reset);
 
+static inline void relay_set_buf_dentry(struct rchan_buf *buf,
+					struct dentry *dentry)
+{
+	buf->dentry = dentry;
+	buf->dentry->d_inode->i_size = buf->early_bytes;
+}
+
+static struct dentry *relay_create_buf_file(struct rchan *chan,
+					    struct rchan_buf *buf,
+					    unsigned int cpu)
+{
+	struct dentry *dentry;
+	char *tmpname;
+
+	tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!tmpname)
+		return NULL;
+	snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
+
+	/* Create file in fs */
+	dentry = chan->cb->create_buf_file(tmpname, chan->parent,
+					   S_IRUSR, buf,
+					   &chan->is_global);
+
+	kfree(tmpname);
+
+	return dentry;
+}
+
 /*
  *	relay_open_buf - create a new relay channel buffer
  *
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 {
  	struct rchan_buf *buf = NULL;
 	struct dentry *dentry;
- 	char *tmpname;
 
  	if (chan->is_global)
 		return chan->buf[0];
 
-	tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
- 	if (!tmpname)
- 		goto end;
- 	snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
-
 	buf = relay_create_buf(chan);
 	if (!buf)
- 		goto free_name;
+		return NULL;
+
+	if (chan->has_base_filename) {
+		dentry = relay_create_buf_file(chan, buf, cpu);
+		if (!dentry)
+			goto free_buf;
+		relay_set_buf_dentry(buf, dentry);
+	}
 
  	buf->cpu = cpu;
  	__relay_reset(buf, 1);
 
-	/* Create file in fs */
- 	dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
- 					   buf, &chan->is_global);
- 	if (!dentry)
- 		goto free_buf;
-
-	buf->dentry = dentry;
-
  	if(chan->is_global) {
  		chan->buf[0] = buf;
  		buf->cpu = 0;
   	}
 
- 	goto free_name;
+	return buf;
 
 free_buf:
  	relay_destroy_buf(buf);
- 	buf = NULL;
-free_name:
- 	kfree(tmpname);
-end:
-	return buf;
+	return NULL;
 }
 
 /**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
 
 /**
  *	relay_open - create a new relay channel
- *	@base_filename: base name of files to create
- *	@parent: dentry of parent directory, %NULL for root directory
+ *	@base_filename: base name of files to create, %NULL for buffering only
+ *	@parent: dentry of parent directory, %NULL for root directory or buffer
  *	@subbuf_size: size of sub-buffers
  *	@n_subbufs: number of sub-buffers
  *	@cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
 {
 	unsigned int i;
 	struct rchan *chan;
-	if (!base_filename)
-		return NULL;
 
 	if (!(subbuf_size && n_subbufs))
 		return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
 	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
 	chan->parent = parent;
 	chan->private_data = private_data;
-	strlcpy(chan->base_filename, base_filename, NAME_MAX);
+	if (base_filename) {
+		chan->has_base_filename = 1;
+		strlcpy(chan->base_filename, base_filename, NAME_MAX);
+	}
 	setup_callbacks(chan, cb);
 	kref_init(&chan->kref);
 
@@ -604,6 +623,94 @@ free_bufs:
 }
 EXPORT_SYMBOL_GPL(relay_open);
 
+struct rchan_percpu_buf_dispatcher {
+	struct rchan_buf *buf;
+	struct dentry *dentry;
+};
+
+/* Called in atomic context. */
+static void __relay_set_buf_dentry(void *info)
+{
+	struct rchan_percpu_buf_dispatcher *p = info;
+
+	relay_set_buf_dentry(p->buf, p->dentry);
+}
+
+/**
+ *	relay_late_setup_files - triggers file creation
+ *	@chan: channel to operate on
+ *	@base_filename: base name of files to create
+ *	@parent: dentry of parent directory, %NULL for root directory
+ *
+ *	Returns 0 if successful, non-zero otherwise.
+ *
+ *	Use to setup files for a previously buffer-only channel.
+ *	Useful to do early tracing in kernel, before VFS is up, for example.
+ */
+int relay_late_setup_files(struct rchan *chan,
+			   const char *base_filename,
+			   struct dentry *parent)
+{
+	int err = 0;
+	unsigned int i, curr_cpu;
+	unsigned long flags;
+	struct dentry *dentry;
+	struct rchan_percpu_buf_dispatcher disp;
+
+	if (!chan || !base_filename)
+		return -EINVAL;
+
+	strlcpy(chan->base_filename, base_filename, NAME_MAX);
+
+	mutex_lock(&relay_channels_mutex);
+	/* Is chan already set up? */
+	if (unlikely(chan->has_base_filename))
+		return -EEXIST;
+	chan->has_base_filename = 1;
+	chan->parent = parent;
+	curr_cpu = get_cpu();
+	/*
+	 * The CPU hotplug notifier ran before us and created buffers with
+	 * no files associated. So it's safe to call relay_setup_buf_file()
+	 * on all currently online CPUs.
+	 */
+	for_each_online_cpu(i) {
+		if (unlikely(!chan->buf[i])) {
+			printk(KERN_ERR "relay_late_setup_files: CPU %u "
+					"has no buffer, it must have!\n", i);
+			BUG();
+			err = -EINVAL;
+			break;
+		}
+
+		dentry = relay_create_buf_file(chan, chan->buf[i], i);
+		if (unlikely(!dentry)) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (curr_cpu == i) {
+			local_irq_save(flags);
+			relay_set_buf_dentry(chan->buf[i], dentry);
+			local_irq_restore(flags);
+		} else {
+			disp.buf = chan->buf[i];
+			disp.dentry = dentry;
+			smp_mb();
+			/* relay_channels_mutex must be held, so wait. */
+			err = smp_call_function_single(i,
+						       __relay_set_buf_dentry,
+						       &disp, 1);
+		}
+		if (unlikely(err))
+			break;
+	}
+	put_cpu();
+	mutex_unlock(&relay_channels_mutex);
+
+	return err;
+}
+
 /**
  *	relay_switch_subbuf - switch to a new sub-buffer
  *	@buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
 		buf->padding[old_subbuf] = buf->prev_padding;
 		buf->subbufs_produced++;
-		buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
-			buf->padding[old_subbuf];
+		if (buf->dentry)
+			buf->dentry->d_inode->i_size +=
+				buf->chan->subbuf_size -
+				buf->padding[old_subbuf];
+		else
+			buf->early_bytes += buf->chan->subbuf_size -
+					    buf->padding[old_subbuf];
 		smp_mb();
 		if (waitqueue_active(&buf->read_wait))
 			/*
@@ -1237,4 +1349,4 @@ static __init int relay_init(void)
 	return 0;
 }
 
-module_init(relay_init);
+early_initcall(relay_init);
-- 
cgit v1.2.3


From 080ccd4573607a930367c2128fc709814b2ade5d Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Fri, 25 Jul 2008 19:45:13 -0700
Subject: include/linux/aio.h: removed duplicated include

Removed duplicated include <linux/uio.h> in include/linux/aio.h

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/aio.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/aio.h b/include/linux/aio.h
index b51ddd28444e..09b276c35227 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -7,7 +7,6 @@
 #include <linux/uio.h>
 
 #include <asm/atomic.h>
-#include <linux/uio.h>
 
 #define AIO_MAXSEGS		4
 #define AIO_KIOGRP_NR_ATOMIC	8
-- 
cgit v1.2.3


From a0a8f5364a5ad248aec6cb705e0092ff563edc2f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 25 Jul 2008 19:45:20 -0700
Subject: x86: implement pte_special

Implement the pte_special bit for x86.  This is required to support
lockless get_user_pages, because we need to know whether or not we can
refcount a particular page given only its pte (and no vma).

[hugh@veritas.com: fix a BUG]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-x86/pgtable.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
index 3e5dbc4195f4..04caa2f544df 100644
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -18,6 +18,7 @@
 #define _PAGE_BIT_UNUSED2	10
 #define _PAGE_BIT_UNUSED3	11
 #define _PAGE_BIT_PAT_LARGE	12	/* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL	_PAGE_BIT_UNUSED1
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
 #define _PAGE_PRESENT	(_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
@@ -34,6 +35,8 @@
 #define _PAGE_UNUSED3	(_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define __HAVE_ARCH_PTE_SPECIAL
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
@@ -54,7 +57,7 @@
 
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
-			 _PAGE_ACCESSED | _PAGE_DIRTY)
+			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
 
 #define _PAGE_CACHE_MASK	(_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB		(0)
@@ -180,7 +183,7 @@ static inline int pte_exec(pte_t pte)
 
 static inline int pte_special(pte_t pte)
 {
-	return 0;
+	return pte_val(pte) & _PAGE_SPECIAL;
 }
 
 static inline int pmd_large(pmd_t pte)
@@ -246,7 +249,7 @@ static inline pte_t pte_clrglobal(pte_t pte)
 
 static inline pte_t pte_mkspecial(pte_t pte)
 {
-	return pte;
+	return __pte(pte_val(pte) | _PAGE_SPECIAL);
 }
 
 extern pteval_t __supported_pte_mask;
-- 
cgit v1.2.3


From 21cc199baa815d7b3f1ace4be20b9558cbddc00f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 25 Jul 2008 19:45:22 -0700
Subject: mm: introduce get_user_pages_fast

Introduce a new get_user_pages_fast mm API, which is basically a
get_user_pages with a less general API (but still tends to be suited to
the common case):

- task and mm are always current and current->mm
- force is always 0
- pages is always non-NULL
- don't pass back vmas

This restricted API can be implemented in a much more scalable way on many
architectures when the ptes are present, by walking the page tables
locklessly (no mmap_sem or page table locks).  When the ptes are not
populated, get_user_pages_fast() could be slower.

This is implemented locklessly on x86, and used in some key direct IO call
sites, in later patches, which provides nearly 10% performance improvement
on a threaded database workload.

Lots of other code could use this too, depending on use cases (eg.  grep
drivers/).  And it might inspire some new and clever ways to use it.

[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d87a5a5fe87d..f3fd70d6029f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -833,6 +833,39 @@ extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
 			  unsigned long end, unsigned long newflags);
 
+#ifdef CONFIG_HAVE_GET_USER_PAGES_FAST
+/*
+ * get_user_pages_fast provides equivalent functionality to get_user_pages,
+ * operating on current and current->mm (force=0 and doesn't return any vmas).
+ *
+ * get_user_pages_fast may take mmap_sem and page tables, so no assumptions
+ * can be made about locking. get_user_pages_fast is to be implemented in a
+ * way that is advantageous (vs get_user_pages()) when the user memory area is
+ * already faulted in and present in ptes. However if the pages have to be
+ * faulted in, it may turn out to be slightly slower).
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages);
+
+#else
+/*
+ * Should probably be moved to asm-generic, and architectures can include it if
+ * they don't implement their own get_user_pages_fast.
+ */
+#define get_user_pages_fast(start, nr_pages, write, pages)	\
+({								\
+	struct mm_struct *mm = current->mm;			\
+	int ret;						\
+								\
+	down_read(&mm->mmap_sem);				\
+	ret = get_user_pages(current, mm, start, nr_pages,	\
+					write, 0, pages, NULL);	\
+	up_read(&mm->mmap_sem);					\
+								\
+	ret;							\
+})
+#endif
+
 /*
  * A callback you can register to apply pressure to ageable caches.
  *
-- 
cgit v1.2.3


From 8174c430e445a93016ef18f717fe570214fa38bf Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 25 Jul 2008 19:45:24 -0700
Subject: x86: lockless get_user_pages_fast()

Implement get_user_pages_fast without locking in the fastpath on x86.

Do an optimistic lockless pagetable walk, without taking mmap_sem or any
page table locks or even mmap_sem.  Page table existence is guaranteed by
turning interrupts off (combined with the fact that we're always looking
up the current mm, means we can do the lockless page table walk within the
constraints of the TLB shootdown design).  Basically we can do this
lockless pagetable walk in a similar manner to the way the CPU's pagetable
walker does not have to take any locks to find present ptes.

This patch (combined with the subsequent ones to convert direct IO to use
it) was found to give about 10% performance improvement on a 2 socket 8
core Intel Xeon system running an OLTP workload on DB2 v9.5

 "To test the effects of the patch, an OLTP workload was run on an IBM
  x3850 M2 server with 2 processors (quad-core Intel Xeon processors at
  2.93 GHz) using IBM DB2 v9.5 running Linux 2.6.24rc7 kernel.  Comparing
  runs with and without the patch resulted in an overall performance
  benefit of ~9.8%.  Correspondingly, oprofiles showed that samples from
  __up_read and __down_read routines that is seen during thread contention
  for system resources was reduced from 2.8% down to .05%.  Monitoring the
  /proc/vmstat output from the patched run showed that the counter for
  fast_gup contained a very high number while the fast_gup_slow value was
  zero."

(fast_gup is the old name for get_user_pages_fast, fast_gup_slow is a
counter we had for the number of times the slowpath was invoked).

The main reason for the improvement is that DB2 has multiple threads each
issuing direct-IO.  Direct-IO uses get_user_pages, and thus the threads
contend the mmap_sem cacheline, and can also contend on page table locks.

I would anticipate larger performance gains on larger systems, however I
think DB2 uses an adaptive mix of threads and processes, so it could be
that thread contention remains pretty constant as machine size increases.
In which case, we stuck with "only" a 10% gain.

The downside of using get_user_pages_fast is that if there is not a pte
with the correct permissions for the access, we end up falling back to
get_user_pages and so the get_user_pages_fast is a bit of extra work.
However this should not be the common case in most performance critical
code.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: Kconfig fix]
[akpm@linux-foundation.org: Makefile fix/cleanup]
[akpm@linux-foundation.org: warning fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig          |   1 +
 arch/x86/mm/Makefile      |   1 +
 arch/x86/mm/gup.c         | 258 ++++++++++++++++++++++++++++++++++++++++++++++
 include/asm-x86/uaccess.h |   1 +
 mm/Kconfig                |   3 +
 5 files changed, 264 insertions(+)
 create mode 100644 arch/x86/mm/gup.c

(limited to 'include')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6b2debfabddc..6bdde845818e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_IOREMAP_PROT
+	select HAVE_GET_USER_PAGES_FAST
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X
 	select HAVE_KRETPROBES
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7a..2977ea37791f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,7 @@
 obj-y	:=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o
 
+obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o
 obj-$(CONFIG_X86_32)		+= pgtable_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..6f733121f32e
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,258 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+	return *ptep;
+#else
+	/*
+	 * With get_user_pages_fast, we walk down the pagetables without taking
+	 * any locks.  For this we would like to load the pointers atoimcally,
+	 * but that is not possible (without expensive cmpxchg8b) on PAE.  What
+	 * we do have is the guarantee that a pte will only either go from not
+	 * present to present, or present to not present or both -- it will not
+	 * switch to a completely different present page without a TLB flush in
+	 * between; something that we are blocking by holding interrupts off.
+	 *
+	 * Setting ptes from not present to present goes:
+	 * ptep->pte_high = h;
+	 * smp_wmb();
+	 * ptep->pte_low = l;
+	 *
+	 * And present to not present goes:
+	 * ptep->pte_low = 0;
+	 * smp_wmb();
+	 * ptep->pte_high = 0;
+	 *
+	 * We must ensure here that the load of pte_low sees l iff pte_high
+	 * sees h. We load pte_high *after* loading pte_low, which ensures we
+	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
+	 * which ensures that we haven't picked up a changed pte high. We might
+	 * have got rubbish values from pte_low and pte_high, but we are
+	 * guaranteed that pte_low will not have the present bit set *unless*
+	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+	 * we're safe.
+	 *
+	 * gup_get_pte should not be used or copied outside gup.c without being
+	 * very careful -- it does not atomically load the pte or anything that
+	 * is likely to be useful for you.
+	 */
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t *ptep;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	ptep = pte_offset_map(&pmd, addr);
+	do {
+		pte_t pte = gup_get_pte(ptep);
+		struct page *page;
+
+		if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+			pte_unmap(ptep);
+			return 0;
+		}
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON(page != compound_head(page));
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long end = start + (nr_pages << PAGE_SHIFT);
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, nr_pages*PAGE_SIZE)))
+		goto slow_irqon;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h
index f6fa4d841bbc..5f702d1d5218 100644
--- a/include/asm-x86/uaccess.h
+++ b/include/asm-x86/uaccess.h
@@ -451,3 +451,4 @@ extern struct movsl_mask {
 #endif
 
 #endif
+
diff --git a/mm/Kconfig b/mm/Kconfig
index aa799007a11b..efee5d379df4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -77,6 +77,9 @@ config FLAT_NODE_MEM_MAP
 	def_bool y
 	depends on !SPARSEMEM
 
+config HAVE_GET_USER_PAGES_FAST
+	bool
+
 #
 # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
 # to represent different areas of memory.  This variable allows
-- 
cgit v1.2.3


From 47feff2c8eefe85099f87c43d3096855f0085ca0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 25 Jul 2008 19:45:29 -0700
Subject: radix-tree: add gang_lookup_slot, gang_lookup_slot_tag

Introduce gang_lookup_slot() and gang_lookup_slot_tag() functions, which
are used by lockless pagecache.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |  12 ++-
 lib/radix-tree.c           | 178 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 166 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index b8ce2b444bb5..a916c6660dfa 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -99,12 +99,15 @@ do {									\
  *
  * The notable exceptions to this rule are the following functions:
  * radix_tree_lookup
+ * radix_tree_lookup_slot
  * radix_tree_tag_get
  * radix_tree_gang_lookup
+ * radix_tree_gang_lookup_slot
  * radix_tree_gang_lookup_tag
+ * radix_tree_gang_lookup_tag_slot
  * radix_tree_tagged
  *
- * The first 4 functions are able to be called locklessly, using RCU. The
+ * The first 7 functions are able to be called locklessly, using RCU. The
  * caller must ensure calls to these functions are made within rcu_read_lock()
  * regions. Other readers (lock-free or otherwise) and modifications may be
  * running concurrently.
@@ -159,6 +162,9 @@ void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
+unsigned int
+radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+			unsigned long first_index, unsigned int max_items);
 unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
@@ -173,6 +179,10 @@ unsigned int
 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		unsigned long first_index, unsigned int max_items,
 		unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag);
 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
 
 static inline void radix_tree_preload_end(void)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 56ec21a7f73d..9c4f1ffa2864 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -359,18 +359,17 @@ EXPORT_SYMBOL(radix_tree_insert);
  *	Returns:  the slot corresponding to the position @index in the
  *	radix tree @root. This is useful for update-if-exists operations.
  *
- *	This function cannot be called under rcu_read_lock, it must be
- *	excluded from writers, as must the returned slot for subsequent
- *	use by radix_tree_deref_slot() and radix_tree_replace slot.
- *	Caller must hold tree write locked across slot lookup and
- *	replace.
+ *	This function can be called under rcu_read_lock iff the slot is not
+ *	modified by radix_tree_replace_slot, otherwise it must be called
+ *	exclusive from other writers. Any dereference of the slot must be done
+ *	using radix_tree_deref_slot.
  */
 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
 {
 	unsigned int height, shift;
 	struct radix_tree_node *node, **slot;
 
-	node = root->rnode;
+	node = rcu_dereference(root->rnode);
 	if (node == NULL)
 		return NULL;
 
@@ -390,7 +389,7 @@ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
 	do {
 		slot = (struct radix_tree_node **)
 			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
-		node = *slot;
+		node = rcu_dereference(*slot);
 		if (node == NULL)
 			return NULL;
 
@@ -667,7 +666,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 EXPORT_SYMBOL(radix_tree_next_hole);
 
 static unsigned int
-__lookup(struct radix_tree_node *slot, void **results, unsigned long index,
+__lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)
 {
 	unsigned int nr_found = 0;
@@ -701,11 +700,9 @@ __lookup(struct radix_tree_node *slot, void **results, unsigned long index,
 
 	/* Bottom level: grab some items */
 	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
-		struct radix_tree_node *node;
 		index++;
-		node = slot->slots[i];
-		if (node) {
-			results[nr_found++] = rcu_dereference(node);
+		if (slot->slots[i]) {
+			results[nr_found++] = &(slot->slots[i]);
 			if (nr_found == max_items)
 				goto out;
 		}
@@ -759,13 +756,22 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 
 	ret = 0;
 	while (ret < max_items) {
-		unsigned int nr_found;
+		unsigned int nr_found, slots_found, i;
 		unsigned long next_index;	/* Index of next search */
 
 		if (cur_index > max_index)
 			break;
-		nr_found = __lookup(node, results + ret, cur_index,
+		slots_found = __lookup(node, (void ***)results + ret, cur_index,
 					max_items - ret, &next_index);
+		nr_found = 0;
+		for (i = 0; i < slots_found; i++) {
+			struct radix_tree_node *slot;
+			slot = *(((void ***)results)[ret + i]);
+			if (!slot)
+				continue;
+			results[ret + nr_found] = rcu_dereference(slot);
+			nr_found++;
+		}
 		ret += nr_found;
 		if (next_index == 0)
 			break;
@@ -776,12 +782,71 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup);
 
+/**
+ *	radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *
+ *	Performs an index-ascending scan of the tree for present items.  Places
+ *	their slots at *@results and returns the number of items which were
+ *	placed at *@results.
+ *
+ *	The implementation is naive.
+ *
+ *	Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
+ *	be dereferenced with radix_tree_deref_slot, and if using only RCU
+ *	protection, radix_tree_deref_slot may fail requiring a retry.
+ */
+unsigned int
+radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+			unsigned long first_index, unsigned int max_items)
+{
+	unsigned long max_index;
+	struct radix_tree_node *node;
+	unsigned long cur_index = first_index;
+	unsigned int ret;
+
+	node = rcu_dereference(root->rnode);
+	if (!node)
+		return 0;
+
+	if (!radix_tree_is_indirect_ptr(node)) {
+		if (first_index > 0)
+			return 0;
+		results[0] = (void **)&root->rnode;
+		return 1;
+	}
+	node = radix_tree_indirect_to_ptr(node);
+
+	max_index = radix_tree_maxindex(node->height);
+
+	ret = 0;
+	while (ret < max_items) {
+		unsigned int slots_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		slots_found = __lookup(node, results + ret, cur_index,
+					max_items - ret, &next_index);
+		ret += slots_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
+
 /*
  * FIXME: the two tag_get()s here should use find_next_bit() instead of
  * open-coding the search.
  */
 static unsigned int
-__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
+__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index, unsigned int tag)
 {
 	unsigned int nr_found = 0;
@@ -811,11 +876,9 @@ __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
 			unsigned long j = index & RADIX_TREE_MAP_MASK;
 
 			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-				struct radix_tree_node *node;
 				index++;
 				if (!tag_get(slot, tag, j))
 					continue;
-				node = slot->slots[j];
 				/*
 				 * Even though the tag was found set, we need to
 				 * recheck that we have a non-NULL node, because
@@ -826,9 +889,8 @@ __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
 				 * lookup ->slots[x] without a lock (ie. can't
 				 * rely on its value remaining the same).
 				 */
-				if (node) {
-					node = rcu_dereference(node);
-					results[nr_found++] = node;
+				if (slot->slots[j]) {
+					results[nr_found++] = &(slot->slots[j]);
 					if (nr_found == max_items)
 						goto out;
 				}
@@ -887,13 +949,22 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 
 	ret = 0;
 	while (ret < max_items) {
-		unsigned int nr_found;
+		unsigned int nr_found, slots_found, i;
 		unsigned long next_index;	/* Index of next search */
 
 		if (cur_index > max_index)
 			break;
-		nr_found = __lookup_tag(node, results + ret, cur_index,
-					max_items - ret, &next_index, tag);
+		slots_found = __lookup_tag(node, (void ***)results + ret,
+				cur_index, max_items - ret, &next_index, tag);
+		nr_found = 0;
+		for (i = 0; i < slots_found; i++) {
+			struct radix_tree_node *slot;
+			slot = *(((void ***)results)[ret + i]);
+			if (!slot)
+				continue;
+			results[ret + nr_found] = rcu_dereference(slot);
+			nr_found++;
+		}
 		ret += nr_found;
 		if (next_index == 0)
 			break;
@@ -904,6 +975,67 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
 
+/**
+ *	radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
+ *					  radix tree based on a tag
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ *	Performs an index-ascending scan of the tree for present items which
+ *	have the tag indexed by @tag set.  Places the slots at *@results and
+ *	returns the number of slots which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag)
+{
+	struct radix_tree_node *node;
+	unsigned long max_index;
+	unsigned long cur_index = first_index;
+	unsigned int ret;
+
+	/* check the root's tag bit */
+	if (!root_tag_get(root, tag))
+		return 0;
+
+	node = rcu_dereference(root->rnode);
+	if (!node)
+		return 0;
+
+	if (!radix_tree_is_indirect_ptr(node)) {
+		if (first_index > 0)
+			return 0;
+		results[0] = (void **)&root->rnode;
+		return 1;
+	}
+	node = radix_tree_indirect_to_ptr(node);
+
+	max_index = radix_tree_maxindex(node->height);
+
+	ret = 0;
+	while (ret < max_items) {
+		unsigned int slots_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		slots_found = __lookup_tag(node, results + ret,
+				cur_index, max_items - ret, &next_index, tag);
+		ret += slots_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
+
+
 /**
  *	radix_tree_shrink    -    shrink height of a radix tree to minimal
  *	@root		radix tree root
-- 
cgit v1.2.3


From e286781d5f2e9c846e012a39653a166e9d31777d Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 25 Jul 2008 19:45:30 -0700
Subject: mm: speculative page references

If we can be sure that elevating the page_count on a pagecache page will
pin it, we can speculatively run this operation, and subsequently check to
see if we hit the right page rather than relying on holding a lock or
otherwise pinning a reference to the page.

This can be done if get_page/put_page behaves consistently throughout the
whole tree (ie.  if we "get" the page after it has been used for something
else, we must be able to free it with a put_page).

Actually, there is a period where the count behaves differently: when the
page is free or if it is a constituent page of a compound page.  We need
an atomic_inc_not_zero operation to ensure we don't try to grab the page
in either case.

This patch introduces the core locking protocol to the pagecache (ie.
adds page_cache_get_speculative, and tweaks some update-side code to make
it work).

Thanks to Hugh for pointing out an improvement to the algorithm setting
page_count to zero when we have control of all references, in order to
hold off speculative getters.

[kamezawa.hiroyu@jp.fujitsu.com: fix migration_entry_wait()]
[hugh@veritas.com: fix add_to_page_cache]
[akpm@linux-foundation.org: repair a comment]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/cassini.c   |  12 ++++++
 include/linux/pagemap.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++-
 mm/filemap.c            |  32 ++++++++------
 mm/migrate.c            |  20 ++++++++-
 mm/shmem.c              |   6 +--
 mm/swap_state.c         |  17 +++++---
 mm/vmscan.c             |  74 +++++++++++++++++++++++---------
 7 files changed, 227 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 83768df27806..f1936d51b458 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -576,6 +576,18 @@ static void cas_spare_recover(struct cas *cp, const gfp_t flags)
 	list_for_each_safe(elem, tmp, &list) {
 		cas_page_t *page = list_entry(elem, cas_page_t, list);
 
+		/*
+		 * With the lockless pagecache, cassini buffering scheme gets
+		 * slightly less accurate: we might find that a page has an
+		 * elevated reference count here, due to a speculative ref,
+		 * and skip it as in-use. Ideally we would be able to reclaim
+		 * it. However this would be such a rare case, it doesn't
+		 * matter too much as we should pick it up the next time round.
+		 *
+		 * Importantly, if we find that the page has a refcount of 1
+		 * here (our refcount), then we know it is definitely not inuse
+		 * so we can reuse it.
+		 */
 		if (page_count(page->buffer) > 1)
 			continue;
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ee1ec2c7723c..a81d81890422 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -12,6 +12,7 @@
 #include <asm/uaccess.h>
 #include <linux/gfp.h>
 #include <linux/bitops.h>
+#include <linux/hardirq.h> /* for in_interrupt() */
 
 /*
  * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
@@ -62,6 +63,98 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+/*
+ * speculatively take a reference to a page.
+ * If the page is free (_count == 0), then _count is untouched, and 0
+ * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ *
+ * This function must be called inside the same rcu_read_lock() section as has
+ * been used to lookup the page in the pagecache radix-tree (or page table):
+ * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ *
+ * Unless an RCU grace period has passed, the count of all pages coming out
+ * of the allocator must be considered unstable. page_count may return higher
+ * than expected, and put_page must be able to do the right thing when the
+ * page has been finished with, no matter what it is subsequently allocated
+ * for (because put_page is what is used here to drop an invalid speculative
+ * reference).
+ *
+ * This is the interesting part of the lockless pagecache (and lockless
+ * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page)
+ * has the following pattern:
+ * 1. find page in radix tree
+ * 2. conditionally increment refcount
+ * 3. check the page is still in pagecache (if no, goto 1)
+ *
+ * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * following (with tree_lock held for write):
+ * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
+ * B. remove page from pagecache
+ * C. free the page
+ *
+ * There are 2 critical interleavings that matter:
+ * - 2 runs before A: in this case, A sees elevated refcount and bails out
+ * - A runs before 2: in this case, 2 sees zero refcount and retries;
+ *   subsequently, B will complete and 1 will find no page, causing the
+ *   lookup to return NULL.
+ *
+ * It is possible that between 1 and 2, the page is removed then the exact same
+ * page is inserted into the same position in pagecache. That's OK: the
+ * old find_get_page using tree_lock could equally have run before or after
+ * such a re-insertion, depending on order that locks are granted.
+ *
+ * Lookups racing against pagecache insertion isn't a big problem: either 1
+ * will find the page or it will not. Likewise, the old find_get_page could run
+ * either before the insertion or afterwards, depending on timing.
+ */
+static inline int page_cache_get_speculative(struct page *page)
+{
+	VM_BUG_ON(in_interrupt());
+
+#if !defined(CONFIG_SMP) && defined(CONFIG_CLASSIC_RCU)
+# ifdef CONFIG_PREEMPT
+	VM_BUG_ON(!in_atomic());
+# endif
+	/*
+	 * Preempt must be disabled here - we rely on rcu_read_lock doing
+	 * this for us.
+	 *
+	 * Pagecache won't be truncated from interrupt context, so if we have
+	 * found a page in the radix tree here, we have pinned its refcount by
+	 * disabling preempt, and hence no need for the "speculative get" that
+	 * SMP requires.
+	 */
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_inc(&page->_count);
+
+#else
+	if (unlikely(!get_page_unless_zero(page))) {
+		/*
+		 * Either the page has been freed, or will be freed.
+		 * In either case, retry here and the caller should
+		 * do the right thing (see comments above).
+		 */
+		return 0;
+	}
+#endif
+	VM_BUG_ON(PageTail(page));
+
+	return 1;
+}
+
+static inline int page_freeze_refs(struct page *page, int count)
+{
+	return likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+}
+
+static inline void page_unfreeze_refs(struct page *page, int count)
+{
+	VM_BUG_ON(page_count(page) != 0);
+	VM_BUG_ON(count == 0);
+
+	atomic_set(&page->_count, count);
+}
+
 #ifdef CONFIG_NUMA
 extern struct page *__page_cache_alloc(gfp_t gfp);
 #else
@@ -133,13 +226,29 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 	return read_cache_page(mapping, index, filler, data);
 }
 
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void remove_from_page_cache(struct page *page);
 extern void __remove_from_page_cache(struct page *page);
 
+/*
+ * Like add_to_page_cache_locked, but used to add newly allocated pages:
+ * the page is new, so we can just run SetPageLocked() against it.
+ */
+static inline int add_to_page_cache(struct page *page,
+		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
+{
+	int error;
+
+	SetPageLocked(page);
+	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
+	if (unlikely(error))
+		ClearPageLocked(page);
+	return error;
+}
+
 /*
  * Return byte-offset into filesystem object for page.
  */
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d3ec1ffc66e..4e182a9a14c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -442,39 +442,43 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 }
 
 /**
- * add_to_page_cache - add newly allocated pagecache pages
+ * add_to_page_cache_locked - add a locked page to the pagecache
  * @page:	page to add
  * @mapping:	the page's address_space
  * @offset:	page index
  * @gfp_mask:	page allocation mode
  *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
+ * This function is used to add a page to the pagecache. It must be locked.
  * This function does not add the page to the LRU.  The caller must do that.
  */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
+int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = mem_cgroup_cache_charge(page, current->mm,
+	int error;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & ~__GFP_HIGHMEM);
 	if (error)
 		goto out;
 
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
+		page_cache_get(page);
+		page->mapping = mapping;
+		page->index = offset;
+
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
+		if (likely(!error)) {
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		} else
+		} else {
+			page->mapping = NULL;
 			mem_cgroup_uncharge_cache_page(page);
+			page_cache_release(page);
+		}
 
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
@@ -483,7 +487,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 out:
 	return error;
 }
-EXPORT_SYMBOL(add_to_page_cache);
+EXPORT_SYMBOL(add_to_page_cache_locked);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
diff --git a/mm/migrate.c b/mm/migrate.c
index d8c65a65c61d..3ca6392e82cc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -285,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 
 	page = migration_entry_to_page(entry);
 
-	get_page(page);
+	/*
+	 * Once radix-tree replacement of page migration started, page_count
+	 * *must* be zero. And, we don't want to call wait_on_page_locked()
+	 * against a page without get_page().
+	 * So, we use get_page_unless_zero(), here. Even failed, page fault
+	 * will occur again.
+	 */
+	if (!get_page_unless_zero(page))
+		goto out;
 	pte_unmap_unlock(ptep, ptl);
 	wait_on_page_locked(page);
 	put_page(page);
@@ -305,6 +313,7 @@ out:
 static int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page)
 {
+	int expected_count;
 	void **pslot;
 
 	if (!mapping) {
@@ -319,12 +328,18 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
  					page_index(page));
 
-	if (page_count(page) != 2 + !!PagePrivate(page) ||
+	expected_count = 2 + !!PagePrivate(page);
+	if (page_count(page) != expected_count ||
 			(struct page *)radix_tree_deref_slot(pslot) != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
 
+	if (!page_freeze_refs(page, expected_count)) {
+		write_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
 	/*
 	 * Now we know that no one else is looking at the page.
 	 */
@@ -338,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 
 	radix_tree_replace_slot(pslot, newpage);
 
+	page_unfreeze_refs(page, expected_count);
 	/*
 	 * Drop cache reference from old page.
 	 * We know this isn't the last reference.
diff --git a/mm/shmem.c b/mm/shmem.c
index f92fea94d037..1089092aecaf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -936,7 +936,7 @@ found:
 	spin_lock(&info->lock);
 	ptr = shmem_swp_entry(info, idx, NULL);
 	if (ptr && ptr->val == entry.val) {
-		error = add_to_page_cache(page, inode->i_mapping,
+		error = add_to_page_cache_locked(page, inode->i_mapping,
 						idx, GFP_NOWAIT);
 		/* does mem_cgroup_uncharge_cache_page on error */
 	} else	/* we must compensate for our precharge above */
@@ -1301,8 +1301,8 @@ repeat:
 			SetPageUptodate(filepage);
 			set_page_dirty(filepage);
 			swap_free(swap);
-		} else if (!(error = add_to_page_cache(
-				swappage, mapping, idx, GFP_NOWAIT))) {
+		} else if (!(error = add_to_page_cache_locked(swappage, mapping,
+					idx, GFP_NOWAIT))) {
 			info->flags |= SHMEM_PAGEIN;
 			shmem_swp_set(info, entry, 0);
 			shmem_swp_unmap(entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..3e3381d6c7ee 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -64,7 +64,7 @@ void show_swap_cache_info(void)
 }
 
 /*
- * add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
 int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +76,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
+		page_cache_get(page);
+		SetPageSwapCache(page);
+		set_page_private(page, entry.val);
+
 		write_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageSwapCache(page);
-			set_page_private(page, entry.val);
+		if (likely(!error)) {
 			total_swapcache_pages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			INC_CACHE_INFO(add_total);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
+
+		if (unlikely(error)) {
+			set_page_private(page, 0UL);
+			ClearPageSwapCache(page);
+			page_cache_release(page);
+		}
 	}
 	return error;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26672c6cd3ce..0075eac1cd04 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -391,12 +391,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 }
 
 /*
- * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
- * someone else has a ref on the page, abort and return 0.  If it was
- * successfully detached, return 1.  Assumes the caller has a single ref on
- * this page.
+ * Same as remove_mapping, but if the page is removed from the mapping, it
+ * gets returned with a refcount of 0.
  */
-int remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page)
 {
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
@@ -427,24 +425,24 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 	 * Note that if SetPageDirty is always performed via set_page_dirty,
 	 * and thus under tree_lock, then this ordering is not required.
 	 */
-	if (unlikely(page_count(page) != 2))
+	if (!page_freeze_refs(page, 2))
 		goto cannot_free;
-	smp_rmb();
-	if (unlikely(PageDirty(page)))
+	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+	if (unlikely(PageDirty(page))) {
+		page_unfreeze_refs(page, 2);
 		goto cannot_free;
+	}
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
 		write_unlock_irq(&mapping->tree_lock);
 		swap_free(swap);
-		__put_page(page);	/* The pagecache ref */
-		return 1;
+	} else {
+		__remove_from_page_cache(page);
+		write_unlock_irq(&mapping->tree_lock);
 	}
 
-	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
-	__put_page(page);
 	return 1;
 
 cannot_free:
@@ -452,6 +450,26 @@ cannot_free:
 	return 0;
 }
 
+/*
+ * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
+ * someone else has a ref on the page, abort and return 0.  If it was
+ * successfully detached, return 1.  Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+	if (__remove_mapping(mapping, page)) {
+		/*
+		 * Unfreezing the refcount with 1 rather than 2 effectively
+		 * drops the pagecache ref for us without requiring another
+		 * atomic operation.
+		 */
+		page_unfreeze_refs(page, 1);
+		return 1;
+	}
+	return 0;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -598,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PagePrivate(page)) {
 			if (!try_to_release_page(page, sc->gfp_mask))
 				goto activate_locked;
-			if (!mapping && page_count(page) == 1)
-				goto free_it;
+			if (!mapping && page_count(page) == 1) {
+				unlock_page(page);
+				if (put_page_testzero(page))
+					goto free_it;
+				else {
+					/*
+					 * rare race with speculative reference.
+					 * the speculative reference will free
+					 * this page shortly, so we may
+					 * increment nr_reclaimed here (and
+					 * leave it off the LRU).
+					 */
+					nr_reclaimed++;
+					continue;
+				}
+			}
 		}
 
-		if (!mapping || !remove_mapping(mapping, page))
+		if (!mapping || !__remove_mapping(mapping, page))
 			goto keep_locked;
 
-free_it:
 		unlock_page(page);
+free_it:
 		nr_reclaimed++;
-		if (!pagevec_add(&freed_pvec, page))
-			__pagevec_release_nonlru(&freed_pvec);
+		if (!pagevec_add(&freed_pvec, page)) {
+			__pagevec_free(&freed_pvec);
+			pagevec_reinit(&freed_pvec);
+		}
 		continue;
 
 activate_locked:
@@ -623,7 +657,7 @@ keep:
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
-		__pagevec_release_nonlru(&freed_pvec);
+		__pagevec_free(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
-- 
cgit v1.2.3


From 19fd6231279be3c3bdd02ed99f9b0eb195978064 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 25 Jul 2008 19:45:32 -0700
Subject: mm: spinlock tree_lock

mapping->tree_lock has no read lockers.  convert the lock from an rwlock
to a spinlock.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                     |  4 ++--
 fs/inode.c                      |  2 +-
 include/asm-arm/cacheflush.h    |  4 ++--
 include/asm-parisc/cacheflush.h |  4 ++--
 include/linux/fs.h              |  2 +-
 mm/filemap.c                    | 10 +++++-----
 mm/migrate.c                    | 11 +++++------
 mm/page-writeback.c             | 12 ++++++------
 mm/swap_state.c                 | 10 +++++-----
 mm/swapfile.c                   |  4 ++--
 mm/truncate.c                   |  6 +++---
 mm/vmscan.c                     |  8 ++++----
 12 files changed, 38 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index d48caee12e2a..109b261192d9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -706,7 +706,7 @@ static int __set_page_dirty(struct page *page,
 	if (TestSetPageDirty(page))
 		return 0;
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 
@@ -719,7 +719,7 @@ static int __set_page_dirty(struct page *page,
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	return 1;
diff --git a/fs/inode.c b/fs/inode.c
index c36d9480335c..35b6414522ea 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -209,7 +209,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	rwlock_init(&inode->i_data.tree_lock);
+	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
diff --git a/include/asm-arm/cacheflush.h b/include/asm-arm/cacheflush.h
index 70b0fe724b62..03cf1ee977b7 100644
--- a/include/asm-arm/cacheflush.h
+++ b/include/asm-arm/cacheflush.h
@@ -424,9 +424,9 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
 }
 
 #define flush_dcache_mmap_lock(mapping) \
-	write_lock_irq(&(mapping)->tree_lock)
+	spin_lock_irq(&(mapping)->tree_lock)
 #define flush_dcache_mmap_unlock(mapping) \
-	write_unlock_irq(&(mapping)->tree_lock)
+	spin_unlock_irq(&(mapping)->tree_lock)
 
 #define flush_icache_user_range(vma,page,addr,len) \
 	flush_dcache_page(page)
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index 2f1e1b05440a..b7ca6dc7fddc 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -45,9 +45,9 @@ void flush_cache_mm(struct mm_struct *mm);
 extern void flush_dcache_page(struct page *page);
 
 #define flush_dcache_mmap_lock(mapping) \
-	write_lock_irq(&(mapping)->tree_lock)
+	spin_lock_irq(&(mapping)->tree_lock)
 #define flush_dcache_mmap_unlock(mapping) \
-	write_unlock_irq(&(mapping)->tree_lock)
+	spin_unlock_irq(&(mapping)->tree_lock)
 
 #define flush_icache_page(vma,page)	do { 		\
 	flush_kernel_dcache_page(page);			\
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 49d8eb7a71be..53d2edb709b3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -499,7 +499,7 @@ struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
-	rwlock_t		tree_lock;	/* and rwlock protecting it */
+	spinlock_t		tree_lock;	/* and lock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
diff --git a/mm/filemap.c b/mm/filemap.c
index feb8448d8618..2ed8b0389c51 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -109,7 +109,7 @@
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
@@ -141,9 +141,9 @@ void remove_from_page_cache(struct page *page)
 
 	BUG_ON(!PageLocked(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 static int sync_page(void *word)
@@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 		page->mapping = mapping;
 		page->index = offset;
 
-		write_lock_irq(&mapping->tree_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (likely(!error)) {
 			mapping->nrpages++;
@@ -480,7 +480,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 			page_cache_release(page);
 		}
 
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
 		mem_cgroup_uncharge_cache_page(page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 3ca6392e82cc..153572fb60b8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -323,7 +323,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 		return 0;
 	}
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
  					page_index(page));
@@ -331,12 +331,12 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	expected_count = 2 + !!PagePrivate(page);
 	if (page_count(page) != expected_count ||
 			(struct page *)radix_tree_deref_slot(pslot) != page) {
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
 
 	if (!page_freeze_refs(page, expected_count)) {
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
 
@@ -373,10 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
 
-	write_unlock_irq(&mapping->tree_lock);
-	if (!PageSwapCache(newpage)) {
+	spin_unlock_irq(&mapping->tree_lock);
+	if (!PageSwapCache(newpage))
 		mem_cgroup_uncharge_cache_page(page);
-	}
 
 	return 0;
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 		if (!mapping)
 			return 1;
 
-		write_lock_irq(&mapping->tree_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		mapping2 = page_mapping(page);
 		if (mapping2) { /* Race with truncate? */
 			BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
 		if (ret) {
 			radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
 				__bdi_writeout_inc(bdi);
 			}
 		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
 		if (!ret) {
 			radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
 	}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3e3381d6c7ee..2c217e33d497 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
+	.tree_lock	= __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
@@ -80,7 +80,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 		SetPageSwapCache(page);
 		set_page_private(page, entry.val);
 
-		write_lock_irq(&swapper_space.tree_lock);
+		spin_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
 		if (likely(!error)) {
@@ -88,7 +88,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			INC_CACHE_INFO(add_total);
 		}
-		write_unlock_irq(&swapper_space.tree_lock);
+		spin_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
 
 		if (unlikely(error)) {
@@ -182,9 +182,9 @@ void delete_from_swap_cache(struct page *page)
 
 	entry.val = page_private(page);
 
-	write_lock_irq(&swapper_space.tree_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
-	write_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2f33edb8bee9..af283933c14e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -369,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
-		write_lock_irq(&swapper_space.tree_lock);
+		spin_lock_irq(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		write_unlock_irq(&swapper_space.tree_lock);
+		spin_unlock_irq(&swapper_space.tree_lock);
 	}
 	spin_unlock(&swap_lock);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..e68443d74567 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -349,18 +349,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page))
 		goto failed;
 
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 failed:
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return 0;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0075eac1cd04..8f71761bc4b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -399,7 +399,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -436,17 +436,17 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		swap_free(swap);
 	} else {
 		__remove_from_page_cache(page);
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 	}
 
 	return 1;
 
 cannot_free:
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 25 Jul 2008 19:45:34 -0700
Subject: SL*B: drop kmem cache argument from constructor

Kmem cache passed to constructor is only needed for constructors that are
themselves multiplexeres.  Nobody uses this "feature", nor does anybody uses
passed kmem cache in non-trivial way, so pass only pointer to object.

Non-trivial places are:
	arch/powerpc/mm/init_64.c
	arch/powerpc/mm/hugetlbpage.c

This is flag day, yes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jon Tollefson <kniht@linux.vnet.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Matt Mackall <mpm@selenic.com>
[akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c]
[akpm@linux-foundation.org: fix mm/slab.c]
[akpm@linux-foundation.org: fix ubifs]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/plat-s3c24xx/dma.c               |  2 +-
 arch/powerpc/kernel/rtas_flash.c          |  2 +-
 arch/powerpc/mm/hugetlbpage.c             |  9 ++-------
 arch/powerpc/mm/init_64.c                 | 24 +++++++++---------------
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 arch/sh/mm/pmb.c                          |  2 +-
 arch/xtensa/mm/init.c                     |  2 +-
 drivers/usb/mon/mon_text.c                |  4 ++--
 fs/adfs/super.c                           |  2 +-
 fs/affs/super.c                           |  2 +-
 fs/afs/super.c                            |  4 ++--
 fs/befs/linuxvfs.c                        |  2 +-
 fs/bfs/inode.c                            |  2 +-
 fs/block_dev.c                            |  2 +-
 fs/buffer.c                               |  2 +-
 fs/cifs/cifsfs.c                          |  2 +-
 fs/coda/inode.c                           |  2 +-
 fs/ecryptfs/main.c                        |  4 ++--
 fs/efs/super.c                            |  2 +-
 fs/ext2/super.c                           |  2 +-
 fs/ext3/super.c                           |  2 +-
 fs/ext4/super.c                           |  2 +-
 fs/fat/cache.c                            |  2 +-
 fs/fat/inode.c                            |  2 +-
 fs/fuse/inode.c                           |  2 +-
 fs/gfs2/main.c                            |  4 ++--
 fs/hfs/super.c                            |  2 +-
 fs/hfsplus/super.c                        |  2 +-
 fs/hpfs/super.c                           |  2 +-
 fs/hugetlbfs/inode.c                      |  2 +-
 fs/inode.c                                |  2 +-
 fs/isofs/inode.c                          |  2 +-
 fs/jffs2/super.c                          |  2 +-
 fs/jfs/jfs_metapage.c                     |  2 +-
 fs/jfs/super.c                            |  2 +-
 fs/locks.c                                |  2 +-
 fs/minix/inode.c                          |  2 +-
 fs/ncpfs/inode.c                          |  2 +-
 fs/nfs/inode.c                            |  2 +-
 fs/ntfs/super.c                           |  2 +-
 fs/ocfs2/dlm/dlmfs.c                      |  3 +--
 fs/ocfs2/super.c                          |  2 +-
 fs/openpromfs/inode.c                     |  2 +-
 fs/proc/inode.c                           |  2 +-
 fs/qnx4/inode.c                           |  2 +-
 fs/reiserfs/super.c                       |  2 +-
 fs/romfs/inode.c                          |  2 +-
 fs/smbfs/inode.c                          |  2 +-
 fs/sysv/inode.c                           |  2 +-
 fs/ubifs/super.c                          |  2 +-
 fs/udf/super.c                            |  2 +-
 fs/ufs/super.c                            |  2 +-
 fs/xfs/linux-2.6/kmem.h                   |  2 +-
 fs/xfs/linux-2.6/xfs_super.c              |  1 -
 include/linux/slab.h                      |  2 +-
 include/linux/slub_def.h                  |  2 +-
 ipc/mqueue.c                              |  2 +-
 kernel/fork.c                             |  2 +-
 lib/idr.c                                 |  2 +-
 lib/radix-tree.c                          |  2 +-
 mm/rmap.c                                 |  2 +-
 mm/shmem.c                                |  2 +-
 mm/slab.c                                 | 11 +++++------
 mm/slob.c                                 |  7 +++----
 mm/slub.c                                 | 13 ++++++-------
 net/socket.c                              |  2 +-
 net/sunrpc/rpc_pipe.c                     |  2 +-
 67 files changed, 90 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c
index 60f162dc4fad..8c5e656d5d8c 100644
--- a/arch/arm/plat-s3c24xx/dma.c
+++ b/arch/arm/plat-s3c24xx/dma.c
@@ -1304,7 +1304,7 @@ struct sysdev_class dma_sysclass = {
 
 /* kmem cache implementation */
 
-static void s3c2410_dma_cache_ctor(struct kmem_cache *c, void *p)
+static void s3c2410_dma_cache_ctor(void *p)
 {
 	memset(p, 0, sizeof(struct s3c2410_dma_buf));
 }
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index 09ded5c424a9..149cb112cd1a 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -286,7 +286,7 @@ static ssize_t rtas_flash_read(struct file *file, char __user *buf,
 }
 
 /* constructor for flash_block_cache */
-void rtas_block_ctor(struct kmem_cache *cache, void *ptr)
+void rtas_block_ctor(void *ptr)
 {
 	memset(ptr, 0, RTAS_BLK_SIZE);
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fb42c4dd3217..ed0aab0208a6 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -113,7 +113,7 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned int psize)
 {
-	pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize),
+	pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
 				      GFP_KERNEL|__GFP_REPEAT);
 
 	if (! new)
@@ -730,11 +730,6 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
-{
-	memset(addr, 0, kmem_cache_size(cache));
-}
-
 static int __init hugetlbpage_init(void)
 {
 	unsigned int psize;
@@ -756,7 +751,7 @@ static int __init hugetlbpage_init(void)
 						HUGEPTE_TABLE_SIZE(psize),
 						HUGEPTE_TABLE_SIZE(psize),
 						0,
-						zero_ctor);
+						NULL);
 			if (!huge_pgtable_cache(psize))
 				panic("hugetlbpage_init(): could not create %s"\
 				      "\n", HUGEPTE_CACHE_NAME(psize));
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a41bc5aa2043..4f7df85129d8 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -136,9 +136,14 @@ static int __init setup_kcore(void)
 module_init(setup_kcore);
 #endif
 
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+static void pgd_ctor(void *addr)
 {
-	memset(addr, 0, kmem_cache_size(cache));
+	memset(addr, 0, PGD_TABLE_SIZE);
+}
+
+static void pmd_ctor(void *addr)
+{
+	memset(addr, 0, PMD_TABLE_SIZE);
 }
 
 static const unsigned int pgtable_cache_size[2] = {
@@ -163,19 +168,8 @@ struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
 
 void pgtable_cache_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
-		int size = pgtable_cache_size[i];
-		const char *name = pgtable_cache_name[i];
-
-		pr_debug("Allocating page table cache %s (#%d) "
-			"for size: %08x...\n", name, i, size);
-		pgtable_cache[i] = kmem_cache_create(name,
-						     size, size,
-						     SLAB_PANIC,
-						     zero_ctor);
-	}
+	pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
+	pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7123472801d9..690ca7b0dcf6 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -78,7 +78,7 @@ spufs_destroy_inode(struct inode *inode)
 }
 
 static void
-spufs_init_once(struct kmem_cache *cachep, void *p)
+spufs_init_once(void *p)
 {
 	struct spufs_inode_info *ei = p;
 
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index 0b0ec6e04753..46911bcbf17b 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -293,7 +293,7 @@ void pmb_unmap(unsigned long addr)
 	} while (pmbe);
 }
 
-static void pmb_cache_ctor(struct kmem_cache *cachep, void *pmb)
+static void pmb_cache_ctor(void *pmb)
 {
 	struct pmb_entry *pmbe = pmb;
 
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 81d0560eaea2..ee261005b363 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -309,7 +309,7 @@ void show_mem(void)
 
 struct kmem_cache *pgtable_cache __read_mostly;
 
-static void pgd_ctor(struct kmem_cache *cache, void* addr)
+static void pgd_ctor(void* addr)
 {
 	pte_t* ptep = (pte_t*)addr;
 	int i;
diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c
index 5e3e4e9b6c77..1f715436d6d3 100644
--- a/drivers/usb/mon/mon_text.c
+++ b/drivers/usb/mon/mon_text.c
@@ -87,7 +87,7 @@ struct mon_reader_text {
 
 static struct dentry *mon_dir;		/* Usually /sys/kernel/debug/usbmon */
 
-static void mon_text_ctor(struct kmem_cache *, void *);
+static void mon_text_ctor(void *);
 
 struct mon_text_ptr {
 	int cnt, limit;
@@ -720,7 +720,7 @@ void mon_text_del(struct mon_bus *mbus)
 /*
  * Slab interface: constructor.
  */
-static void mon_text_ctor(struct kmem_cache *slab, void *mem)
+static void mon_text_ctor(void *mem)
 {
 	/*
 	 * Nothing to initialize. No, really!
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9e421eeb672b..26f3b43726bb 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4e0309566406..3a89094f93d0 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode)
 	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7e3faeef6818..250d8c4d66e4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -27,7 +27,7 @@
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
-static void afs_i_init_once(struct kmem_cache *cachep, void *foo);
+static void afs_i_init_once(void *foo);
 static int afs_get_sb(struct file_system_type *fs_type,
 		      int flags, const char *dev_name,
 		      void *data, struct vfsmount *mnt);
@@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb)
 /*
  * initialise an inode cache slab element prior to any use
  */
-static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode)
+static void afs_i_init_once(void *_vnode)
 {
 	struct afs_vnode *vnode = _vnode;
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e8717de3bab3..02c6e62b72f8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -289,7 +289,7 @@ befs_destroy_inode(struct inode *inode)
         kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
         struct befs_inode_info *bi = (struct befs_inode_info *) foo;
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 053e690ec9ed..0ed57b5ee012 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -264,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct bfs_inode_info *bi = foo;
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 10d8a0aa871a..dcf37cada369 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode)
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
 	struct block_device *bdev = &ei->bdev;
diff --git a/fs/buffer.c b/fs/buffer.c
index 109b261192d9..5fd497cdd6f3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3272,7 +3272,7 @@ int bh_submit_read(struct buffer_head *bh)
 EXPORT_SYMBOL(bh_submit_read);
 
 static void
-init_buffer_head(struct kmem_cache *cachep, void *data)
+init_buffer_head(void *data)
 {
 	struct buffer_head *bh = data;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 22857c639df5..fe5f6809cba6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -766,7 +766,7 @@ const struct file_operations cifs_dir_ops = {
 };
 
 static void
-cifs_init_once(struct kmem_cache *cachep, void *inode)
+cifs_init_once(void *inode)
 {
 	struct cifsInodeInfo *cifsi = inode;
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2f58dfc70083..830f51abb971 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode)
 	kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 6f403cfba14f..448dfd597b5f 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -578,7 +578,7 @@ static struct file_system_type ecryptfs_fs_type = {
  * Initializes the ecryptfs_inode_info_cache when it is created
  */
 static void
-inode_info_init_once(struct kmem_cache *cachep, void *vptr)
+inode_info_init_once(void *vptr)
 {
 	struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
 
@@ -589,7 +589,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
-	void (*ctor)(struct kmem_cache *cache, void *obj);
+	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
 		.cache = &ecryptfs_auth_tok_list_item_cache,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index d733531b55e2..567b134fa1f1 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode)
 	kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 31308a3b0b8b..fd88c7b43e66 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -159,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 615788c6843a..8ddced384674 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -472,7 +472,7 @@ static void ext3_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1cb371dcd609..b5479b1dff14 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -595,7 +595,7 @@ static void ext4_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3a9ecac8d61f..3222f51c41cf 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode)
 
 static struct kmem_cache *fat_cache_cachep;
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct fat_cache *cache = (struct fat_cache *)foo;
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 23676f9d79ce..6d266d793e2c 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -498,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode)
 	kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d2f7d6e22e2..d2249f174e20 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -956,7 +956,7 @@ static inline void unregister_fuseblk(void)
 }
 #endif
 
-static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void fuse_inode_init_once(void *foo)
 {
 	struct inode * inode = foo;
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bcc668d0fadd..bb2cc303ac29 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,7 +24,7 @@
 #include "util.h"
 #include "glock.h"
 
-static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_inode_once(void *foo)
 {
 	struct gfs2_inode *ip = foo;
 
@@ -33,7 +33,7 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
 	ip->i_alloc = NULL;
 }
 
-static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_glock_once(void *foo)
 {
 	struct gfs2_glock *gl = foo;
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ac2ec5ef66e4..4abb1047c689 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfs_init_once(struct kmem_cache *cachep, void *p)
+static void hfs_init_once(void *p)
 {
 	struct hfs_inode_info *i = p;
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3859118531c7..e834e578c93f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfsplus_init_once(struct kmem_cache *cachep, void *p)
+static void hfsplus_init_once(void *p)
 {
 	struct hfsplus_inode_info *i = p;
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f63a699ec659..b8ae9c90ada0 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dbd01d262ca4..3f58923fb39b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -705,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 };
 
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
 
diff --git a/fs/inode.c b/fs/inode.c
index 35b6414522ea..b6726f644530 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -224,7 +224,7 @@ void inode_init_once(struct inode *inode)
 
 EXPORT_SYMBOL(inode_init_once);
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct inode * inode = (struct inode *) foo;
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 044a254d526b..26948a6033b6 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode)
 	kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct iso_inode_info *ei = foo;
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7da69eae49e4..efd401257ed9 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode)
 	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
 }
 
-static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo)
+static void jffs2_i_init_once(void *foo)
 {
 	struct jffs2_inode_info *f = foo;
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 854ff0ec574f..c350057087dd 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -182,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 
 #endif
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct metapage *mp = (struct metapage *)foo;
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 359c091d8965..3630718be395 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -760,7 +760,7 @@ static struct file_system_type jfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
 
diff --git a/fs/locks.c b/fs/locks.c
index 01490300f7cb..5eb259e3cd38 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock);
  * Initialises the fields of the file lock which are invariant for
  * free file_locks.
  */
-static void init_once(struct kmem_cache *cache, void *foo)
+static void init_once(void *foo)
 {
 	struct file_lock *lock = (struct file_lock *) foo;
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 523d73713418..d1d1eb84679d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode)
 	kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2e5ab1204dec..d642f0e5b365 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode)
 	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index df23f987da6b..52daefa2f521 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1242,7 +1242,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 #endif
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 3e76f3b216bc..4a46743b5077 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache;
 struct kmem_cache *ntfs_big_inode_cache;
 
 /* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void ntfs_big_inode_init_once(void *foo)
 {
 	ntfs_inode *ni = (ntfs_inode *)foo;
 
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index e48aba698b77..533a789c3ef8 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 	return writelen;
 }
 
-static void dlmfs_init_once(struct kmem_cache *cachep,
-			    void *foo)
+static void dlmfs_init_once(void *foo)
 {
 	struct dlmfs_inode_private *ip =
 		(struct dlmfs_inode_private *) foo;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ccecfe5094fa..2560b33889aa 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1118,7 +1118,7 @@ bail:
 	return status;
 }
 
-static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
+static void ocfs2_inode_init_once(void *data)
 {
 	struct ocfs2_inode_info *oi = data;
 
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d17b4fd204e1..9f5b054f06b9 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -430,7 +430,7 @@ static struct file_system_type openprom_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static void op_inode_init_once(struct kmem_cache * cachep, void *data)
+static void op_inode_init_once(void *data)
 {
 	struct op_inode_info *oi = (struct op_inode_info *) data;
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 02eca2ed9dd7..b37f25dc45a5 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -94,7 +94,7 @@ static void proc_destroy_inode(struct inode *inode)
 	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index b31ab78052b3..2aad1044b84c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode)
 	kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2ec748ba0bd3..879e54d35c2d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -521,7 +521,7 @@ static void reiserfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
 
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 3f13d491c7c7..8e51a2aaa977 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -577,7 +577,7 @@ static void romfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct romfs_inode_info *ei = foo;
 
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 376ef3ee6ed7..3528f40ffb0f 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode)
 	kmem_cache_free(smb_inode_cachep, SMB_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
 
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c5d60de0658f..df0d435baa48 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode)
 	kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *p)
+static void init_once(void *p)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 00eb9c68ad03..ca1e2d4e03cc 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1841,7 +1841,7 @@ static struct file_system_type ubifs_fs_type = {
 /*
  * Inode slab cache constructor.
  */
-static void inode_slab_ctor(struct kmem_cache *cachep, void *obj)
+static void inode_slab_ctor(void *obj)
 {
 	struct ubifs_inode *ui = obj;
 	inode_init_once(&ui->vfs_inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 44cc702f96cc..5698bbf83bbf 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode)
 	kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *)foo;
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 227c9d700040..3e30e40aa24d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1302,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode)
 	kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
 
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 5e9564902976..a20683cf74dd 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name)
 
 static inline kmem_zone_t *
 kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
-		     void (*construct)(kmem_zone_t *, void *))
+		     void (*construct)(void *))
 {
 	return kmem_cache_create(zone_name, size, 0, flags, construct);
 }
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 742b2c7852c1..943381284e2e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -843,7 +843,6 @@ xfs_fs_destroy_inode(
 
 STATIC void
 xfs_fs_inode_init_once(
-	kmem_zone_t		*zonep,
 	void			*vnode)
 {
 	inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 41103910f8a2..9ff8e8499403 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -58,7 +58,7 @@ int slab_is_available(void);
 
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
-			void (*)(struct kmem_cache *, void *));
+			void (*)(void *));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
 void kmem_cache_free(struct kmem_cache *, void *);
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d117ea2825a9..5bad61a93f65 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -85,7 +85,7 @@ struct kmem_cache {
 	struct kmem_cache_order_objects min;
 	gfp_t allocflags;	/* gfp flags to use on each alloc */
 	int refcount;		/* Refcount for slab cache destroy */
-	void (*ctor)(struct kmem_cache *, void *);
+	void (*ctor)(void *);
 	int inuse;		/* Offset to metadata */
 	int align;		/* Alignment */
 	const char *name;	/* Name (only for display!) */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 1fdc2eb2f6d8..474984f9e032 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -207,7 +207,7 @@ static int mqueue_get_sb(struct file_system_type *fs_type,
 	return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt);
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index b99d73e971a4..80e83e459b17 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1442,7 +1442,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
-static void sighand_ctor(struct kmem_cache *cachep, void *data)
+static void sighand_ctor(void *data)
 {
 	struct sighand_struct *sighand = data;
 
diff --git a/lib/idr.c b/lib/idr.c
index 3476f8203e97..e728c7fccc4d 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -607,7 +607,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 }
 EXPORT_SYMBOL(idr_replace);
 
-static void idr_cache_ctor(struct kmem_cache *idr_layer_cache, void *idr_layer)
+static void idr_cache_ctor(void *idr_layer)
 {
 	memset(idr_layer, 0, sizeof(struct idr_layer));
 }
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9c4f1ffa2864..be86b32bc874 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1183,7 +1183,7 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 EXPORT_SYMBOL(radix_tree_tagged);
 
 static void
-radix_tree_node_ctor(struct kmem_cache *cachep, void *node)
+radix_tree_node_ctor(void *node)
 {
 	memset(node, 0, sizeof(struct radix_tree_node));
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index abbd29f7c43f..39ae5a9bf382 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -138,7 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		anon_vma_free(anon_vma);
 }
 
-static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
+static void anon_vma_ctor(void *data)
 {
 	struct anon_vma *anon_vma = data;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 1089092aecaf..952d361774bb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2352,7 +2352,7 @@ static void shmem_destroy_inode(struct inode *inode)
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..918f04f7fef1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
 	unsigned int dflags;		/* dynamic flags */
 
 	/* constructor func */
-	void (*ctor)(struct kmem_cache *, void *);
+	void (*ctor)(void *obj);
 
 /* 5) cache creation/removal */
 	const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
  */
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
-	unsigned long flags,
-	void (*ctor)(struct kmem_cache *, void *))
+	unsigned long flags, void (*ctor)(void *))
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		 * They must also be threaded.
 		 */
 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
-			cachep->ctor(cachep, objp + obj_offset(cachep));
+			cachep->ctor(objp + obj_offset(cachep));
 
 		if (cachep->flags & SLAB_RED_ZONE) {
 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 					 cachep->buffer_size / PAGE_SIZE, 0);
 #else
 		if (cachep->ctor)
-			cachep->ctor(cachep, objp);
+			cachep->ctor(objp);
 #endif
 		slab_bufctl(slabp)[i] = i + 1;
 	}
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 #endif
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
-		cachep->ctor(cachep, objp);
+		cachep->ctor(objp);
 #if ARCH_SLAB_MINALIGN
 	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
diff --git a/mm/slob.c b/mm/slob.c
index de268eb7ac70..d8fbd4d1bfa7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -525,12 +525,11 @@ struct kmem_cache {
 	unsigned int size, align;
 	unsigned long flags;
 	const char *name;
-	void (*ctor)(struct kmem_cache *, void *);
+	void (*ctor)(void *);
 };
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
-	size_t align, unsigned long flags,
-	void (*ctor)(struct kmem_cache *, void *))
+	size_t align, unsigned long flags, void (*ctor)(void *))
 {
 	struct kmem_cache *c;
 
@@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 		b = slob_new_page(flags, get_order(c->size), node);
 
 	if (c->ctor)
-		c->ctor(c, b);
+		c->ctor(b);
 
 	return b;
 }
diff --git a/mm/slub.c b/mm/slub.c
index 77c21cf53ff9..b7e2cd5d82db 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
 
 static unsigned long kmem_cache_flags(unsigned long objsize,
 	unsigned long flags, const char *name,
-	void (*ctor)(struct kmem_cache *, void *))
+	void (*ctor)(void *))
 {
 	/*
 	 * Enable debugging if selected on the kernel commandline.
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
 static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
 static inline unsigned long kmem_cache_flags(unsigned long objsize,
 	unsigned long flags, const char *name,
-	void (*ctor)(struct kmem_cache *, void *))
+	void (*ctor)(void *))
 {
 	return flags;
 }
@@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 {
 	setup_object_debug(s, page, object);
 	if (unlikely(s->ctor))
-		s->ctor(s, object);
+		s->ctor(object);
 }
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
 		const char *name, size_t size,
 		size_t align, unsigned long flags,
-		void (*ctor)(struct kmem_cache *, void *))
+		void (*ctor)(void *))
 {
 	memset(s, 0, kmem_size);
 	s->name = name;
@@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s)
 
 static struct kmem_cache *find_mergeable(size_t size,
 		size_t align, unsigned long flags, const char *name,
-		void (*ctor)(struct kmem_cache *, void *))
+		void (*ctor)(void *))
 {
 	struct kmem_cache *s;
 
@@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size,
 }
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
-		size_t align, unsigned long flags,
-		void (*ctor)(struct kmem_cache *, void *))
+		size_t align, unsigned long flags, void (*ctor)(void *))
 {
 	struct kmem_cache *s;
 
diff --git a/net/socket.c b/net/socket.c
index 1310a82cbba7..8ef8ba81b9e2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -265,7 +265,7 @@ static void sock_destroy_inode(struct inode *inode)
 			container_of(inode, struct socket_alloc, vfs_inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct socket_alloc *ei = (struct socket_alloc *)foo;
 
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 5a9b0e7828cd..23a2b8f6dc49 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -897,7 +897,7 @@ static struct file_system_type rpc_pipe_fs_type = {
 };
 
 static void
-init_once(struct kmem_cache * cachep, void *foo)
+init_once(void *foo)
 {
 	struct rpc_inode *rpci = (struct rpc_inode *) foo;
 
-- 
cgit v1.2.3


From 88ac2921a71f788ed693bcd44731dd6bc1994640 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:43 -0700
Subject: tracehook: add linux/tracehook.h

This patch series introduces the "tracehook" interface layer of inlines in
<linux/tracehook.h>.  There are more details in the log entry for patch
01/23 and in the header file comments inside that patch.  Most of these
changes move code around with little or no change, and they should not
break anything or change any behavior.

This sets a new standard for uniform arch support to enable clean
arch-independent implementations of new debugging and tracing stuff,
denoted by CONFIG_HAVE_ARCH_TRACEHOOK.  Patch 20/23 adds that symbol to
arch/Kconfig, with comments listing everything an arch has to do before
setting "select HAVE_ARCH_TRACEHOOK".  These are elaborted a bit at:

	http://sourceware.org/systemtap/wiki/utrace/arch/HowTo

The new inlines that arch code must define or call have detailed kerneldoc
comments in the generic header files that say what is required.

No arch is obligated to do any work, and no arch's build should be broken
by these changes.  There are several steps that each arch should take so
it can set HAVE_ARCH_TRACEHOOK.  Most of these are simple.  Providing this
support will let new things people add for doing debugging and tracing of
user-level threads "just work" for your arch in the future.  For an arch
that does not provide HAVE_ARCH_TRACEHOOK, some new options for such
features will not be available for config.

I have done some arch work and will submit this to the arch maintainers
after the generic tracehook series settles in.  For now, that work is
available in my GIT repositories, and in patch and mbox-of-patches form at
http://people.redhat.com/roland/utrace/2.6-current/

This paves the way for my "utrace" work, to be submitted later.  But it is
not innately tied to that.  I hope that the tracehook series can go in
soon regardless of what eventually does or doesn't go on top of it.  For
anyone implementing any kind of new tracing/debugging plan, or just
understanding all the context of the existing ptrace implementation,
having tracehook.h makes things much easier to find and understand.

This patch:

This adds the new kernel-internal header file <linux/tracehook.h>.  This
is not yet used at all.  The comments in the header introduce what the
following series of patches is about.

The aim is to formalize and consolidate all the places that the core
kernel code and the arch code now ties into the ptrace implementation.

These patches mostly don't cause any functional change.  They just move
the details of ptrace logic out of core code into tracehook.h inlines,
where they are mostly compiled away to the same as before.  All that
changes is that everything is thoroughly documented and any future
reworking of ptrace, or addition of something new, would not have to touch
core code all over, just change the tracehook.h inlines.

The new linux/ptrace.h inlines are used by the following patches in the
new tracehook_*() inlines.  Using these helpers for the ptrace event stops
makes it simple to change or disable the old ptrace implementation of
these stops conditionally later.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h    | 33 ++++++++++++++++++++++++++++++
 include/linux/tracehook.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 include/linux/tracehook.h

(limited to 'include')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index c6f5f9dd0cee..c74abfc4c7e8 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -121,6 +121,39 @@ static inline void ptrace_unlink(struct task_struct *child)
 int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data);
 int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data);
 
+/**
+ * task_ptrace - return %PT_* flags that apply to a task
+ * @task:	pointer to &task_struct in question
+ *
+ * Returns the %PT_* flags that apply to @task.
+ */
+static inline int task_ptrace(struct task_struct *task)
+{
+	return task->ptrace;
+}
+
+/**
+ * ptrace_event - possibly stop for a ptrace event notification
+ * @mask:	%PT_* bit to check in @current->ptrace
+ * @event:	%PTRACE_EVENT_* value to report if @mask is set
+ * @message:	value for %PTRACE_GETEVENTMSG to return
+ *
+ * This checks the @mask bit to see if ptrace wants stops for this event.
+ * If so we stop, reporting @event and @message to the ptrace parent.
+ *
+ * Returns nonzero if we did a ptrace notification, zero if not.
+ *
+ * Called without locks.
+ */
+static inline int ptrace_event(int mask, int event, unsigned long message)
+{
+	if (mask && likely(!(current->ptrace & mask)))
+		return 0;
+	current->ptrace_message = message;
+	ptrace_notify((event << 8) | SIGTRAP);
+	return 1;
+}
+
 #ifndef force_successful_syscall_return
 /*
  * System call handlers that, upon successful completion, need to return a
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
new file mode 100644
index 000000000000..bea0f3eeff54
--- /dev/null
+++ b/include/linux/tracehook.h
@@ -0,0 +1,52 @@
+/*
+ * Tracing hooks
+ *
+ * Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * This file defines hook entry points called by core code where
+ * user tracing/debugging support might need to do something.  These
+ * entry points are called tracehook_*().  Each hook declared below
+ * has a detailed kerneldoc comment giving the context (locking et
+ * al) from which it is called, and the meaning of its return value.
+ *
+ * Each function here typically has only one call site, so it is ok
+ * to have some nontrivial tracehook_*() inlines.  In all cases, the
+ * fast path when no tracing is enabled should be very short.
+ *
+ * The purpose of this file and the tracehook_* layer is to consolidate
+ * the interface that the kernel core and arch code uses to enable any
+ * user debugging or tracing facility (such as ptrace).  The interfaces
+ * here are carefully documented so that maintainers of core and arch
+ * code do not need to think about the implementation details of the
+ * tracing facilities.  Likewise, maintainers of the tracing code do not
+ * need to understand all the calling core or arch code in detail, just
+ * documented circumstances of each call, such as locking conditions.
+ *
+ * If the calling core code changes so that locking is different, then
+ * it is ok to change the interface documented here.  The maintainer of
+ * core code changing should notify the maintainers of the tracing code
+ * that they need to work out the change.
+ *
+ * Some tracehook_*() inlines take arguments that the current tracing
+ * implementations might not necessarily use.  These function signatures
+ * are chosen to pass in all the information that is on hand in the
+ * caller and might conceivably be relevant to a tracer, so that the
+ * core code won't have to be updated when tracing adds more features.
+ * If a call site changes so that some of those parameters are no longer
+ * already on hand without extra work, then the tracehook_* interface
+ * can change so there is no make-work burden on the core code.  The
+ * maintainer of core code changing should notify the maintainers of the
+ * tracing code that they need to work out the change.
+ */
+
+#ifndef _LINUX_TRACEHOOK_H
+#define _LINUX_TRACEHOOK_H	1
+
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+
+#endif	/* <linux/tracehook.h> */
-- 
cgit v1.2.3


From 6341c393fcc37d58727865f1ee2f65e632e9d4f0 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:44 -0700
Subject: tracehook: exec

This moves all the ptrace hooks related to exec into tracehook.h inlines.

This also lifts the calls for tracing out of the binfmt load_binary hooks
into search_binary_handler() after it calls into the binfmt module.  This
change has no effect, since all the binfmt modules' load_binary functions
did the call at the end on success, and now search_binary_handler() does
it immediately after return if successful.  We consolidate the repeated
code, and binfmt modules no longer need to import ptrace_notify().

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32_aout.c |  6 ------
 fs/binfmt_aout.c          |  6 ------
 fs/binfmt_elf.c           |  6 ------
 fs/binfmt_elf_fdpic.c     |  7 -------
 fs/binfmt_flat.c          |  3 ---
 fs/binfmt_som.c           |  2 --
 fs/exec.c                 | 12 ++++--------
 include/linux/tracehook.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 50 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..a0e1dbe67dc1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
 	regs->r8 = regs->r9 = regs->r10 = regs->r11 =
 	regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
 	set_fs(USER_DS);
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
 	return 0;
 }
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ba4cddb92f1d..204cfd1d7676 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -444,12 +444,6 @@ beyond_if:
 	regs->gp = ex.a_gpvalue;
 #endif
 	start_thread(regs, ex.a_entry, current->mm->start_stack);
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
 	return 0;
 }
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3b6ff854d983..655ed8d30a86 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1003,12 +1003,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 #endif
 
 	start_thread(regs, elf_entry, bprm->p);
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
 	retval = 0;
 out:
 	kfree(loc);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 1b59b1edf26d..fdeadab2f18b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -433,13 +433,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	entryaddr = interp_params.entry_addr ?: exec_params.entry_addr;
 	start_thread(regs, entryaddr, current->mm->start_stack);
 
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
-
 	retval = 0;
 
 error:
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 2cb1acda3a82..56372ecf1690 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -920,9 +920,6 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	
 	start_thread(regs, start_addr, current->mm->start_stack);
 
-	if (current->ptrace & PT_PTRACED)
-		send_sig(SIGTRAP, current, 0);
-
 	return 0;
 }
 
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index fdc36bfd6a7b..68be580ba289 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -274,8 +274,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	map_hpux_gateway_page(current,current->mm);
 
 	start_thread_som(regs, som_entry, bprm->p);
-	if (current->ptrace & PT_PTRACED)
-		send_sig(SIGTRAP, current, 0);
 	return 0;
 
 	/* error cleanup */
diff --git a/fs/exec.c b/fs/exec.c
index 5e559013e303..b8792a131533 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -42,13 +42,13 @@
 #include <linux/module.h>
 #include <linux/namei.h>
 #include <linux/proc_fs.h>
-#include <linux/ptrace.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
+#include <linux/tracehook.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1071,13 +1071,8 @@ EXPORT_SYMBOL(prepare_binprm);
 
 static int unsafe_exec(struct task_struct *p)
 {
-	int unsafe = 0;
-	if (p->ptrace & PT_PTRACED) {
-		if (p->ptrace & PT_PTRACE_CAP)
-			unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			unsafe |= LSM_UNSAFE_PTRACE;
-	}
+	int unsafe = tracehook_unsafe_exec(p);
+
 	if (atomic_read(&p->fs->count) > 1 ||
 	    atomic_read(&p->files->count) > 1 ||
 	    atomic_read(&p->sighand->count) > 1)
@@ -1214,6 +1209,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			read_unlock(&binfmt_lock);
 			retval = fn(bprm, regs);
 			if (retval >= 0) {
+				tracehook_report_exec(fmt, bprm, regs);
 				put_binfmt(fmt);
 				allow_write_access(bprm->file);
 				if (bprm->file)
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index bea0f3eeff54..6276353709c1 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -48,5 +48,51 @@
 
 #include <linux/sched.h>
 #include <linux/ptrace.h>
+#include <linux/security.h>
+struct linux_binprm;
+
+/**
+ * tracehook_unsafe_exec - check for exec declared unsafe due to tracing
+ * @task:		current task doing exec
+ *
+ * Return %LSM_UNSAFE_* bits applied to an exec because of tracing.
+ *
+ * Called with task_lock() held on @task.
+ */
+static inline int tracehook_unsafe_exec(struct task_struct *task)
+{
+	int unsafe = 0;
+	int ptrace = task_ptrace(task);
+	if (ptrace & PT_PTRACED) {
+		if (ptrace & PT_PTRACE_CAP)
+			unsafe |= LSM_UNSAFE_PTRACE_CAP;
+		else
+			unsafe |= LSM_UNSAFE_PTRACE;
+	}
+	return unsafe;
+}
+
+/**
+ * tracehook_report_exec - a successful exec was completed
+ * @fmt:		&struct linux_binfmt that performed the exec
+ * @bprm:		&struct linux_binprm containing exec details
+ * @regs:		user-mode register state
+ *
+ * An exec just completed, we are shortly going to return to user mode.
+ * The freshly initialized register state can be seen and changed in @regs.
+ * The name, file and other pointers in @bprm are still on hand to be
+ * inspected, but will be freed as soon as this returns.
+ *
+ * Called with no locks, but with some kernel resources held live
+ * and a reference on @fmt->module.
+ */
+static inline void tracehook_report_exec(struct linux_binfmt *fmt,
+					 struct linux_binprm *bprm,
+					 struct pt_regs *regs)
+{
+	if (!ptrace_event(PT_TRACE_EXEC, PTRACE_EVENT_EXEC, 0) &&
+	    unlikely(task_ptrace(current) & PT_PTRACED))
+		send_sig(SIGTRAP, current, 0);
+}
 
 #endif	/* <linux/tracehook.h> */
-- 
cgit v1.2.3


From 30199f5a46aee204bf437a4f5b0740f3efe448b7 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:46 -0700
Subject: tracehook: exit

This moves the PTRACE_EVENT_EXIT tracing into a tracehook.h inline,
tracehook_report_exec().  The change has no effect, just clean-up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 15 +++++++++++++++
 kernel/exit.c             |  6 ++----
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 6276353709c1..967ab473afbc 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -95,4 +95,19 @@ static inline void tracehook_report_exec(struct linux_binfmt *fmt,
 		send_sig(SIGTRAP, current, 0);
 }
 
+/**
+ * tracehook_report_exit - task has begun to exit
+ * @exit_code:		pointer to value destined for @current->exit_code
+ *
+ * @exit_code points to the value passed to do_exit(), which tracing
+ * might change here.  This is almost the first thing in do_exit(),
+ * before freeing any resources or setting the %PF_EXITING flag.
+ *
+ * Called with no locks held.
+ */
+static inline void tracehook_report_exit(long *exit_code)
+{
+	ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code);
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/exit.c b/kernel/exit.c
index ad933bb29ec7..c3691cbc220a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/tracehook.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1029,10 +1030,7 @@ NORET_TYPE void do_exit(long code)
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
 
-	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
-		current->ptrace_message = code;
-		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
-	}
+	tracehook_report_exit(&code);
 
 	/*
 	 * We're taking recursive faults here in do_exit. Safest is to just
-- 
cgit v1.2.3


From 09a05394fe2448a4139b014936330af23fa7ec83 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:47 -0700
Subject: tracehook: clone

This moves all the ptrace initialization and tracing logic for task
creation into tracehook.h and ptrace.h inlines.  It reorganizes the code
slightly, but should not change any behavior.

There are four tracehook entry points, at each important stage of task
creation.  This keeps the interface from the core fork.c code fairly
clean, while supporting the complex setup required for ptrace or something
like it.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h    |  22 ++++++++++
 include/linux/tracehook.h | 100 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c             |  69 +++++++++++++-------------------
 3 files changed, 150 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index c74abfc4c7e8..dae6d85520fb 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -154,6 +154,28 @@ static inline int ptrace_event(int mask, int event, unsigned long message)
 	return 1;
 }
 
+/**
+ * ptrace_init_task - initialize ptrace state for a new child
+ * @child:		new child task
+ * @ptrace:		true if child should be ptrace'd by parent's tracer
+ *
+ * This is called immediately after adding @child to its parent's children
+ * list.  @ptrace is false in the normal case, and true to ptrace @child.
+ *
+ * Called with current's siglock and write_lock_irq(&tasklist_lock) held.
+ */
+static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
+{
+	INIT_LIST_HEAD(&child->ptrace_entry);
+	INIT_LIST_HEAD(&child->ptraced);
+	child->parent = child->real_parent;
+	child->ptrace = 0;
+	if (unlikely(ptrace)) {
+		child->ptrace = current->ptrace;
+		__ptrace_link(child, current->parent);
+	}
+}
+
 #ifndef force_successful_syscall_return
 /*
  * System call handlers that, upon successful completion, need to return a
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 967ab473afbc..3ebc58b59766 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -110,4 +110,104 @@ static inline void tracehook_report_exit(long *exit_code)
 	ptrace_event(PT_TRACE_EXIT, PTRACE_EVENT_EXIT, *exit_code);
 }
 
+/**
+ * tracehook_prepare_clone - prepare for new child to be cloned
+ * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
+ *
+ * This is called before a new user task is to be cloned.
+ * Its return value will be passed to tracehook_finish_clone().
+ *
+ * Called with no locks held.
+ */
+static inline int tracehook_prepare_clone(unsigned clone_flags)
+{
+	if (clone_flags & CLONE_UNTRACED)
+		return 0;
+
+	if (clone_flags & CLONE_VFORK) {
+		if (current->ptrace & PT_TRACE_VFORK)
+			return PTRACE_EVENT_VFORK;
+	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
+		if (current->ptrace & PT_TRACE_CLONE)
+			return PTRACE_EVENT_CLONE;
+	} else if (current->ptrace & PT_TRACE_FORK)
+		return PTRACE_EVENT_FORK;
+
+	return 0;
+}
+
+/**
+ * tracehook_finish_clone - new child created and being attached
+ * @child:		new child task
+ * @clone_flags:	%CLONE_* flags from clone/fork/vfork system call
+ * @trace:		return value from tracehook_clone_prepare()
+ *
+ * This is called immediately after adding @child to its parent's children list.
+ * The @trace value is that returned by tracehook_prepare_clone().
+ *
+ * Called with current's siglock and write_lock_irq(&tasklist_lock) held.
+ */
+static inline void tracehook_finish_clone(struct task_struct *child,
+					  unsigned long clone_flags, int trace)
+{
+	ptrace_init_task(child, (clone_flags & CLONE_PTRACE) || trace);
+}
+
+/**
+ * tracehook_report_clone - in parent, new child is about to start running
+ * @trace:		return value from tracehook_clone_prepare()
+ * @regs:		parent's user register state
+ * @clone_flags:	flags from parent's system call
+ * @pid:		new child's PID in the parent's namespace
+ * @child:		new child task
+ *
+ * Called after a child is set up, but before it has been started running.
+ * The @trace value is that returned by tracehook_clone_prepare().
+ * This is not a good place to block, because the child has not started yet.
+ * Suspend the child here if desired, and block in tracehook_clone_complete().
+ * This must prevent the child from self-reaping if tracehook_clone_complete()
+ * uses the @child pointer; otherwise it might have died and been released by
+ * the time tracehook_report_clone_complete() is called.
+ *
+ * Called with no locks held, but the child cannot run until this returns.
+ */
+static inline void tracehook_report_clone(int trace, struct pt_regs *regs,
+					  unsigned long clone_flags,
+					  pid_t pid, struct task_struct *child)
+{
+	if (unlikely(trace)) {
+		/*
+		 * The child starts up with an immediate SIGSTOP.
+		 */
+		sigaddset(&child->pending.signal, SIGSTOP);
+		set_tsk_thread_flag(child, TIF_SIGPENDING);
+	}
+}
+
+/**
+ * tracehook_report_clone_complete - new child is running
+ * @trace:		return value from tracehook_clone_prepare()
+ * @regs:		parent's user register state
+ * @clone_flags:	flags from parent's system call
+ * @pid:		new child's PID in the parent's namespace
+ * @child:		child task, already running
+ *
+ * This is called just after the child has started running.  This is
+ * just before the clone/fork syscall returns, or blocks for vfork
+ * child completion if @clone_flags has the %CLONE_VFORK bit set.
+ * The @child pointer may be invalid if a self-reaping child died and
+ * tracehook_report_clone() took no action to prevent it from self-reaping.
+ *
+ * Called with no locks held.
+ */
+static inline void tracehook_report_clone_complete(int trace,
+						   struct pt_regs *regs,
+						   unsigned long clone_flags,
+						   pid_t pid,
+						   struct task_struct *child)
+{
+	if (unlikely(trace))
+		ptrace_event(0, trace, pid);
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/fork.c b/kernel/fork.c
index 80e83e459b17..b42f8ed23611 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
+#include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
@@ -865,8 +866,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 
 	new_flags &= ~PF_SUPERPRIV;
 	new_flags |= PF_FORKNOEXEC;
-	if (!(clone_flags & CLONE_PTRACE))
-		p->ptrace = 0;
+	new_flags |= PF_STARTING;
 	p->flags = new_flags;
 	clear_freeze_flag(p);
 }
@@ -907,7 +907,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					struct pt_regs *regs,
 					unsigned long stack_size,
 					int __user *child_tidptr,
-					struct pid *pid)
+					struct pid *pid,
+					int trace)
 {
 	int retval;
 	struct task_struct *p;
@@ -1163,8 +1164,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_entry);
-	INIT_LIST_HEAD(&p->ptraced);
 
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
@@ -1195,7 +1194,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		p->real_parent = current->real_parent;
 	else
 		p->real_parent = current;
-	p->parent = p->real_parent;
 
 	spin_lock(&current->sighand->siglock);
 
@@ -1237,8 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	if (likely(p->pid)) {
 		list_add_tail(&p->sibling, &p->real_parent->children);
-		if (unlikely(p->ptrace & PT_PTRACED))
-			__ptrace_link(p, current->parent);
+		tracehook_finish_clone(p, clone_flags, trace);
 
 		if (thread_group_leader(p)) {
 			if (clone_flags & CLONE_NEWPID)
@@ -1323,29 +1320,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
 	struct pt_regs regs;
 
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-				&init_struct_pid);
+			    &init_struct_pid, 0);
 	if (!IS_ERR(task))
 		init_idle(task, cpu);
 
 	return task;
 }
 
-static int fork_traceflag(unsigned clone_flags)
-{
-	if (clone_flags & CLONE_UNTRACED)
-		return 0;
-	else if (clone_flags & CLONE_VFORK) {
-		if (current->ptrace & PT_TRACE_VFORK)
-			return PTRACE_EVENT_VFORK;
-	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-		if (current->ptrace & PT_TRACE_CLONE)
-			return PTRACE_EVENT_CLONE;
-	} else if (current->ptrace & PT_TRACE_FORK)
-		return PTRACE_EVENT_FORK;
-
-	return 0;
-}
-
 /*
  *  Ok, this is the main fork-routine.
  *
@@ -1380,14 +1361,14 @@ long do_fork(unsigned long clone_flags,
 		}
 	}
 
-	if (unlikely(current->ptrace)) {
-		trace = fork_traceflag (clone_flags);
-		if (trace)
-			clone_flags |= CLONE_PTRACE;
-	}
+	/*
+	 * When called from kernel_thread, don't do user tracing stuff.
+	 */
+	if (likely(user_mode(regs)))
+		trace = tracehook_prepare_clone(clone_flags);
 
 	p = copy_process(clone_flags, stack_start, regs, stack_size,
-			child_tidptr, NULL);
+			 child_tidptr, NULL, trace);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1405,24 +1386,30 @@ long do_fork(unsigned long clone_flags,
 			init_completion(&vfork);
 		}
 
-		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+		tracehook_report_clone(trace, regs, clone_flags, nr, p);
+
+		/*
+		 * We set PF_STARTING at creation in case tracing wants to
+		 * use this to distinguish a fully live task from one that
+		 * hasn't gotten to tracehook_report_clone() yet.  Now we
+		 * clear it and set the child going.
+		 */
+		p->flags &= ~PF_STARTING;
+
+		if (unlikely(clone_flags & CLONE_STOPPED)) {
 			/*
 			 * We'll start up with an immediate SIGSTOP.
 			 */
 			sigaddset(&p->pending.signal, SIGSTOP);
 			set_tsk_thread_flag(p, TIF_SIGPENDING);
-		}
-
-		if (!(clone_flags & CLONE_STOPPED))
-			wake_up_new_task(p, clone_flags);
-		else
 			__set_task_state(p, TASK_STOPPED);
-
-		if (unlikely (trace)) {
-			current->ptrace_message = nr;
-			ptrace_notify ((trace << 8) | SIGTRAP);
+		} else {
+			wake_up_new_task(p, clone_flags);
 		}
 
+		tracehook_report_clone_complete(trace, regs,
+						clone_flags, nr, p);
+
 		if (clone_flags & CLONE_VFORK) {
 			freezer_do_not_count();
 			wait_for_completion(&vfork);
-- 
cgit v1.2.3


From daded34be96b1975ff8539ff62ad8b158ce7d842 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:47 -0700
Subject: tracehook: vfork-done

This moves the PTRACE_EVENT_VFORK_DONE tracing into a tracehook.h inline,
tracehook_report_vfork_done().  The change has no effect, just clean-up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 18 ++++++++++++++++++
 kernel/fork.c             |  5 +----
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 3ebc58b59766..830e6e16097d 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -210,4 +210,22 @@ static inline void tracehook_report_clone_complete(int trace,
 		ptrace_event(0, trace, pid);
 }
 
+/**
+ * tracehook_report_vfork_done - vfork parent's child has exited or exec'd
+ * @child:		child task, already running
+ * @pid:		new child's PID in the parent's namespace
+ *
+ * Called after a %CLONE_VFORK parent has waited for the child to complete.
+ * The clone/vfork system call will return immediately after this.
+ * The @child pointer may be invalid if a self-reaping child died and
+ * tracehook_report_clone() took no action to prevent it from self-reaping.
+ *
+ * Called with no locks held.
+ */
+static inline void tracehook_report_vfork_done(struct task_struct *child,
+					       pid_t pid)
+{
+	ptrace_event(PT_TRACE_VFORK_DONE, PTRACE_EVENT_VFORK_DONE, pid);
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/fork.c b/kernel/fork.c
index b42f8ed23611..abb3ed6298f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1414,10 +1414,7 @@ long do_fork(unsigned long clone_flags,
 			freezer_do_not_count();
 			wait_for_completion(&vfork);
 			freezer_count();
-			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
-				current->ptrace_message = nr;
-				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-			}
+			tracehook_report_vfork_done(p, nr);
 		}
 	} else {
 		nr = PTR_ERR(p);
-- 
cgit v1.2.3


From dae33574dcf5211e1f43c7e45fa29f73ba3e00cb Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:48 -0700
Subject: tracehook: release_task

This moves the ptrace-related logic from release_task into tracehook.h and
ptrace.h inlines.  It provides clean hooks both before and after locking
tasklist_lock, for future tracing logic to do more cleanup without the
lock.

This also changes release_task() itself in the rare "zap_leader" case to
set the leader to EXIT_DEAD before iterating.  This maintains the
invariant that release_task() only ever handles a task in EXIT_DEAD.  This
is a common-sense invariant that is already always true except in this one
arcane case of zombie leader whose parent ignores SIGCHLD.

This change is harmless and only costs one store in this one rare case.
It keeps the expected state more consisently sane, which is nicer when
debugging weirdness in release_task().  It also lets some future code in
the tracehook entry points rely on this invariant for bookkeeping.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h    | 13 +++++++++++++
 include/linux/tracehook.h | 28 ++++++++++++++++++++++++++++
 kernel/exit.c             | 21 +++++++++------------
 3 files changed, 50 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index dae6d85520fb..ed69c03692d9 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -176,6 +176,19 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 	}
 }
 
+/**
+ * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped
+ * @task:	task in %EXIT_DEAD state
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static inline void ptrace_release_task(struct task_struct *task)
+{
+	BUG_ON(!list_empty(&task->ptraced));
+	ptrace_unlink(task);
+	BUG_ON(!list_empty(&task->ptrace_entry));
+}
+
 #ifndef force_successful_syscall_return
 /*
  * System call handlers that, upon successful completion, need to return a
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 830e6e16097d..9a5b3be2503a 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -228,4 +228,32 @@ static inline void tracehook_report_vfork_done(struct task_struct *child,
 	ptrace_event(PT_TRACE_VFORK_DONE, PTRACE_EVENT_VFORK_DONE, pid);
 }
 
+/**
+ * tracehook_prepare_release_task - task is being reaped, clean up tracing
+ * @task:		task in %EXIT_DEAD state
+ *
+ * This is called in release_task() just before @task gets finally reaped
+ * and freed.  This would be the ideal place to remove and clean up any
+ * tracing-related state for @task.
+ *
+ * Called with no locks held.
+ */
+static inline void tracehook_prepare_release_task(struct task_struct *task)
+{
+}
+
+/**
+ * tracehook_finish_release_task - task is being reaped, clean up tracing
+ * @task:		task in %EXIT_DEAD state
+ *
+ * This is called in release_task() when @task is being in the middle of
+ * being reaped.  After this, there must be no tracing entanglements.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static inline void tracehook_finish_release_task(struct task_struct *task)
+{
+	ptrace_release_task(task);
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/exit.c b/kernel/exit.c
index c3691cbc220a..da28745f7c38 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -163,27 +163,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 
-/*
- * Do final ptrace-related cleanup of a zombie being reaped.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-static void ptrace_release_task(struct task_struct *p)
-{
-	BUG_ON(!list_empty(&p->ptraced));
-	ptrace_unlink(p);
-	BUG_ON(!list_empty(&p->ptrace_entry));
-}
 
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
 	int zap_leader;
 repeat:
+	tracehook_prepare_release_task(p);
 	atomic_dec(&p->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
-	ptrace_release_task(p);
+	tracehook_finish_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -205,6 +195,13 @@ repeat:
 		 * that case.
 		 */
 		zap_leader = task_detached(leader);
+
+		/*
+		 * This maintains the invariant that release_task()
+		 * only runs on a task in EXIT_DEAD, just for sanity.
+		 */
+		if (zap_leader)
+			leader->exit_state = EXIT_DEAD;
 	}
 
 	write_unlock_irq(&tasklist_lock);
-- 
cgit v1.2.3


From 0d094efeb1e98010c6b99923f1eb7e17bf1e3a74 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:49 -0700
Subject: tracehook: tracehook_tracer_task

This adds the tracehook_tracer_task() hook to consolidate all forms of
"Who is using ptrace on me?" logic.  This is used for "TracerPid:" in
/proc and for permission checks.  We also clean up the selinux code the
called an identical accessor.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c           |  9 +++++++--
 fs/proc/base.c            | 13 +++++++++----
 include/linux/tracehook.h | 18 ++++++++++++++++++
 security/selinux/hooks.c  | 22 +++-------------------
 4 files changed, 37 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 797d775e0354..0d6eb33597c6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -80,6 +80,7 @@
 #include <linux/delayacct.h>
 #include <linux/seq_file.h>
 #include <linux/pid_namespace.h>
+#include <linux/tracehook.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -168,8 +169,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	rcu_read_lock();
 	ppid = pid_alive(p) ?
 		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-	tpid = pid_alive(p) && p->ptrace ?
-		task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
+	tpid = 0;
+	if (pid_alive(p)) {
+		struct task_struct *tracer = tracehook_tracer_task(p);
+		if (tracer)
+			tpid = task_pid_nr_ns(tracer, ns);
+	}
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a891fe4cb43b..4b74dba69a6d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -69,6 +69,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -231,10 +232,14 @@ static int check_mem_permission(struct task_struct *task)
 	 * If current is actively ptrace'ing, and would also be
 	 * permitted to freshly attach with ptrace now, permit it.
 	 */
-	if (task->parent == current && (task->ptrace & PT_PTRACED) &&
-	    task_is_stopped_or_traced(task) &&
-	    ptrace_may_access(task, PTRACE_MODE_ATTACH))
-		return 0;
+	if (task_is_stopped_or_traced(task)) {
+		int match;
+		rcu_read_lock();
+		match = (tracehook_tracer_task(task) == current);
+		rcu_read_unlock();
+		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
+			return 0;
+	}
 
 	/*
 	 * Noone else is allowed.
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 9a5b3be2503a..6468ca0fe69b 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -72,6 +72,24 @@ static inline int tracehook_unsafe_exec(struct task_struct *task)
 	return unsafe;
 }
 
+/**
+ * tracehook_tracer_task - return the task that is tracing the given task
+ * @tsk:		task to consider
+ *
+ * Returns NULL if noone is tracing @task, or the &struct task_struct
+ * pointer to its tracer.
+ *
+ * Must called under rcu_read_lock().  The pointer returned might be kept
+ * live only by RCU.  During exec, this may be called with task_lock()
+ * held on @task, still held from when tracehook_unsafe_exec() was called.
+ */
+static inline struct task_struct *tracehook_tracer_task(struct task_struct *tsk)
+{
+	if (task_ptrace(tsk) & PT_PTRACED)
+		return rcu_dereference(tsk->parent);
+	return NULL;
+}
+
 /**
  * tracehook_report_exec - a successful exec was completed
  * @fmt:		&struct linux_binfmt that performed the exec
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 63f131fc42e4..3481cde5bf15 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -25,7 +25,7 @@
 
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/security.h>
@@ -1971,22 +1971,6 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
 }
 
-/**
- * task_tracer_task - return the task that is tracing the given task
- * @task:		task to consider
- *
- * Returns NULL if noone is tracing @task, or the &struct task_struct
- * pointer to its tracer.
- *
- * Must be called under rcu_read_lock().
- */
-static struct task_struct *task_tracer_task(struct task_struct *task)
-{
-	if (task->ptrace & PT_PTRACED)
-		return rcu_dereference(task->parent);
-	return NULL;
-}
-
 /* binprm security operations */
 
 static int selinux_bprm_alloc_security(struct linux_binprm *bprm)
@@ -2238,7 +2222,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 			u32 ptsid = 0;
 
 			rcu_read_lock();
-			tracer = task_tracer_task(current);
+			tracer = tracehook_tracer_task(current);
 			if (likely(tracer != NULL)) {
 				sec = tracer->security;
 				ptsid = sec->sid;
@@ -5247,7 +5231,7 @@ static int selinux_setprocattr(struct task_struct *p,
 		   Otherwise, leave SID unchanged and fail. */
 		task_lock(p);
 		rcu_read_lock();
-		tracer = task_tracer_task(p);
+		tracer = tracehook_tracer_task(p);
 		if (tracer != NULL) {
 			struct task_security_struct *ptsec = tracer->security;
 			u32 ptsid = ptsec->sid;
-- 
cgit v1.2.3


From fa8e26ccd485216fc45c8c2dd1ec3b7ef1a0a2f8 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:50 -0700
Subject: tracehook: tracehook_expect_breakpoints

This adds tracehook_expect_breakpoints() as a formal hook for the nommu
code to use for its, "Is text-poking likely?" check at mmap time.  This
names the actual semantics the code means to test, and documents it.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 15 +++++++++++++++
 mm/nommu.c                |  4 ++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 6468ca0fe69b..e113e09b0341 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,6 +51,21 @@
 #include <linux/security.h>
 struct linux_binprm;
 
+/**
+ * tracehook_expect_breakpoints - guess if task memory might be touched
+ * @task:		current task, making a new mapping
+ *
+ * Return nonzero if @task is expected to want breakpoint insertion in
+ * its memory at some point.  A zero return is no guarantee it won't
+ * be done, but this is a hint that it's known to be likely.
+ *
+ * May be called with @task->mm->mmap_sem held for writing.
+ */
+static inline int tracehook_expect_breakpoints(struct task_struct *task)
+{
+	return (task_ptrace(task) & PT_PTRACED) != 0;
+}
+
 /**
  * tracehook_unsafe_exec - check for exec declared unsafe due to tracing
  * @task:		current task doing exec
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..5edccd9c9218 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-#include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/mount.h>
@@ -745,7 +745,7 @@ static unsigned long determine_vm_flags(struct file *file,
 	 * it's being traced - otherwise breakpoints set in it may interfere
 	 * with another untraced process
 	 */
-	if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED))
+	if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
 		vm_flags &= ~VM_MAYSHARE;
 
 	return vm_flags;
-- 
cgit v1.2.3


From c45aea27617d6a1e0aacddc3b0233f704222fcbd Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:50 -0700
Subject: tracehook: tracehook_signal_handler

This defines tracehook_signal_handler() as a hook for the arch signal
handling code to call.  It gives ptrace the opportunity to stop for a
pseudo-single-step trap immediately after signal handler setup is done.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index e113e09b0341..2d1426f8e33b 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -289,4 +289,27 @@ static inline void tracehook_finish_release_task(struct task_struct *task)
 	ptrace_release_task(task);
 }
 
+/**
+ * tracehook_signal_handler - signal handler setup is complete
+ * @sig:		number of signal being delivered
+ * @info:		siginfo_t of signal being delivered
+ * @ka:			sigaction setting that chose the handler
+ * @regs:		user register state
+ * @stepping:		nonzero if debugger single-step or block-step in use
+ *
+ * Called by the arch code after a signal handler has been set up.
+ * Register and stack state reflects the user handler about to run.
+ * Signal mask changes have already been made.
+ *
+ * Called without locks, shortly before returning to user mode
+ * (or handling more signals).
+ */
+static inline void tracehook_signal_handler(int sig, siginfo_t *info,
+					    const struct k_sigaction *ka,
+					    struct pt_regs *regs, int stepping)
+{
+	if (stepping)
+		ptrace_notify(SIGTRAP);
+}
+
 #endif	/* <linux/tracehook.h> */
-- 
cgit v1.2.3


From 35de254dc60f91004b3b5ebb1fc7b2c3093d6032 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:51 -0700
Subject: tracehook: tracehook_consider_ignored_signal

This defines tracehook_consider_ignored_signal() has a fine-grained hook
for deciding to prevent the normal short-circuit of sending an ignored
signal, as ptrace does.  There is no change, only cleanup.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 19 +++++++++++++++++++
 kernel/signal.c           | 27 ++++++++++++++++-----------
 2 files changed, 35 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 2d1426f8e33b..8cffd34f88d5 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -312,4 +312,23 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
 		ptrace_notify(SIGTRAP);
 }
 
+/**
+ * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal
+ * @task:		task receiving the signal
+ * @sig:		signal number being sent
+ * @handler:		%SIG_IGN or %SIG_DFL
+ *
+ * Return zero iff tracing doesn't care to examine this ignored signal,
+ * so it can short-circuit normal delivery and never even get queued.
+ * Either @handler is %SIG_DFL and @sig's default is ignore, or it's %SIG_IGN.
+ *
+ * Called with @task->sighand->siglock held.
+ */
+static inline int tracehook_consider_ignored_signal(struct task_struct *task,
+						    int sig,
+						    void __user *handler)
+{
+	return (task_ptrace(task) & PT_PTRACED) != 0;
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/signal.c b/kernel/signal.c
index 8715c18b27b9..9efd1cee6d0b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/signalfd.h>
+#include <linux/tracehook.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
@@ -39,24 +40,21 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
-static int __sig_ignored(struct task_struct *t, int sig)
+static void __user *sig_handler(struct task_struct *t, int sig)
 {
-	void __user *handler;
+	return t->sighand->action[sig - 1].sa.sa_handler;
+}
 
+static int sig_handler_ignored(void __user *handler, int sig)
+{
 	/* Is it explicitly or implicitly ignored? */
-
-	handler = t->sighand->action[sig - 1].sa.sa_handler;
 	return handler == SIG_IGN ||
 		(handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
 static int sig_ignored(struct task_struct *t, int sig)
 {
-	/*
-	 * Tracers always want to know about signals..
-	 */
-	if (t->ptrace & PT_PTRACED)
-		return 0;
+	void __user *handler;
 
 	/*
 	 * Blocked signals are never ignored, since the
@@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	return __sig_ignored(t, sig);
+	handler = sig_handler(t, sig);
+	if (!sig_handler_ignored(handler, sig))
+		return 0;
+
+	/*
+	 * Tracers may want to know about even ignored signals.
+	 */
+	return !tracehook_consider_ignored_signal(t, sig, handler);
 }
 
 /*
@@ -2298,7 +2303,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 		 *   (for example, SIGCHLD), shall cause the pending signal to
 		 *   be discarded, whether or not it is blocked"
 		 */
-		if (__sig_ignored(t, sig)) {
+		if (sig_handler_ignored(sig_handler(t, sig), sig)) {
 			sigemptyset(&mask);
 			sigaddset(&mask, sig);
 			rm_from_queue_full(&mask, &t->signal->shared_pending);
-- 
cgit v1.2.3


From 445a91d2fe3667fb8fc251433645f686933cf56a Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:52 -0700
Subject: tracehook: tracehook_consider_fatal_signal

This defines tracehook_consider_fatal_signal() has a fine-grained hook for
deciding to skip the special cases for a fatal signal, as ptrace does.
There is no change, only cleanup.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 21 +++++++++++++++++++++
 kernel/signal.c           |  9 +++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 8cffd34f88d5..8b4c15e208fe 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -331,4 +331,25 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task,
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
 
+/**
+ * tracehook_consider_fatal_signal - suppress special handling of fatal signal
+ * @task:		task receiving the signal
+ * @sig:		signal number being sent
+ * @handler:		%SIG_DFL or %SIG_IGN
+ *
+ * Return nonzero to prevent special handling of this termination signal.
+ * Normally @handler is %SIG_DFL.  It can be %SIG_IGN if @sig is ignored,
+ * in which case force_sig() is about to reset it to %SIG_DFL.
+ * When this returns zero, this signal might cause a quick termination
+ * that does not give the debugger a chance to intercept the signal.
+ *
+ * Called with or without @task->sighand->siglock held.
+ */
+static inline int tracehook_consider_fatal_signal(struct task_struct *task,
+						  int sig,
+						  void __user *handler)
+{
+	return (task_ptrace(task) & PT_PTRACED) != 0;
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/signal.c b/kernel/signal.c
index 9efd1cee6d0b..1a942ce32ba0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -300,12 +300,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 
 int unhandled_signal(struct task_struct *tsk, int sig)
 {
+	void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
 	if (is_global_init(tsk))
 		return 1;
-	if (tsk->ptrace & PT_PTRACED)
+	if (handler != SIG_IGN && handler != SIG_DFL)
 		return 0;
-	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
-		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
+	return !tracehook_consider_fatal_signal(tsk, sig, handler);
 }
 
 
@@ -761,7 +761,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	if (sig_fatal(p, sig) &&
 	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
 	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
+	    (sig == SIGKILL ||
+	     !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
-- 
cgit v1.2.3


From 283d7559e7712f95a05331eb0a85394c6368101b Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:52 -0700
Subject: tracehook: syscall

This adds standard tracehook.h inlines for arch code to call when
TIF_SYSCALL_TRACE has been set.  This replaces having each arch implement
the ptrace guts for its syscall tracing support.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 8b4c15e208fe..3548694a24db 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -66,6 +66,76 @@ static inline int tracehook_expect_breakpoints(struct task_struct *task)
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
 
+/*
+ * ptrace report for syscall entry and exit looks identical.
+ */
+static inline void ptrace_report_syscall(struct pt_regs *regs)
+{
+	int ptrace = task_ptrace(current);
+
+	if (!(ptrace & PT_PTRACED))
+		return;
+
+	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
+
+	/*
+	 * this isn't the same as continuing with a signal, but it will do
+	 * for normal use.  strace only continues with a signal if the
+	 * stopping signal is not SIGTRAP.  -brl
+	 */
+	if (current->exit_code) {
+		send_sig(current->exit_code, current, 1);
+		current->exit_code = 0;
+	}
+}
+
+/**
+ * tracehook_report_syscall_entry - task is about to attempt a system call
+ * @regs:		user register state of current task
+ *
+ * This will be called if %TIF_SYSCALL_TRACE has been set, when the
+ * current task has just entered the kernel for a system call.
+ * Full user register state is available here.  Changing the values
+ * in @regs can affect the system call number and arguments to be tried.
+ * It is safe to block here, preventing the system call from beginning.
+ *
+ * Returns zero normally, or nonzero if the calling arch code should abort
+ * the system call.  That must prevent normal entry so no system call is
+ * made.  If @task ever returns to user mode after this, its register state
+ * is unspecified, but should be something harmless like an %ENOSYS error
+ * return.
+ *
+ * Called without locks, just after entering kernel mode.
+ */
+static inline __must_check int tracehook_report_syscall_entry(
+	struct pt_regs *regs)
+{
+	ptrace_report_syscall(regs);
+	return 0;
+}
+
+/**
+ * tracehook_report_syscall_exit - task has just finished a system call
+ * @regs:		user register state of current task
+ * @step:		nonzero if simulating single-step or block-step
+ *
+ * This will be called if %TIF_SYSCALL_TRACE has been set, when the
+ * current task has just finished an attempted system call.  Full
+ * user register state is available here.  It is safe to block here,
+ * preventing signals from being processed.
+ *
+ * If @step is nonzero, this report is also in lieu of the normal
+ * trap that would follow the system call instruction because
+ * user_enable_block_step() or user_enable_single_step() was used.
+ * In this case, %TIF_SYSCALL_TRACE might not be set.
+ *
+ * Called without locks, just before checking for pending signals.
+ */
+static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
+{
+	ptrace_report_syscall(regs);
+}
+
 /**
  * tracehook_unsafe_exec - check for exec declared unsafe due to tracing
  * @task:		current task doing exec
-- 
cgit v1.2.3


From 7bcf6a2ca5f639b038c48711ebe6c4eca2036641 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:53 -0700
Subject: tracehook: get_signal_to_deliver

This defines the tracehook_get_signal() hook to allow tracing code to slip
in before normal signal dequeuing.  This lays the groundwork for new
tracing features that can inject synthetic signals outside the normal
queue or control the disposition of delivered signals.  The calling
convention lets tracehook_get_signal() decide both exactly what will
happen and what signal number to report in the handler/exit.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 29 +++++++++++++++++++++++++++++
 kernel/signal.c           | 38 +++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 3548694a24db..42a0d7b11959 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -422,4 +422,33 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task,
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
 
+/**
+ * tracehook_get_signal - deliver synthetic signal to traced task
+ * @task:		@current
+ * @regs:		task_pt_regs(@current)
+ * @info:		details of synthetic signal
+ * @return_ka:		sigaction for synthetic signal
+ *
+ * Return zero to check for a real pending signal normally.
+ * Return -1 after releasing the siglock to repeat the check.
+ * Return a signal number to induce an artifical signal delivery,
+ * setting *@info and *@return_ka to specify its details and behavior.
+ *
+ * The @return_ka->sa_handler value controls the disposition of the
+ * signal, no matter the signal number.  For %SIG_DFL, the return value
+ * is a representative signal to indicate the behavior (e.g. %SIGTERM
+ * for death, %SIGQUIT for core dump, %SIGSTOP for job control stop,
+ * %SIGTSTP for stop unless in an orphaned pgrp), but the signal number
+ * reported will be @info->si_signo instead.
+ *
+ * Called with @task->sighand->siglock held, before dequeuing pending signals.
+ */
+static inline int tracehook_get_signal(struct task_struct *task,
+				       struct pt_regs *regs,
+				       siginfo_t *info,
+				       struct k_sigaction *return_ka)
+{
+	return 0;
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/signal.c b/kernel/signal.c
index 1a942ce32ba0..10b31ecdd9c8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1754,17 +1754,33 @@ relock:
 		    do_signal_stop(0))
 			goto relock;
 
-		signr = dequeue_signal(current, &current->blocked, info);
-		if (!signr)
-			break; /* will return 0 */
+		/*
+		 * Tracing can induce an artifical signal and choose sigaction.
+		 * The return value in @signr determines the default action,
+		 * but @info->si_signo is the signal number we will report.
+		 */
+		signr = tracehook_get_signal(current, regs, info, return_ka);
+		if (unlikely(signr < 0))
+			goto relock;
+		if (unlikely(signr != 0))
+			ka = return_ka;
+		else {
+			signr = dequeue_signal(current, &current->blocked,
+					       info);
 
-		if (signr != SIGKILL) {
-			signr = ptrace_signal(signr, info, regs, cookie);
 			if (!signr)
-				continue;
+				break; /* will return 0 */
+
+			if (signr != SIGKILL) {
+				signr = ptrace_signal(signr, info,
+						      regs, cookie);
+				if (!signr)
+					continue;
+			}
+
+			ka = &sighand->action[signr-1];
 		}
 
-		ka = &sighand->action[signr-1];
 		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
 			continue;
 		if (ka->sa.sa_handler != SIG_DFL) {
@@ -1812,7 +1828,7 @@ relock:
 				spin_lock_irq(&sighand->siglock);
 			}
 
-			if (likely(do_signal_stop(signr))) {
+			if (likely(do_signal_stop(info->si_signo))) {
 				/* It released the siglock.  */
 				goto relock;
 			}
@@ -1833,7 +1849,7 @@ relock:
 
 		if (sig_kernel_coredump(signr)) {
 			if (print_fatal_signals)
-				print_fatal_signal(regs, signr);
+				print_fatal_signal(regs, info->si_signo);
 			/*
 			 * If it was able to dump core, this kills all
 			 * other threads in the group and synchronizes with
@@ -1842,13 +1858,13 @@ relock:
 			 * first and our do_group_exit call below will use
 			 * that value and ignore the one we pass it.
 			 */
-			do_coredump((long)signr, signr, regs);
+			do_coredump(info->si_signo, info->si_signo, regs);
 		}
 
 		/*
 		 * Death signals, no core dump.
 		 */
-		do_group_exit(signr);
+		do_group_exit(info->si_signo);
 		/* NOTREACHED */
 	}
 	spin_unlock_irq(&sighand->siglock);
-- 
cgit v1.2.3


From fa00b80b3c41a845b3d56f866fb40a2e98754c51 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:54 -0700
Subject: tracehook: job control

This defines the tracehook_notify_jctl() hook to formalize the ptrace
effects on the job control notifications.  There is no change, only
cleanup.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 20 ++++++++++++++++++++
 kernel/signal.c           | 10 +++++-----
 2 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 42a0d7b11959..6dc428dd2f38 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -451,4 +451,24 @@ static inline int tracehook_get_signal(struct task_struct *task,
 	return 0;
 }
 
+/**
+ * tracehook_notify_jctl - report about job control stop/continue
+ * @notify:		nonzero if this is the last thread in the group to stop
+ * @why:		%CLD_STOPPED or %CLD_CONTINUED
+ *
+ * This is called when we might call do_notify_parent_cldstop().
+ * It's called when about to stop for job control; we are already in
+ * %TASK_STOPPED state, about to call schedule().  It's also called when
+ * a delayed %CLD_STOPPED or %CLD_CONTINUED report is ready to be made.
+ *
+ * Return nonzero to generate a %SIGCHLD with @why, which is
+ * normal if @notify is nonzero.
+ *
+ * Called with no locks held.
+ */
+static inline int tracehook_notify_jctl(int notify, int why)
+{
+	return notify || (current->ptrace & PT_PTRACED);
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/signal.c b/kernel/signal.c
index 10b31ecdd9c8..e9e699f4b1bd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -596,9 +596,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	return security_task_kill(t, info, sig, 0);
 }
 
-/* forward decl */
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
-
 /*
  * Handle magic process-wide effects of stop/continue signals. Unlike
  * the signal actions, these happen immediately at signal-generation
@@ -1605,7 +1602,7 @@ finish_stop(int stop_count)
 	 * a group stop in progress and we are the last to stop,
 	 * report to the parent.  When ptraced, every thread reports itself.
 	 */
-	if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
+	if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
 		read_lock(&tasklist_lock);
 		do_notify_parent_cldstop(current, CLD_STOPPED);
 		read_unlock(&tasklist_lock);
@@ -1741,6 +1738,9 @@ relock:
 		signal->flags &= ~SIGNAL_CLD_MASK;
 		spin_unlock_irq(&sighand->siglock);
 
+		if (unlikely(!tracehook_notify_jctl(1, why)))
+			goto relock;
+
 		read_lock(&tasklist_lock);
 		do_notify_parent_cldstop(current->group_leader, why);
 		read_unlock(&tasklist_lock);
@@ -1906,7 +1906,7 @@ void exit_signals(struct task_struct *tsk)
 out:
 	spin_unlock_irq(&tsk->sighand->siglock);
 
-	if (unlikely(group_stop)) {
+	if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
 		read_lock(&tasklist_lock);
 		do_notify_parent_cldstop(tsk, CLD_STOPPED);
 		read_unlock(&tasklist_lock);
-- 
cgit v1.2.3


From 2b2a1ff64afbadac842bbc58c5166962cf4f7664 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:54 -0700
Subject: tracehook: death

This moves the ptrace logic in task death (exit_notify) into tracehook.h
inlines.  Some code is rearranged slightly to make things nicer.  There is
no change, only cleanup.

There is one hook called with the tasklist_lock write-locked, as ptrace
needs.  There is also a new hook called after exit_state changes and
without locks.  This is a better place for tracing work to be in the
future, since it doesn't delay the whole system with locking.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h     |  2 +-
 include/linux/tracehook.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/exit.c             | 26 ++++++++----------------
 kernel/signal.c           | 10 ++++++---
 4 files changed, 69 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index adb8077dc463..a95d84d0da95 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1796,7 +1796,7 @@ extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_
 extern int kill_pgrp(struct pid *pid, int sig, int priv);
 extern int kill_pid(struct pid *pid, int sig, int priv);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
-extern void do_notify_parent(struct task_struct *, int);
+extern int do_notify_parent(struct task_struct *, int);
 extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 6dc428dd2f38..4c50e1b57349 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -471,4 +471,56 @@ static inline int tracehook_notify_jctl(int notify, int why)
 	return notify || (current->ptrace & PT_PTRACED);
 }
 
+/**
+ * tracehook_notify_death - task is dead, ready to notify parent
+ * @task:		@current task now exiting
+ * @death_cookie:	value to pass to tracehook_report_death()
+ * @group_dead:		nonzero if this was the last thread in the group to die
+ *
+ * Return the signal number to send our parent with do_notify_parent(), or
+ * zero to send no signal and leave a zombie, or -1 to self-reap right now.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static inline int tracehook_notify_death(struct task_struct *task,
+					 void **death_cookie, int group_dead)
+{
+	if (task->exit_signal == -1)
+		return task->ptrace ? SIGCHLD : -1;
+
+	/*
+	 * If something other than our normal parent is ptracing us, then
+	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+	 * only has special meaning to our real parent.
+	 */
+	if (thread_group_empty(task) && !ptrace_reparented(task))
+		return task->exit_signal;
+
+	return task->ptrace ? SIGCHLD : 0;
+}
+
+/**
+ * tracehook_report_death - task is dead and ready to be reaped
+ * @task:		@current task now exiting
+ * @signal:		signal number sent to parent, or 0 or -1
+ * @death_cookie:	value passed back from tracehook_notify_death()
+ * @group_dead:		nonzero if this was the last thread in the group to die
+ *
+ * Thread has just become a zombie or is about to self-reap.  If positive,
+ * @signal is the signal number just sent to the parent (usually %SIGCHLD).
+ * If @signal is -1, this thread will self-reap.  If @signal is 0, this is
+ * a delayed_group_leader() zombie.  The @death_cookie was passed back by
+ * tracehook_notify_death().
+ *
+ * If normal reaping is not inhibited, @task->exit_state might be changing
+ * in parallel.
+ *
+ * Called without locks.
+ */
+static inline void tracehook_report_death(struct task_struct *task,
+					  int signal, void *death_cookie,
+					  int group_dead)
+{
+}
+
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/exit.c b/kernel/exit.c
index da28745f7c38..6cdf60712bd2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -885,7 +885,8 @@ static void forget_original_parent(struct task_struct *father)
  */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
-	int state;
+	int signal;
+	void *cookie;
 
 	/*
 	 * This does two things:
@@ -922,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	    !capable(CAP_KILL))
 		tsk->exit_signal = SIGCHLD;
 
-	/* If something other than our normal parent is ptracing us, then
-	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
-	 * only has special meaning to our real parent.
-	 */
-	if (!task_detached(tsk) && thread_group_empty(tsk)) {
-		int signal = ptrace_reparented(tsk) ?
-				SIGCHLD : tsk->exit_signal;
-		do_notify_parent(tsk, signal);
-	} else if (tsk->ptrace) {
-		do_notify_parent(tsk, SIGCHLD);
-	}
+	signal = tracehook_notify_death(tsk, &cookie, group_dead);
+	if (signal > 0)
+		signal = do_notify_parent(tsk, signal);
 
-	state = EXIT_ZOMBIE;
-	if (task_detached(tsk) && likely(!tsk->ptrace))
-		state = EXIT_DEAD;
-	tsk->exit_state = state;
+	tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE;
 
 	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
@@ -947,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	write_unlock_irq(&tasklist_lock);
 
+	tracehook_report_death(tsk, signal, cookie, group_dead);
+
 	/* If the process is dead, release it - nobody will wait for it */
-	if (state == EXIT_DEAD)
+	if (signal < 0)
 		release_task(tsk);
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index e9e699f4b1bd..0e862d3130ff 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1326,9 +1326,11 @@ static inline void __wake_up_parent(struct task_struct *p,
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
  */
-
-void do_notify_parent(struct task_struct *tsk, int sig)
+int do_notify_parent(struct task_struct *tsk, int sig)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1399,12 +1401,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 		 */
 		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
-			sig = 0;
+			sig = -1;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
 	__wake_up_parent(tsk, tsk->parent);
 	spin_unlock_irqrestore(&psig->siglock, flags);
+
+	return sig;
 }
 
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
-- 
cgit v1.2.3


From b787f7ba677840da16a2228c16571ce8a1fcb799 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:55 -0700
Subject: tracehook: force signal_pending()

This defines a new hook tracehook_force_sigpending() that lets tracing
code decide to force TIF_SIGPENDING on in recalc_sigpending().

This is not used yet, so it compiles away to nothing for now.  It lays the
groundwork for new tracing code that can interrupt a task synthetically
without actually sending a signal.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 14 ++++++++++++++
 kernel/signal.c           |  4 +++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 4c50e1b57349..43bc51b6bd33 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -422,6 +422,20 @@ static inline int tracehook_consider_fatal_signal(struct task_struct *task,
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
 
+/**
+ * tracehook_force_sigpending - let tracing force signal_pending(current) on
+ *
+ * Called when recomputing our signal_pending() flag.  Return nonzero
+ * to force the signal_pending() flag on, so that tracehook_get_signal()
+ * will be called before the next return to user mode.
+ *
+ * Called with @current->sighand->siglock held.
+ */
+static inline int tracehook_force_sigpending(void)
+{
+	return 0;
+}
+
 /**
  * tracehook_get_signal - deliver synthetic signal to traced task
  * @task:		@current
diff --git a/kernel/signal.c b/kernel/signal.c
index 0e862d3130ff..954f77d7e3bc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -134,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
 
 void recalc_sigpending(void)
 {
-	if (!recalc_sigpending_tsk(current) && !freezing(current))
+	if (unlikely(tracehook_force_sigpending()))
+		set_thread_flag(TIF_SIGPENDING);
+	else if (!recalc_sigpending_tsk(current) && !freezing(current))
 		clear_thread_flag(TIF_SIGPENDING);
 
 }
-- 
cgit v1.2.3


From 64b1208d5b0ef8859fd52ea7ae286a3eb994669b Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:56 -0700
Subject: tracehook: TIF_NOTIFY_RESUME

This adds tracehook.h inlines to enable a new arch feature in support of
user debugging/tracing.  This is not used yet, but it lays the groundwork
for a debugger to be able to wrangle a task that's possibly running,
without interrupting its syscalls in progress.

Each arch should define TIF_NOTIFY_RESUME, and in their entry.S code treat
it much like TIF_SIGPENDING.  That is, it causes you to take the slow path
when returning to user mode, where you get the full user-mode state
accessible as for signal handling or ptrace.  The arch code should check
TIF_NOTIFY_RESUME after handling TIF_SIGPENDING.  When it's set, clear it
and then call tracehook_notify_resume().

In future, tracing code will call set_notify_resume() when it wants to get
a callback in tracehook_notify_resume().

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 43bc51b6bd33..32867ab86c70 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -537,4 +537,38 @@ static inline void tracehook_report_death(struct task_struct *task,
 {
 }
 
+#ifdef TIF_NOTIFY_RESUME
+/**
+ * set_notify_resume - cause tracehook_notify_resume() to be called
+ * @task:		task that will call tracehook_notify_resume()
+ *
+ * Calling this arranges that @task will call tracehook_notify_resume()
+ * before returning to user mode.  If it's already running in user mode,
+ * it will enter the kernel and call tracehook_notify_resume() soon.
+ * If it's blocked, it will not be woken.
+ */
+static inline void set_notify_resume(struct task_struct *task)
+{
+	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
+		kick_process(task);
+}
+
+/**
+ * tracehook_notify_resume - report when about to return to user mode
+ * @regs:		user-mode registers of @current task
+ *
+ * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
+ * about to return to user mode, and the user state in @regs can be
+ * inspected or adjusted.  The caller in arch code has cleared
+ * %TIF_NOTIFY_RESUME before the call.  If the flag gets set again
+ * asynchronously, this will be called again before we return to
+ * user mode.
+ *
+ * Called without locks.
+ */
+static inline void tracehook_notify_resume(struct pt_regs *regs)
+{
+}
+#endif	/* TIF_NOTIFY_RESUME */
+
 #endif	/* <linux/tracehook.h> */
-- 
cgit v1.2.3


From 828c365cc8b8d38c346fccb19fa80d28f2240831 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:57 -0700
Subject: tracehook: asm/syscall.h

This adds asm-generic/syscall.h, which documents what a real
asm-ARCH/syscall.h file should define.  This is not used yet, but will
provide all the machine-dependent details of examining a user system call
about to begin, in progress, or just ended.

Each arch should add an asm-ARCH/syscall.h that defines all the entry
points documented in asm-generic/syscall.h, as short inlines if possible.
This lets us write new tracing code that understands user system call
registers, without any new arch-specific work.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/syscall.h | 141 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/tracehook.h     |   3 +-
 2 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 include/asm-generic/syscall.h

(limited to 'include')

diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h
new file mode 100644
index 000000000000..abcf34c2fdc7
--- /dev/null
+++ b/include/asm-generic/syscall.h
@@ -0,0 +1,141 @@
+/*
+ * Access to user system call parameters and results
+ *
+ * Copyright (C) 2008 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * This file is a stub providing documentation for what functions
+ * asm-ARCH/syscall.h files need to define.  Most arch definitions
+ * will be simple inlines.
+ *
+ * All of these functions expect to be called with no locks,
+ * and only when the caller is sure that the task of interest
+ * cannot return to user mode while we are looking at it.
+ */
+
+#ifndef _ASM_SYSCALL_H
+#define _ASM_SYSCALL_H	1
+
+struct task_struct;
+struct pt_regs;
+
+/**
+ * syscall_get_nr - find what system call a task is executing
+ * @task:	task of interest, must be blocked
+ * @regs:	task_pt_regs() of @task
+ *
+ * If @task is executing a system call or is at system call
+ * tracing about to attempt one, returns the system call number.
+ * If @task is not executing a system call, i.e. it's blocked
+ * inside the kernel for a fault or signal, returns -1.
+ *
+ * It's only valid to call this when @task is known to be blocked.
+ */
+long syscall_get_nr(struct task_struct *task, struct pt_regs *regs);
+
+/**
+ * syscall_rollback - roll back registers after an aborted system call
+ * @task:	task of interest, must be in system call exit tracing
+ * @regs:	task_pt_regs() of @task
+ *
+ * It's only valid to call this when @task is stopped for system
+ * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT),
+ * after tracehook_report_syscall_entry() returned nonzero to prevent
+ * the system call from taking place.
+ *
+ * This rolls back the register state in @regs so it's as if the
+ * system call instruction was a no-op.  The registers containing
+ * the system call number and arguments are as they were before the
+ * system call instruction.  This may not be the same as what the
+ * register state looked like at system call entry tracing.
+ */
+void syscall_rollback(struct task_struct *task, struct pt_regs *regs);
+
+/**
+ * syscall_get_error - check result of traced system call
+ * @task:	task of interest, must be blocked
+ * @regs:	task_pt_regs() of @task
+ *
+ * Returns 0 if the system call succeeded, or -ERRORCODE if it failed.
+ *
+ * It's only valid to call this when @task is stopped for tracing on exit
+ * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT.
+ */
+long syscall_get_error(struct task_struct *task, struct pt_regs *regs);
+
+/**
+ * syscall_get_return_value - get the return value of a traced system call
+ * @task:	task of interest, must be blocked
+ * @regs:	task_pt_regs() of @task
+ *
+ * Returns the return value of the successful system call.
+ * This value is meaningless if syscall_get_error() returned nonzero.
+ *
+ * It's only valid to call this when @task is stopped for tracing on exit
+ * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT.
+ */
+long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs);
+
+/**
+ * syscall_set_return_value - change the return value of a traced system call
+ * @task:	task of interest, must be blocked
+ * @regs:	task_pt_regs() of @task
+ * @error:	negative error code, or zero to indicate success
+ * @val:	user return value if @error is zero
+ *
+ * This changes the results of the system call that user mode will see.
+ * If @error is zero, the user sees a successful system call with a
+ * return value of @val.  If @error is nonzero, it's a negated errno
+ * code; the user sees a failed system call with this errno code.
+ *
+ * It's only valid to call this when @task is stopped for tracing on exit
+ * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT.
+ */
+void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs,
+			      int error, long val);
+
+/**
+ * syscall_get_arguments - extract system call parameter values
+ * @task:	task of interest, must be blocked
+ * @regs:	task_pt_regs() of @task
+ * @i:		argument index [0,5]
+ * @n:		number of arguments; n+i must be [1,6].
+ * @args:	array filled with argument values
+ *
+ * Fetches @n arguments to the system call starting with the @i'th argument
+ * (from 0 through 5).  Argument @i is stored in @args[0], and so on.
+ * An arch inline version is probably optimal when @i and @n are constants.
+ *
+ * It's only valid to call this when @task is stopped for tracing on
+ * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT.
+ * It's invalid to call this with @i + @n > 6; we only support system calls
+ * taking up to 6 arguments.
+ */
+void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs,
+			   unsigned int i, unsigned int n, unsigned long *args);
+
+/**
+ * syscall_set_arguments - change system call parameter value
+ * @task:	task of interest, must be in system call entry tracing
+ * @regs:	task_pt_regs() of @task
+ * @i:		argument index [0,5]
+ * @n:		number of arguments; n+i must be [1,6].
+ * @args:	array of argument values to store
+ *
+ * Changes @n arguments to the system call starting with the @i'th argument.
+ * @n'th argument to @val.  Argument @i gets value @args[0], and so on.
+ * An arch inline version is probably optimal when @i and @n are constants.
+ *
+ * It's only valid to call this when @task is stopped for tracing on
+ * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT.
+ * It's invalid to call this with @i + @n > 6; we only support system calls
+ * taking up to 6 arguments.
+ */
+void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs,
+			   unsigned int i, unsigned int n,
+			   const unsigned long *args);
+
+#endif	/* _ASM_SYSCALL_H */
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 32867ab86c70..589f429619c9 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -103,7 +103,8 @@ static inline void ptrace_report_syscall(struct pt_regs *regs)
  * the system call.  That must prevent normal entry so no system call is
  * made.  If @task ever returns to user mode after this, its register state
  * is unspecified, but should be something harmless like an %ENOSYS error
- * return.
+ * return.  It should preserve enough information so that syscall_rollback()
+ * can work (see asm-generic/syscall.h).
  *
  * Called without locks, just after entering kernel mode.
  */
-- 
cgit v1.2.3


From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:58 -0700
Subject: tracehook: wait_task_inactive

This extends wait_task_inactive() with a new argument so it can be used in
a "soft" mode where it will check for the task changing state unexpectedly
and back off.  There is no change to existing callers.  This lays the
groundwork to allow robust, noninvasive tracing that can try to sample a
blocked thread but back off safely if it wakes up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c |  4 ++--
 include/linux/sched.h      |  8 ++++++--
 kernel/kthread.c           |  2 +-
 kernel/ptrace.c            |  2 +-
 kernel/sched.c             | 29 +++++++++++++++++++++++++++--
 5 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 19d4493c6193..fc8f3509df27 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2626,7 +2626,7 @@ pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task)
 	/*
 	 * make sure the task is off any CPU
 	 */
-	wait_task_inactive(task);
+	wait_task_inactive(task, 0);
 
 	/* more to come... */
 
@@ -4774,7 +4774,7 @@ recheck:
 
 		UNPROTECT_CTX(ctx, flags);
 
-		wait_task_inactive(task);
+		wait_task_inactive(task, 0);
 
 		PROTECT_CTX(ctx, flags);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a95d84d0da95..f59318a0099b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1882,9 +1882,13 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_inactive(struct task_struct * p);
+extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-#define wait_task_inactive(p)	do { } while (0)
+static inline unsigned long wait_task_inactive(struct task_struct *p,
+					       long match_state)
+{
+	return 1;
+}
 #endif
 
 #define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b1..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 		return;
 	}
 	/* Must have done schedule() in kthread() before we set_task_cpu */
-	wait_task_inactive(k);
+	wait_task_inactive(k, 0);
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da6450..082b3fcb32a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	read_unlock(&tasklist_lock);
 
 	if (!ret && !kill)
-		wait_task_inactive(child);
+		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
 
 	/* All systems go.. */
 	return ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index fde1a1026359..0236958addcb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
 	int running, on_rq;
+	unsigned long ncsw;
 	struct rq *rq;
 
 	for (;;) {
@@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
 		 * return false if the runqueue has changed and p
 		 * is actually now running somewhere else!
 		 */
-		while (task_running(rq, p))
+		while (task_running(rq, p)) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
 			cpu_relax();
+		}
 
 		/*
 		 * Ok, time to look more closely! We need the rq
@@ -1910,8 +1921,20 @@ void wait_task_inactive(struct task_struct *p)
 		rq = task_rq_lock(p, &flags);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
+		ncsw = 0;
+		if (!match_state || p->state == match_state) {
+			ncsw = p->nivcsw + p->nvcsw;
+			if (unlikely(!ncsw))
+				ncsw = 1;
+		}
 		task_rq_unlock(rq, &flags);
 
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
 		/*
 		 * Was it really running after all now that we
 		 * checked with the proper locks actually held?
@@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
 		 */
 		break;
 	}
+
+	return ncsw;
 }
 
 /***
-- 
cgit v1.2.3


From bbc698636ed48b6fcd323964e0f847a6a796325d Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:59 -0700
Subject: task_current_syscall

This adds the new function task_current_syscall() on machines where the
asm/syscall.h interface is supported (CONFIG_HAVE_ARCH_TRACEHOOK).  It's
exported for modules to use in the future.  This function safely samples
the state of a blocked thread to collect what system call it is blocked
in, and the six system call argument registers.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h |  4 +++
 lib/Makefile           |  2 ++
 lib/syscall.c          | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)
 create mode 100644 lib/syscall.c

(limited to 'include')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ed69c03692d9..fd31756e1a00 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -314,6 +314,10 @@ static inline void user_enable_block_step(struct task_struct *task)
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #endif
 
+extern int task_current_syscall(struct task_struct *target, long *callno,
+				unsigned long args[6], unsigned int maxargs,
+				unsigned long *sp, unsigned long *pc);
+
 #endif
 
 #endif
diff --git a/lib/Makefile b/lib/Makefile
index 9085ad6fa53d..942c7250f603 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -78,6 +78,8 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_LMB) += lmb.o
 
+obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/syscall.c b/lib/syscall.c
new file mode 100644
index 000000000000..a4f7067f72fa
--- /dev/null
+++ b/lib/syscall.c
@@ -0,0 +1,75 @@
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <asm/syscall.h>
+
+static int collect_syscall(struct task_struct *target, long *callno,
+			   unsigned long args[6], unsigned int maxargs,
+			   unsigned long *sp, unsigned long *pc)
+{
+	struct pt_regs *regs = task_pt_regs(target);
+	if (unlikely(!regs))
+		return -EAGAIN;
+
+	*sp = user_stack_pointer(regs);
+	*pc = instruction_pointer(regs);
+
+	*callno = syscall_get_nr(target, regs);
+	if (*callno != -1L && maxargs > 0)
+		syscall_get_arguments(target, regs, 0, maxargs, args);
+
+	return 0;
+}
+
+/**
+ * task_current_syscall - Discover what a blocked task is doing.
+ * @target:		thread to examine
+ * @callno:		filled with system call number or -1
+ * @args:		filled with @maxargs system call arguments
+ * @maxargs:		number of elements in @args to fill
+ * @sp:			filled with user stack pointer
+ * @pc:			filled with user PC
+ *
+ * If @target is blocked in a system call, returns zero with *@callno
+ * set to the the call's number and @args filled in with its arguments.
+ * Registers not used for system call arguments may not be available and
+ * it is not kosher to use &struct user_regset calls while the system
+ * call is still in progress.  Note we may get this result if @target
+ * has finished its system call but not yet returned to user mode, such
+ * as when it's stopped for signal handling or syscall exit tracing.
+ *
+ * If @target is blocked in the kernel during a fault or exception,
+ * returns zero with *@callno set to -1 and does not fill in @args.
+ * If so, it's now safe to examine @target using &struct user_regset
+ * get() calls as long as we're sure @target won't return to user mode.
+ *
+ * Returns -%EAGAIN if @target does not remain blocked.
+ *
+ * Returns -%EINVAL if @maxargs is too large (maximum is six).
+ */
+int task_current_syscall(struct task_struct *target, long *callno,
+			 unsigned long args[6], unsigned int maxargs,
+			 unsigned long *sp, unsigned long *pc)
+{
+	long state;
+	unsigned long ncsw;
+
+	if (unlikely(maxargs > 6))
+		return -EINVAL;
+
+	if (target == current)
+		return collect_syscall(target, callno, args, maxargs, sp, pc);
+
+	state = target->state;
+	if (unlikely(!state))
+		return -EAGAIN;
+
+	ncsw = wait_task_inactive(target, state);
+	if (unlikely(!ncsw) ||
+	    unlikely(collect_syscall(target, callno, args, maxargs, sp, pc)) ||
+	    unlikely(wait_task_inactive(target, state) != ncsw))
+		return -EAGAIN;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(task_current_syscall);
-- 
cgit v1.2.3


From 9d8fddfb17aaee4ffc5e3d0560620d0fa8b50a42 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 19:46:23 -0700
Subject: mm/allocpercpu.c: make 4 functions static

This patch makes the following needlessly global functions static:
 - percpu_depopulate()
 - __percpu_depopulate_mask()
 - percpu_populate()
 - __percpu_populate_mask()

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu.h | 29 -----------------------------
 mm/allocpercpu.c       | 20 +++++++++++---------
 2 files changed, 11 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 4cdd393e71e1..fac3337547eb 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -74,11 +74,6 @@ struct percpu_data {
         (__typeof__(ptr))__p->ptrs[(cpu)];	          \
 })
 
-extern void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu);
-extern void percpu_depopulate(void *__pdata, int cpu);
-extern int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
-				  cpumask_t *mask);
-extern void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask);
 extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
 extern void percpu_free(void *__pdata);
 
@@ -86,26 +81,6 @@ extern void percpu_free(void *__pdata);
 
 #define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
-static inline void percpu_depopulate(void *__pdata, int cpu)
-{
-}
-
-static inline void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
-{
-}
-
-static inline void *percpu_populate(void *__pdata, size_t size, gfp_t gfp,
-				    int cpu)
-{
-	return percpu_ptr(__pdata, cpu);
-}
-
-static inline int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
-					 cpumask_t *mask)
-{
-	return 0;
-}
-
 static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
 {
 	return kzalloc(size, gfp);
@@ -118,10 +93,6 @@ static inline void percpu_free(void *__pdata)
 
 #endif /* CONFIG_SMP */
 
-#define percpu_populate_mask(__pdata, size, gfp, mask) \
-	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
-#define percpu_depopulate_mask(__pdata, mask) \
-	__percpu_depopulate_mask((__pdata), &(mask))
 #define percpu_alloc_mask(size, gfp, mask) \
 	__percpu_alloc_mask((size), (gfp), &(mask))
 
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 843364594e23..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
  * Depopulating per-cpu data for a cpu going offline would be a typical
  * use case. You need to register a cpu hotplug handler for that purpose.
  */
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 
 	kfree(pdata->ptrs[cpu]);
 	pdata->ptrs[cpu] = NULL;
 }
-EXPORT_SYMBOL_GPL(percpu_depopulate);
 
 /**
  * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
  * @__pdata: per-cpu data to depopulate
  * @mask: depopulate per-cpu data for cpu's selected through mask bits
  */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
 {
 	int cpu;
 	for_each_cpu_mask_nr(cpu, *mask)
 		percpu_depopulate(__pdata, cpu);
 }
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+#define percpu_depopulate_mask(__pdata, mask) \
+	__percpu_depopulate_mask((__pdata), &(mask))
 
 /**
  * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
  * use case. You need to register a cpu hotplug handler for that purpose.
  * Per-cpu object is populated with zeroed buffer.
  */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 	int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 		pdata->ptrs[cpu] = kzalloc(size, gfp);
 	return pdata->ptrs[cpu];
 }
-EXPORT_SYMBOL_GPL(percpu_populate);
 
 /**
  * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,8 +79,8 @@ EXPORT_SYMBOL_GPL(percpu_populate);
  *
  * Per-cpu objects are populated with zeroed buffers.
  */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
-			   cpumask_t *mask)
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+				  cpumask_t *mask)
 {
 	cpumask_t populated;
 	int cpu;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 			cpu_set(cpu, populated);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
  * percpu_alloc_mask - initial setup of per-cpu data
-- 
cgit v1.2.3


From 15f59adae001766a2c7f7fe4f196387bb04bcff5 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 19:46:23 -0700
Subject: make mm/memory.c:print_bad_pte() static

This patch makes the needlessly global print_bad_pte() static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 mm/memory.c        | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3fd70d6029f..6e695eaab4ce 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -810,7 +810,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
-void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
 extern void do_invalidatepage(struct page *page, unsigned long offset);
diff --git a/mm/memory.c b/mm/memory.c
index 262e3eb6601a..a8ca04faaea6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -374,7 +374,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+			  unsigned long vaddr)
 {
 	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
 			"vm_flags = %lx, vaddr = %lx\n",
-- 
cgit v1.2.3


From 7c363b8c6536f26934172d3c46f0bbec01a97c61 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 19:46:24 -0700
Subject: mm/swapfile.c: make code static

This patch makes the following needlessly global code static:
 - swap_lock
 - nr_swapfiles
 - struct swap_list

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 3 ---
 mm/swapfile.c        | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0b3377650c85..de40f169a4e4 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -237,7 +237,6 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
-extern unsigned int nr_swapfiles;
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
 extern swp_entry_t get_swap_page_of_type(int);
@@ -254,8 +253,6 @@ extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
 
-extern spinlock_t swap_lock;
-
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
 extern void grab_swap_token(void);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index af283933c14e..6beb6251e99d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,8 +33,8 @@
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 
-DEFINE_SPINLOCK(swap_lock);
-unsigned int nr_swapfiles;
+static DEFINE_SPINLOCK(swap_lock);
+static unsigned int nr_swapfiles;
 long total_swap_pages;
 static int swap_overflow;
 static int least_priority;
@@ -44,7 +44,7 @@ static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
 
-struct swap_list_t swap_list = {-1, -1};
+static struct swap_list_t swap_list = {-1, -1};
 
 static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-- 
cgit v1.2.3


From 9580d85f9cdb076c4bfb467bc6c0d3c5e499957a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 19:46:25 -0700
Subject: drivers/char/rtc.c: make 2 functions static

The following functions can now become static:
 - rtc_interrupt()
 - rtc_get_rtc_time()

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Bernhard Walle <bwalle@suse.de>
Acked-by: Paul Gortmaker <p_gortmaker@yahoo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/rtc.c  | 5 +++--
 include/linux/rtc.h | 2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index dbefbb30ed44..d9799e2bcfbf 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -144,6 +144,7 @@ static ssize_t rtc_read(struct file *file, char __user *buf,
 			size_t count, loff_t *ppos);
 
 static long rtc_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+static void rtc_get_rtc_time(struct rtc_time *rtc_tm);
 
 #ifdef RTC_IRQ
 static unsigned int rtc_poll(struct file *file, poll_table *wait);
@@ -235,7 +236,7 @@ static inline unsigned char rtc_is_updating(void)
  *	(See ./arch/XXXX/kernel/time.c for the set_rtc_mmss() function.)
  */
 
-irqreturn_t rtc_interrupt(int irq, void *dev_id)
+static irqreturn_t rtc_interrupt(int irq, void *dev_id)
 {
 	/*
 	 *	Can be an alarm interrupt, update complete interrupt,
@@ -1303,7 +1304,7 @@ static int rtc_proc_open(struct inode *inode, struct file *file)
 }
 #endif
 
-void rtc_get_rtc_time(struct rtc_time *rtc_tm)
+static void rtc_get_rtc_time(struct rtc_time *rtc_tm)
 {
 	unsigned long uip_watchdog = jiffies, flags;
 	unsigned char ctrl;
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index b01fe004cb5e..91f597ad6acc 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -225,8 +225,6 @@ typedef struct rtc_task {
 int rtc_register(rtc_task_t *task);
 int rtc_unregister(rtc_task_t *task);
 int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);
-void rtc_get_rtc_time(struct rtc_time *rtc_tm);
-irqreturn_t rtc_interrupt(int irq, void *dev_id);
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From aafe0ad81d7458b6c61eeb39068432683c70c9a9 Mon Sep 17 00:00:00 2001
From: Ian Molton <spyro@f2s.com>
Date: Sat, 12 Jul 2008 11:55:42 +0100
Subject: [ARM] pxa: PXA25x UDC - Fix warning during build

Fixes an unterminated ' warning building PXA25X UDC.

Signed-off-by: Ian Molton <spyro@f2s.com>
---
 include/asm-arm/arch-pxa/pxa25x-udc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-arm/arch-pxa/pxa25x-udc.h b/include/asm-arm/arch-pxa/pxa25x-udc.h
index 840305916b6d..1b80a4805a60 100644
--- a/include/asm-arm/arch-pxa/pxa25x-udc.h
+++ b/include/asm-arm/arch-pxa/pxa25x-udc.h
@@ -2,7 +2,7 @@
 #define _ASM_ARCH_PXA25X_UDC_H
 
 #ifdef _ASM_ARCH_PXA27X_UDC_H
-#error You can't include both PXA25x and PXA27x UDC support
+#error "You can't include both PXA25x and PXA27x UDC support"
 #endif
 
 #define UDC_RES1	__REG(0x40600004)  /* UDC Undocumented - Reserved1 */
-- 
cgit v1.2.3