From 63687a528c39a67c1a213cdffa09feb0e6af9dbe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: move tracedata to RODATA .. allowing it to be write-protected just as other read-only data under CONFIG_DEBUG_RODATA. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- drivers/base/power/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/base') diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 2b4b392dcbc1..87a7f1d02578 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(set_trace_device); * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ -void generate_resume_trace(void *tracedata, unsigned int user) +void generate_resume_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); -- cgit v1.2.3 From 1eede070a59e1cc73da51e1aaa00d9ab86572cfc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 May 2008 23:00:01 +0200 Subject: Introduce new top level suspend and hibernation callbacks Introduce 'struct pm_ops' and 'struct pm_ext_ops' ('ext' meaning 'extended') representing suspend and hibernation operations for bus types, device classes, device types and device drivers. Modify the PM core to use 'struct pm_ops' and 'struct pm_ext_ops' objects, if defined, instead of the ->suspend(), ->resume(), ->suspend_late(), and ->resume_early() callbacks (the old callbacks will be considered as legacy and gradually phased out). The main purpose of doing this is to separate suspend (aka S2RAM and standby) callbacks from hibernation callbacks in such a way that the new callbacks won't take arguments and the semantics of each of them will be clearly specified. This has been requested for multiple times by many people, including Linus himself, and the reason is that within the current scheme if ->resume() is called, for example, it's difficult to say why it's been called (ie. is it a resume from RAM or from hibernation or a suspend/hibernation failure etc.?). The second purpose is to make the suspend/hibernation callbacks more flexible so that device drivers can handle more than they can within the current scheme. For example, some drivers may need to prevent new children of the device from being registered before their ->suspend() callbacks are executed or they may want to carry out some operations requiring the availability of some other devices, not directly bound via the parent-child relationship, in order to prepare for the execution of ->suspend(), etc. Ultimately, we'd like to stop using the freezing of tasks for suspend and therefore the drivers' suspend/hibernation code will have to take care of the handling of the user space during suspend/hibernation. That, in turn, would be difficult within the current scheme, without the new ->prepare() and ->complete() callbacks. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- arch/x86/kernel/apm_32.c | 8 +- drivers/base/power/main.c | 675 +++++++++++++++++++++++++++++++++++---------- drivers/base/power/power.h | 2 +- drivers/base/power/trace.c | 4 +- include/linux/device.h | 9 + include/linux/pm.h | 314 +++++++++++++++++++-- kernel/power/disk.c | 22 +- kernel/power/main.c | 6 +- 8 files changed, 845 insertions(+), 195 deletions(-) (limited to 'drivers/base') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9290e29013..c1735f61a2c0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1211,9 +1211,9 @@ static int suspend(int vetoable) if (err != APM_SUCCESS) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; - device_power_up(); + device_power_up(PMSG_RESUME); local_irq_enable(); - device_resume(); + device_resume(PMSG_RESUME); queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1238,7 +1238,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); - device_power_up(); + device_power_up(PMSG_RESUME); local_irq_enable(); } @@ -1324,7 +1324,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - device_resume(); + device_resume(PMSG_RESUME); queue_event(event, NULL); } ignore_normal_resume = 0; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 45cc3d9eacb8..d571204aaff7 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -12,11 +12,9 @@ * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * - * A different set of lists than the global subsystem list are used to - * keep track of power info because we use different lists to hold - * devices based on what stage of the power management process they - * are in. The power domain dependencies may also differ from the - * ancestral dependencies that the subsystem list maintains. + * A separate list is used for keeping track of power info, because the power + * domain dependencies may differ from the ancestral dependencies that the + * subsystem list maintains. */ #include @@ -30,31 +28,40 @@ #include "power.h" /* - * The entries in the dpm_active list are in a depth first order, simply + * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * - * All the other lists are kept in the same order, for consistency. - * However the lists aren't always traversed in the same order. - * Semaphores must be acquired from the top (i.e., front) down - * and released in the opposite order. Devices must be suspended - * from the bottom (i.e., end) up and resumed in the opposite order. - * That way no parent will be suspended while it still has an active - * child. - * * Since device_pm_add() may be called with a device semaphore held, * we must never try to acquire a device semaphore while holding * dpm_list_mutex. */ -LIST_HEAD(dpm_active); -static LIST_HEAD(dpm_off); -static LIST_HEAD(dpm_off_irq); +LIST_HEAD(dpm_list); static DEFINE_MUTEX(dpm_list_mtx); -/* 'true' if all devices have been suspended, protected by dpm_list_mtx */ -static bool all_sleeping; +/* + * Set once the preparation of devices for a PM transition has started, reset + * before starting to resume devices. Protected by dpm_list_mtx. + */ +static bool transition_started; + +/** + * device_pm_lock - lock the list of active devices used by the PM core + */ +void device_pm_lock(void) +{ + mutex_lock(&dpm_list_mtx); +} + +/** + * device_pm_unlock - unlock the list of active devices used by the PM core + */ +void device_pm_unlock(void) +{ + mutex_unlock(&dpm_list_mtx); +} /** * device_pm_add - add a device to the list of active devices @@ -68,17 +75,25 @@ int device_pm_add(struct device *dev) dev->bus ? dev->bus->name : "No Bus", kobject_name(&dev->kobj)); mutex_lock(&dpm_list_mtx); - if ((dev->parent && dev->parent->power.sleeping) || all_sleeping) { - if (dev->parent->power.sleeping) - dev_warn(dev, "parent %s is sleeping\n", + if (dev->parent) { + if (dev->parent->power.status >= DPM_SUSPENDING) { + dev_warn(dev, "parent %s is sleeping, will not add\n", dev->parent->bus_id); - else - dev_warn(dev, "all devices are sleeping\n"); + WARN_ON(true); + } + } else if (transition_started) { + /* + * We refuse to register parentless devices while a PM + * transition is in progress in order to avoid leaving them + * unhandled down the road + */ WARN_ON(true); } error = dpm_sysfs_add(dev); - if (!error) - list_add_tail(&dev->power.entry, &dpm_active); + if (!error) { + dev->power.status = DPM_ON; + list_add_tail(&dev->power.entry, &dpm_list); + } mutex_unlock(&dpm_list_mtx); return error; } @@ -100,73 +115,243 @@ void device_pm_remove(struct device *dev) mutex_unlock(&dpm_list_mtx); } +/** + * pm_op - execute the PM operation appropiate for given PM event + * @dev: Device. + * @ops: PM operations to choose from. + * @state: PM transition of the system being carried out. + */ +static int pm_op(struct device *dev, struct pm_ops *ops, pm_message_t state) +{ + int error = 0; + + switch (state.event) { +#ifdef CONFIG_SUSPEND + case PM_EVENT_SUSPEND: + if (ops->suspend) { + error = ops->suspend(dev); + suspend_report_result(ops->suspend, error); + } + break; + case PM_EVENT_RESUME: + if (ops->resume) { + error = ops->resume(dev); + suspend_report_result(ops->resume, error); + } + break; +#endif /* CONFIG_SUSPEND */ +#ifdef CONFIG_HIBERNATION + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + if (ops->freeze) { + error = ops->freeze(dev); + suspend_report_result(ops->freeze, error); + } + break; + case PM_EVENT_HIBERNATE: + if (ops->poweroff) { + error = ops->poweroff(dev); + suspend_report_result(ops->poweroff, error); + } + break; + case PM_EVENT_THAW: + case PM_EVENT_RECOVER: + if (ops->thaw) { + error = ops->thaw(dev); + suspend_report_result(ops->thaw, error); + } + break; + case PM_EVENT_RESTORE: + if (ops->restore) { + error = ops->restore(dev); + suspend_report_result(ops->restore, error); + } + break; +#endif /* CONFIG_HIBERNATION */ + default: + error = -EINVAL; + } + return error; +} + +/** + * pm_noirq_op - execute the PM operation appropiate for given PM event + * @dev: Device. + * @ops: PM operations to choose from. + * @state: PM transition of the system being carried out. + * + * The operation is executed with interrupts disabled by the only remaining + * functional CPU in the system. + */ +static int pm_noirq_op(struct device *dev, struct pm_ext_ops *ops, + pm_message_t state) +{ + int error = 0; + + switch (state.event) { +#ifdef CONFIG_SUSPEND + case PM_EVENT_SUSPEND: + if (ops->suspend_noirq) { + error = ops->suspend_noirq(dev); + suspend_report_result(ops->suspend_noirq, error); + } + break; + case PM_EVENT_RESUME: + if (ops->resume_noirq) { + error = ops->resume_noirq(dev); + suspend_report_result(ops->resume_noirq, error); + } + break; +#endif /* CONFIG_SUSPEND */ +#ifdef CONFIG_HIBERNATION + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + if (ops->freeze_noirq) { + error = ops->freeze_noirq(dev); + suspend_report_result(ops->freeze_noirq, error); + } + break; + case PM_EVENT_HIBERNATE: + if (ops->poweroff_noirq) { + error = ops->poweroff_noirq(dev); + suspend_report_result(ops->poweroff_noirq, error); + } + break; + case PM_EVENT_THAW: + case PM_EVENT_RECOVER: + if (ops->thaw_noirq) { + error = ops->thaw_noirq(dev); + suspend_report_result(ops->thaw_noirq, error); + } + break; + case PM_EVENT_RESTORE: + if (ops->restore_noirq) { + error = ops->restore_noirq(dev); + suspend_report_result(ops->restore_noirq, error); + } + break; +#endif /* CONFIG_HIBERNATION */ + default: + error = -EINVAL; + } + return error; +} + +static char *pm_verb(int event) +{ + switch (event) { + case PM_EVENT_SUSPEND: + return "suspend"; + case PM_EVENT_RESUME: + return "resume"; + case PM_EVENT_FREEZE: + return "freeze"; + case PM_EVENT_QUIESCE: + return "quiesce"; + case PM_EVENT_HIBERNATE: + return "hibernate"; + case PM_EVENT_THAW: + return "thaw"; + case PM_EVENT_RESTORE: + return "restore"; + case PM_EVENT_RECOVER: + return "recover"; + default: + return "(unknown PM event)"; + } +} + +static void pm_dev_dbg(struct device *dev, pm_message_t state, char *info) +{ + dev_dbg(dev, "%s%s%s\n", info, pm_verb(state.event), + ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? + ", may wakeup" : ""); +} + +static void pm_dev_err(struct device *dev, pm_message_t state, char *info, + int error) +{ + printk(KERN_ERR "PM: Device %s failed to %s%s: error %d\n", + kobject_name(&dev->kobj), pm_verb(state.event), info, error); +} + /*------------------------- Resume routines -------------------------*/ /** - * resume_device_early - Power on one device (early resume). + * resume_device_noirq - Power on one device (early resume). * @dev: Device. + * @state: PM transition of the system being carried out. * * Must be called with interrupts disabled. */ -static int resume_device_early(struct device *dev) +static int resume_device_noirq(struct device *dev, pm_message_t state) { int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); - if (dev->bus && dev->bus->resume_early) { - dev_dbg(dev, "EARLY resume\n"); + if (!dev->bus) + goto End; + + if (dev->bus->pm) { + pm_dev_dbg(dev, state, "EARLY "); + error = pm_noirq_op(dev, dev->bus->pm, state); + } else if (dev->bus->resume_early) { + pm_dev_dbg(dev, state, "legacy EARLY "); error = dev->bus->resume_early(dev); } - + End: TRACE_RESUME(error); return error; } /** * dpm_power_up - Power on all regular (non-sysdev) devices. + * @state: PM transition of the system being carried out. * - * Walk the dpm_off_irq list and power each device up. This - * is used for devices that required they be powered down with - * interrupts disabled. As devices are powered on, they are moved - * to the dpm_off list. + * Execute the appropriate "noirq resume" callback for all devices marked + * as DPM_OFF_IRQ. * * Must be called with interrupts disabled and only one CPU running. */ -static void dpm_power_up(void) +static void dpm_power_up(pm_message_t state) { + struct device *dev; - while (!list_empty(&dpm_off_irq)) { - struct list_head *entry = dpm_off_irq.next; - struct device *dev = to_device(entry); + list_for_each_entry(dev, &dpm_list, power.entry) + if (dev->power.status > DPM_OFF) { + int error; - list_move_tail(entry, &dpm_off); - resume_device_early(dev); - } + dev->power.status = DPM_OFF; + error = resume_device_noirq(dev, state); + if (error) + pm_dev_err(dev, state, " early", error); + } } /** * device_power_up - Turn on all devices that need special attention. + * @state: PM transition of the system being carried out. * * Power on system devices, then devices that required we shut them down * with interrupts disabled. * * Must be called with interrupts disabled. */ -void device_power_up(void) +void device_power_up(pm_message_t state) { sysdev_resume(); - dpm_power_up(); + dpm_power_up(state); } EXPORT_SYMBOL_GPL(device_power_up); /** * resume_device - Restore state for one device. * @dev: Device. - * + * @state: PM transition of the system being carried out. */ -static int resume_device(struct device *dev) +static int resume_device(struct device *dev, pm_message_t state) { int error = 0; @@ -175,21 +360,40 @@ static int resume_device(struct device *dev) down(&dev->sem); - if (dev->bus && dev->bus->resume) { - dev_dbg(dev,"resuming\n"); - error = dev->bus->resume(dev); + if (dev->bus) { + if (dev->bus->pm) { + pm_dev_dbg(dev, state, ""); + error = pm_op(dev, &dev->bus->pm->base, state); + } else if (dev->bus->resume) { + pm_dev_dbg(dev, state, "legacy "); + error = dev->bus->resume(dev); + } + if (error) + goto End; } - if (!error && dev->type && dev->type->resume) { - dev_dbg(dev,"resuming\n"); - error = dev->type->resume(dev); + if (dev->type) { + if (dev->type->pm) { + pm_dev_dbg(dev, state, "type "); + error = pm_op(dev, dev->type->pm, state); + } else if (dev->type->resume) { + pm_dev_dbg(dev, state, "legacy type "); + error = dev->type->resume(dev); + } + if (error) + goto End; } - if (!error && dev->class && dev->class->resume) { - dev_dbg(dev,"class resume\n"); - error = dev->class->resume(dev); + if (dev->class) { + if (dev->class->pm) { + pm_dev_dbg(dev, state, "class "); + error = pm_op(dev, dev->class->pm, state); + } else if (dev->class->resume) { + pm_dev_dbg(dev, state, "legacy class "); + error = dev->class->resume(dev); + } } - + End: up(&dev->sem); TRACE_RESUME(error); @@ -198,78 +402,161 @@ static int resume_device(struct device *dev) /** * dpm_resume - Resume every device. + * @state: PM transition of the system being carried out. * - * Resume the devices that have either not gone through - * the late suspend, or that did go through it but also - * went through the early resume. + * Execute the appropriate "resume" callback for all devices the status of + * which indicates that they are inactive. + */ +static void dpm_resume(pm_message_t state) +{ + struct list_head list; + + INIT_LIST_HEAD(&list); + mutex_lock(&dpm_list_mtx); + transition_started = false; + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.next); + + get_device(dev); + if (dev->power.status >= DPM_OFF) { + int error; + + dev->power.status = DPM_RESUMING; + mutex_unlock(&dpm_list_mtx); + + error = resume_device(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) + pm_dev_err(dev, state, "", error); + } else if (dev->power.status == DPM_SUSPENDING) { + /* Allow new children of the device to be registered */ + dev->power.status = DPM_RESUMING; + } + if (!list_empty(&dev->power.entry)) + list_move_tail(&dev->power.entry, &list); + put_device(dev); + } + list_splice(&list, &dpm_list); + mutex_unlock(&dpm_list_mtx); +} + +/** + * complete_device - Complete a PM transition for given device + * @dev: Device. + * @state: PM transition of the system being carried out. + */ +static void complete_device(struct device *dev, pm_message_t state) +{ + down(&dev->sem); + + if (dev->class && dev->class->pm && dev->class->pm->complete) { + pm_dev_dbg(dev, state, "completing class "); + dev->class->pm->complete(dev); + } + + if (dev->type && dev->type->pm && dev->type->pm->complete) { + pm_dev_dbg(dev, state, "completing type "); + dev->type->pm->complete(dev); + } + + if (dev->bus && dev->bus->pm && dev->bus->pm->base.complete) { + pm_dev_dbg(dev, state, "completing "); + dev->bus->pm->base.complete(dev); + } + + up(&dev->sem); +} + +/** + * dpm_complete - Complete a PM transition for all devices. + * @state: PM transition of the system being carried out. * - * Take devices from the dpm_off_list, resume them, - * and put them on the dpm_locked list. + * Execute the ->complete() callbacks for all devices that are not marked + * as DPM_ON. */ -static void dpm_resume(void) +static void dpm_complete(pm_message_t state) { + struct list_head list; + + INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); - all_sleeping = false; - while(!list_empty(&dpm_off)) { - struct list_head *entry = dpm_off.next; - struct device *dev = to_device(entry); + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.prev); - list_move_tail(entry, &dpm_active); - dev->power.sleeping = false; - mutex_unlock(&dpm_list_mtx); - resume_device(dev); - mutex_lock(&dpm_list_mtx); + get_device(dev); + if (dev->power.status > DPM_ON) { + dev->power.status = DPM_ON; + mutex_unlock(&dpm_list_mtx); + + complete_device(dev, state); + + mutex_lock(&dpm_list_mtx); + } + if (!list_empty(&dev->power.entry)) + list_move(&dev->power.entry, &list); + put_device(dev); } + list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); } /** * device_resume - Restore state of each device in system. + * @state: PM transition of the system being carried out. * * Resume all the devices, unlock them all, and allow new * devices to be registered once again. */ -void device_resume(void) +void device_resume(pm_message_t state) { might_sleep(); - dpm_resume(); + dpm_resume(state); + dpm_complete(state); } EXPORT_SYMBOL_GPL(device_resume); /*------------------------- Suspend routines -------------------------*/ -static inline char *suspend_verb(u32 event) +/** + * resume_event - return a PM message representing the resume event + * corresponding to given sleep state. + * @sleep_state: PM message representing a sleep state. + */ +static pm_message_t resume_event(pm_message_t sleep_state) { - switch (event) { - case PM_EVENT_SUSPEND: return "suspend"; - case PM_EVENT_FREEZE: return "freeze"; - case PM_EVENT_PRETHAW: return "prethaw"; - default: return "(unknown suspend event)"; + switch (sleep_state.event) { + case PM_EVENT_SUSPEND: + return PMSG_RESUME; + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return PMSG_RECOVER; + case PM_EVENT_HIBERNATE: + return PMSG_RESTORE; } -} - -static void -suspend_device_dbg(struct device *dev, pm_message_t state, char *info) -{ - dev_dbg(dev, "%s%s%s\n", info, suspend_verb(state.event), - ((state.event == PM_EVENT_SUSPEND) && device_may_wakeup(dev)) ? - ", may wakeup" : ""); + return PMSG_ON; } /** - * suspend_device_late - Shut down one device (late suspend). + * suspend_device_noirq - Shut down one device (late suspend). * @dev: Device. - * @state: Power state device is entering. + * @state: PM transition of the system being carried out. * * This is called with interrupts off and only a single CPU running. */ -static int suspend_device_late(struct device *dev, pm_message_t state) +static int suspend_device_noirq(struct device *dev, pm_message_t state) { int error = 0; - if (dev->bus && dev->bus->suspend_late) { - suspend_device_dbg(dev, state, "LATE "); + if (!dev->bus) + return 0; + + if (dev->bus->pm) { + pm_dev_dbg(dev, state, "LATE "); + error = pm_noirq_op(dev, dev->bus->pm, state); + } else if (dev->bus->suspend_late) { + pm_dev_dbg(dev, state, "legacy LATE "); error = dev->bus->suspend_late(dev, state); suspend_report_result(dev->bus->suspend_late, error); } @@ -278,37 +565,30 @@ static int suspend_device_late(struct device *dev, pm_message_t state) /** * device_power_down - Shut down special devices. - * @state: Power state to enter. + * @state: PM transition of the system being carried out. * - * Power down devices that require interrupts to be disabled - * and move them from the dpm_off list to the dpm_off_irq list. + * Power down devices that require interrupts to be disabled. * Then power down system devices. * * Must be called with interrupts disabled and only one CPU running. */ int device_power_down(pm_message_t state) { + struct device *dev; int error = 0; - while (!list_empty(&dpm_off)) { - struct list_head *entry = dpm_off.prev; - struct device *dev = to_device(entry); - - error = suspend_device_late(dev, state); + list_for_each_entry_reverse(dev, &dpm_list, power.entry) { + error = suspend_device_noirq(dev, state); if (error) { - printk(KERN_ERR "Could not power down device %s: " - "error %d\n", - kobject_name(&dev->kobj), error); + pm_dev_err(dev, state, " late", error); break; } - if (!list_empty(&dev->power.entry)) - list_move(&dev->power.entry, &dpm_off_irq); + dev->power.status = DPM_OFF_IRQ; } - if (!error) error = sysdev_suspend(state); if (error) - dpm_power_up(); + dpm_power_up(resume_event(state)); return error; } EXPORT_SYMBOL_GPL(device_power_down); @@ -316,7 +596,7 @@ EXPORT_SYMBOL_GPL(device_power_down); /** * suspend_device - Save state of one device. * @dev: Device. - * @state: Power state device is entering. + * @state: PM transition of the system being carried out. */ static int suspend_device(struct device *dev, pm_message_t state) { @@ -324,24 +604,43 @@ static int suspend_device(struct device *dev, pm_message_t state) down(&dev->sem); - if (dev->class && dev->class->suspend) { - suspend_device_dbg(dev, state, "class "); - error = dev->class->suspend(dev, state); - suspend_report_result(dev->class->suspend, error); + if (dev->class) { + if (dev->class->pm) { + pm_dev_dbg(dev, state, "class "); + error = pm_op(dev, dev->class->pm, state); + } else if (dev->class->suspend) { + pm_dev_dbg(dev, state, "legacy class "); + error = dev->class->suspend(dev, state); + suspend_report_result(dev->class->suspend, error); + } + if (error) + goto End; } - if (!error && dev->type && dev->type->suspend) { - suspend_device_dbg(dev, state, "type "); - error = dev->type->suspend(dev, state); - suspend_report_result(dev->type->suspend, error); + if (dev->type) { + if (dev->type->pm) { + pm_dev_dbg(dev, state, "type "); + error = pm_op(dev, dev->type->pm, state); + } else if (dev->type->suspend) { + pm_dev_dbg(dev, state, "legacy type "); + error = dev->type->suspend(dev, state); + suspend_report_result(dev->type->suspend, error); + } + if (error) + goto End; } - if (!error && dev->bus && dev->bus->suspend) { - suspend_device_dbg(dev, state, ""); - error = dev->bus->suspend(dev, state); - suspend_report_result(dev->bus->suspend, error); + if (dev->bus) { + if (dev->bus->pm) { + pm_dev_dbg(dev, state, ""); + error = pm_op(dev, &dev->bus->pm->base, state); + } else if (dev->bus->suspend) { + pm_dev_dbg(dev, state, "legacy "); + error = dev->bus->suspend(dev, state); + suspend_report_result(dev->bus->suspend, error); + } } - + End: up(&dev->sem); return error; @@ -349,67 +648,141 @@ static int suspend_device(struct device *dev, pm_message_t state) /** * dpm_suspend - Suspend every device. - * @state: Power state to put each device in. + * @state: PM transition of the system being carried out. * - * Walk the dpm_locked list. Suspend each device and move it - * to the dpm_off list. - * - * (For historical reasons, if it returns -EAGAIN, that used to mean - * that the device would be called again with interrupts disabled. - * These days, we use the "suspend_late()" callback for that, so we - * print a warning and consider it an error). + * Execute the appropriate "suspend" callbacks for all devices. */ static int dpm_suspend(pm_message_t state) { + struct list_head list; int error = 0; + INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); - while (!list_empty(&dpm_active)) { - struct list_head *entry = dpm_active.prev; - struct device *dev = to_device(entry); - - WARN_ON(dev->parent && dev->parent->power.sleeping); + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.prev); - dev->power.sleeping = true; + get_device(dev); mutex_unlock(&dpm_list_mtx); + error = suspend_device(dev, state); + mutex_lock(&dpm_list_mtx); if (error) { - printk(KERN_ERR "Could not suspend device %s: " - "error %d%s\n", - kobject_name(&dev->kobj), - error, - (error == -EAGAIN ? - " (please convert to suspend_late)" : - "")); - dev->power.sleeping = false; + pm_dev_err(dev, state, "", error); + put_device(dev); break; } + dev->power.status = DPM_OFF; if (!list_empty(&dev->power.entry)) - list_move(&dev->power.entry, &dpm_off); + list_move(&dev->power.entry, &list); + put_device(dev); } - if (!error) - all_sleeping = true; + list_splice(&list, dpm_list.prev); mutex_unlock(&dpm_list_mtx); + return error; +} + +/** + * prepare_device - Execute the ->prepare() callback(s) for given device. + * @dev: Device. + * @state: PM transition of the system being carried out. + */ +static int prepare_device(struct device *dev, pm_message_t state) +{ + int error = 0; + + down(&dev->sem); + + if (dev->bus && dev->bus->pm && dev->bus->pm->base.prepare) { + pm_dev_dbg(dev, state, "preparing "); + error = dev->bus->pm->base.prepare(dev); + suspend_report_result(dev->bus->pm->base.prepare, error); + if (error) + goto End; + } + + if (dev->type && dev->type->pm && dev->type->pm->prepare) { + pm_dev_dbg(dev, state, "preparing type "); + error = dev->type->pm->prepare(dev); + suspend_report_result(dev->type->pm->prepare, error); + if (error) + goto End; + } + + if (dev->class && dev->class->pm && dev->class->pm->prepare) { + pm_dev_dbg(dev, state, "preparing class "); + error = dev->class->pm->prepare(dev); + suspend_report_result(dev->class->pm->prepare, error); + } + End: + up(&dev->sem); + + return error; +} +/** + * dpm_prepare - Prepare all devices for a PM transition. + * @state: PM transition of the system being carried out. + * + * Execute the ->prepare() callback for all devices. + */ +static int dpm_prepare(pm_message_t state) +{ + struct list_head list; + int error = 0; + + INIT_LIST_HEAD(&list); + mutex_lock(&dpm_list_mtx); + transition_started = true; + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.next); + + get_device(dev); + dev->power.status = DPM_PREPARING; + mutex_unlock(&dpm_list_mtx); + + error = prepare_device(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) { + dev->power.status = DPM_ON; + if (error == -EAGAIN) { + put_device(dev); + continue; + } + printk(KERN_ERR "PM: Failed to prepare device %s " + "for power transition: error %d\n", + kobject_name(&dev->kobj), error); + put_device(dev); + break; + } + dev->power.status = DPM_SUSPENDING; + if (!list_empty(&dev->power.entry)) + list_move_tail(&dev->power.entry, &list); + put_device(dev); + } + list_splice(&list, &dpm_list); + mutex_unlock(&dpm_list_mtx); return error; } /** * device_suspend - Save state and stop all devices in system. - * @state: new power management state + * @state: PM transition of the system being carried out. * - * Prevent new devices from being registered, then lock all devices - * and suspend them. + * Prepare and suspend all devices. */ int device_suspend(pm_message_t state) { int error; might_sleep(); - error = dpm_suspend(state); + error = dpm_prepare(state); + if (!error) + error = dpm_suspend(state); if (error) - device_resume(); + device_resume(resume_event(state)); return error; } EXPORT_SYMBOL_GPL(device_suspend); diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index a6894f2a4b99..a3252c0e2887 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -4,7 +4,7 @@ * main.c */ -extern struct list_head dpm_active; /* The active device list */ +extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 2b4b392dcbc1..8c1e656b5f8b 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -188,9 +188,9 @@ static int show_file_hash(unsigned int value) static int show_dev_hash(unsigned int value) { int match = 0; - struct list_head * entry = dpm_active.prev; + struct list_head *entry = dpm_list.prev; - while (entry != &dpm_active) { + while (entry != &dpm_list) { struct device * dev = to_device(entry); unsigned int hash = hash_string(DEVSEED, dev->bus_id, DEVHASH); if (hash == value) { diff --git a/include/linux/device.h b/include/linux/device.h index 6a2d04c011bc..f71a78d123ae 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -68,6 +68,8 @@ struct bus_type { int (*resume_early)(struct device *dev); int (*resume)(struct device *dev); + struct pm_ext_ops *pm; + struct bus_type_private *p; }; @@ -131,6 +133,8 @@ struct device_driver { int (*resume) (struct device *dev); struct attribute_group **groups; + struct pm_ops *pm; + struct driver_private *p; }; @@ -197,6 +201,8 @@ struct class { int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); + + struct pm_ops *pm; }; extern int __must_check class_register(struct class *class); @@ -248,8 +254,11 @@ struct device_type { struct attribute_group **groups; int (*uevent)(struct device *dev, struct kobj_uevent_env *env); void (*release)(struct device *dev); + int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); + + struct pm_ops *pm; }; /* interface for exporting device attributes */ diff --git a/include/linux/pm.h b/include/linux/pm.h index 39a7ee859b67..4ad9de94449a 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -112,7 +112,9 @@ typedef struct pm_message { int event; } pm_message_t; -/* +/** + * struct pm_ops - device PM callbacks + * * Several driver power state transitions are externally visible, affecting * the state of pending I/O queues and (for drivers that touch hardware) * interrupts, wakeups, DMA, and other hardware state. There may also be @@ -120,6 +122,284 @@ typedef struct pm_message { * to the rest of the driver stack (such as a driver that's ON gating off * clocks which are not in active use). * + * The externally visible transitions are handled with the help of the following + * callbacks included in this structure: + * + * @prepare: Prepare the device for the upcoming transition, but do NOT change + * its hardware state. Prevent new children of the device from being + * registered after @prepare() returns (the driver's subsystem and + * generally the rest of the kernel is supposed to prevent new calls to the + * probe method from being made too once @prepare() has succeeded). If + * @prepare() detects a situation it cannot handle (e.g. registration of a + * child already in progress), it may return -EAGAIN, so that the PM core + * can execute it once again (e.g. after the new child has been registered) + * to recover from the race condition. This method is executed for all + * kinds of suspend transitions and is followed by one of the suspend + * callbacks: @suspend(), @freeze(), or @poweroff(). + * The PM core executes @prepare() for all devices before starting to + * execute suspend callbacks for any of them, so drivers may assume all of + * the other devices to be present and functional while @prepare() is being + * executed. In particular, it is safe to make GFP_KERNEL memory + * allocations from within @prepare(). However, drivers may NOT assume + * anything about the availability of the user space at that time and it + * is not correct to request firmware from within @prepare() (it's too + * late to do that). [To work around this limitation, drivers may + * register suspend and hibernation notifiers that are executed before the + * freezing of tasks.] + * + * @complete: Undo the changes made by @prepare(). This method is executed for + * all kinds of resume transitions, following one of the resume callbacks: + * @resume(), @thaw(), @restore(). Also called if the state transition + * fails before the driver's suspend callback (@suspend(), @freeze(), + * @poweroff()) can be executed (e.g. if the suspend callback fails for one + * of the other devices that the PM core has unsuccessfully attempted to + * suspend earlier). + * The PM core executes @complete() after it has executed the appropriate + * resume callback for all devices. + * + * @suspend: Executed before putting the system into a sleep state in which the + * contents of main memory are preserved. Quiesce the device, put it into + * a low power state appropriate for the upcoming system state (such as + * PCI_D3hot), and enable wakeup events as appropriate. + * + * @resume: Executed after waking the system up from a sleep state in which the + * contents of main memory were preserved. Put the device into the + * appropriate state, according to the information saved in memory by the + * preceding @suspend(). The driver starts working again, responding to + * hardware events and software requests. The hardware may have gone + * through a power-off reset, or it may have maintained state from the + * previous suspend() which the driver may rely on while resuming. On most + * platforms, there are no restrictions on availability of resources like + * clocks during @resume(). + * + * @freeze: Hibernation-specific, executed before creating a hibernation image. + * Quiesce operations so that a consistent image can be created, but do NOT + * otherwise put the device into a low power device state and do NOT emit + * system wakeup events. Save in main memory the device settings to be + * used by @restore() during the subsequent resume from hibernation or by + * the subsequent @thaw(), if the creation of the image or the restoration + * of main memory contents from it fails. + * + * @thaw: Hibernation-specific, executed after creating a hibernation image OR + * if the creation of the image fails. Also executed after a failing + * attempt to restore the contents of main memory from such an image. + * Undo the changes made by the preceding @freeze(), so the device can be + * operated in the same way as immediately before the call to @freeze(). + * + * @poweroff: Hibernation-specific, executed after saving a hibernation image. + * Quiesce the device, put it into a low power state appropriate for the + * upcoming system state (such as PCI_D3hot), and enable wakeup events as + * appropriate. + * + * @restore: Hibernation-specific, executed after restoring the contents of main + * memory from a hibernation image. Driver starts working again, + * responding to hardware events and software requests. Drivers may NOT + * make ANY assumptions about the hardware state right prior to @restore(). + * On most platforms, there are no restrictions on availability of + * resources like clocks during @restore(). + * + * All of the above callbacks, except for @complete(), return error codes. + * However, the error codes returned by the resume operations, @resume(), + * @thaw(), and @restore(), do not cause the PM core to abort the resume + * transition during which they are returned. The error codes returned in + * that cases are only printed by the PM core to the system logs for debugging + * purposes. Still, it is recommended that drivers only return error codes + * from their resume methods in case of an unrecoverable failure (i.e. when the + * device being handled refuses to resume and becomes unusable) to allow us to + * modify the PM core in the future, so that it can avoid attempting to handle + * devices that failed to resume and their children. + * + * It is allowed to unregister devices while the above callbacks are being + * executed. However, it is not allowed to unregister a device from within any + * of its own callbacks. + */ + +struct pm_ops { + int (*prepare)(struct device *dev); + void (*complete)(struct device *dev); + int (*suspend)(struct device *dev); + int (*resume)(struct device *dev); + int (*freeze)(struct device *dev); + int (*thaw)(struct device *dev); + int (*poweroff)(struct device *dev); + int (*restore)(struct device *dev); +}; + +/** + * struct pm_ext_ops - extended device PM callbacks + * + * Some devices require certain operations related to suspend and hibernation + * to be carried out with interrupts disabled. Thus, 'struct pm_ext_ops' below + * is defined, adding callbacks to be executed with interrupts disabled to + * 'struct pm_ops'. + * + * The following callbacks included in 'struct pm_ext_ops' are executed with + * the nonboot CPUs switched off and with interrupts disabled on the only + * functional CPU. They also are executed with the PM core list of devices + * locked, so they must NOT unregister any devices. + * + * @suspend_noirq: Complete the operations of ->suspend() by carrying out any + * actions required for suspending the device that need interrupts to be + * disabled + * + * @resume_noirq: Prepare for the execution of ->resume() by carrying out any + * actions required for resuming the device that need interrupts to be + * disabled + * + * @freeze_noirq: Complete the operations of ->freeze() by carrying out any + * actions required for freezing the device that need interrupts to be + * disabled + * + * @thaw_noirq: Prepare for the execution of ->thaw() by carrying out any + * actions required for thawing the device that need interrupts to be + * disabled + * + * @poweroff_noirq: Complete the operations of ->poweroff() by carrying out any + * actions required for handling the device that need interrupts to be + * disabled + * + * @restore_noirq: Prepare for the execution of ->restore() by carrying out any + * actions required for restoring the operations of the device that need + * interrupts to be disabled + * + * All of the above callbacks return error codes, but the error codes returned + * by the resume operations, @resume_noirq(), @thaw_noirq(), and + * @restore_noirq(), do not cause the PM core to abort the resume transition + * during which they are returned. The error codes returned in that cases are + * only printed by the PM core to the system logs for debugging purposes. + * Still, as stated above, it is recommended that drivers only return error + * codes from their resume methods if the device being handled fails to resume + * and is not usable any more. + */ + +struct pm_ext_ops { + struct pm_ops base; + int (*suspend_noirq)(struct device *dev); + int (*resume_noirq)(struct device *dev); + int (*freeze_noirq)(struct device *dev); + int (*thaw_noirq)(struct device *dev); + int (*poweroff_noirq)(struct device *dev); + int (*restore_noirq)(struct device *dev); +}; + +/** + * PM_EVENT_ messages + * + * The following PM_EVENT_ messages are defined for the internal use of the PM + * core, in order to provide a mechanism allowing the high level suspend and + * hibernation code to convey the necessary information to the device PM core + * code: + * + * ON No transition. + * + * FREEZE System is going to hibernate, call ->prepare() and ->freeze() + * for all devices. + * + * SUSPEND System is going to suspend, call ->prepare() and ->suspend() + * for all devices. + * + * HIBERNATE Hibernation image has been saved, call ->prepare() and + * ->poweroff() for all devices. + * + * QUIESCE Contents of main memory are going to be restored from a (loaded) + * hibernation image, call ->prepare() and ->freeze() for all + * devices. + * + * RESUME System is resuming, call ->resume() and ->complete() for all + * devices. + * + * THAW Hibernation image has been created, call ->thaw() and + * ->complete() for all devices. + * + * RESTORE Contents of main memory have been restored from a hibernation + * image, call ->restore() and ->complete() for all devices. + * + * RECOVER Creation of a hibernation image or restoration of the main + * memory contents from a hibernation image has failed, call + * ->thaw() and ->complete() for all devices. + */ + +#define PM_EVENT_ON 0x0000 +#define PM_EVENT_FREEZE 0x0001 +#define PM_EVENT_SUSPEND 0x0002 +#define PM_EVENT_HIBERNATE 0x0004 +#define PM_EVENT_QUIESCE 0x0008 +#define PM_EVENT_RESUME 0x0010 +#define PM_EVENT_THAW 0x0020 +#define PM_EVENT_RESTORE 0x0040 +#define PM_EVENT_RECOVER 0x0080 + +#define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) + +#define PMSG_FREEZE ((struct pm_message){ .event = PM_EVENT_FREEZE, }) +#define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) +#define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) +#define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) +#define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) +#define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) +#define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) +#define PMSG_RECOVER ((struct pm_message){ .event = PM_EVENT_RECOVER, }) +#define PMSG_ON ((struct pm_message){ .event = PM_EVENT_ON, }) + +/** + * Device power management states + * + * These state labels are used internally by the PM core to indicate the current + * status of a device with respect to the PM core operations. + * + * DPM_ON Device is regarded as operational. Set this way + * initially and when ->complete() is about to be called. + * Also set when ->prepare() fails. + * + * DPM_PREPARING Device is going to be prepared for a PM transition. Set + * when ->prepare() is about to be called. + * + * DPM_RESUMING Device is going to be resumed. Set when ->resume(), + * ->thaw(), or ->restore() is about to be called. + * + * DPM_SUSPENDING Device has been prepared for a power transition. Set + * when ->prepare() has just succeeded. + * + * DPM_OFF Device is regarded as inactive. Set immediately after + * ->suspend(), ->freeze(), or ->poweroff() has succeeded. + * Also set when ->resume()_noirq, ->thaw_noirq(), or + * ->restore_noirq() is about to be called. + * + * DPM_OFF_IRQ Device is in a "deep sleep". Set immediately after + * ->suspend_noirq(), ->freeze_noirq(), or + * ->poweroff_noirq() has just succeeded. + */ + +enum dpm_state { + DPM_INVALID, + DPM_ON, + DPM_PREPARING, + DPM_RESUMING, + DPM_SUSPENDING, + DPM_OFF, + DPM_OFF_IRQ, +}; + +struct dev_pm_info { + pm_message_t power_state; + unsigned can_wakeup:1; + unsigned should_wakeup:1; + enum dpm_state status; /* Owned by the PM core */ +#ifdef CONFIG_PM_SLEEP + struct list_head entry; +#endif +}; + +/* + * The PM_EVENT_ messages are also used by drivers implementing the legacy + * suspend framework, based on the ->suspend() and ->resume() callbacks common + * for suspend and hibernation transitions, according to the rules below. + */ + +/* Necessary, because several drivers use PM_EVENT_PRETHAW */ +#define PM_EVENT_PRETHAW PM_EVENT_QUIESCE + +/* * One transition is triggered by resume(), after a suspend() call; the * message is implicit: * @@ -164,35 +444,13 @@ typedef struct pm_message { * or from system low-power states such as standby or suspend-to-RAM. */ -#define PM_EVENT_ON 0 -#define PM_EVENT_FREEZE 1 -#define PM_EVENT_SUSPEND 2 -#define PM_EVENT_HIBERNATE 4 -#define PM_EVENT_PRETHAW 8 - -#define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) - -#define PMSG_FREEZE ((struct pm_message){ .event = PM_EVENT_FREEZE, }) -#define PMSG_PRETHAW ((struct pm_message){ .event = PM_EVENT_PRETHAW, }) -#define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) -#define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) -#define PMSG_ON ((struct pm_message){ .event = PM_EVENT_ON, }) - -struct dev_pm_info { - pm_message_t power_state; - unsigned can_wakeup:1; - unsigned should_wakeup:1; - bool sleeping:1; /* Owned by the PM core */ -#ifdef CONFIG_PM_SLEEP - struct list_head entry; -#endif -}; +#ifdef CONFIG_PM_SLEEP +extern void device_pm_lock(void); +extern void device_power_up(pm_message_t state); +extern void device_resume(pm_message_t state); +extern void device_pm_unlock(void); extern int device_power_down(pm_message_t state); -extern void device_power_up(void); -extern void device_resume(void); - -#ifdef CONFIG_PM_SLEEP extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 14a656cdc652..d416be0efa8a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -193,6 +193,7 @@ static int create_image(int platform_mode) if (error) return error; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. @@ -224,9 +225,11 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - device_power_up(); + device_power_up(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode) Finish: platform_finish(platform_mode); Resume_devices: - device_resume(); + device_resume(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Resume_console: resume_console(); Close: @@ -300,8 +304,9 @@ static int resume_target_kernel(void) { int error; + device_pm_lock(); local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -329,9 +334,10 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - device_power_up(); + device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_PRETHAW); + error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - device_resume(); + device_resume(PMSG_RECOVER); Finish: resume_console(); pm_restore_console(); @@ -403,6 +409,7 @@ int hibernation_platform_enter(void) if (error) goto Finish; + device_pm_lock(); local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { @@ -411,6 +418,7 @@ int hibernation_platform_enter(void) while (1); } local_irq_enable(); + device_pm_unlock(); /* * We don't need to reenable the nonboot CPUs or resume consoles, since @@ -419,7 +427,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESTORE); Resume_console: resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524e..d023b6b584e5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state) { int error = 0; + device_pm_lock(); arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state) if (!suspend_test(TEST_CORE)) error = suspend_ops->enter(state); - device_power_up(); + device_power_up(PMSG_RESUME); Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + device_pm_unlock(); return error; } @@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESUME); Resume_console: resume_console(); Close: -- cgit v1.2.3 From 25e18499e08cb097cbbfeab5de25d094d5312ee5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 21 May 2008 01:40:43 +0200 Subject: Implement new suspend and hibernation callbacks for platform busses Implement new suspend and hibernation callbacks for the platform bus type. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Greg KH Signed-off-by: Jesse Barnes --- drivers/base/platform.c | 296 ++++++++++++++++++++++++++++++++++++++-- include/linux/platform_device.h | 1 + 2 files changed, 289 insertions(+), 8 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 911ec600fe71..3f940393d6c7 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -453,6 +453,8 @@ int platform_driver_register(struct platform_driver *drv) drv->driver.suspend = platform_drv_suspend; if (drv->resume) drv->driver.resume = platform_drv_resume; + if (drv->pm) + drv->driver.pm = &drv->pm->base; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(platform_driver_register); @@ -560,7 +562,9 @@ static int platform_match(struct device *dev, struct device_driver *drv) return (strncmp(pdev->name, drv->name, BUS_ID_SIZE) == 0); } -static int platform_suspend(struct device *dev, pm_message_t mesg) +#ifdef CONFIG_PM_SLEEP + +static int platform_legacy_suspend(struct device *dev, pm_message_t mesg) { int ret = 0; @@ -570,7 +574,7 @@ static int platform_suspend(struct device *dev, pm_message_t mesg) return ret; } -static int platform_suspend_late(struct device *dev, pm_message_t mesg) +static int platform_legacy_suspend_late(struct device *dev, pm_message_t mesg) { struct platform_driver *drv = to_platform_driver(dev->driver); struct platform_device *pdev; @@ -583,7 +587,7 @@ static int platform_suspend_late(struct device *dev, pm_message_t mesg) return ret; } -static int platform_resume_early(struct device *dev) +static int platform_legacy_resume_early(struct device *dev) { struct platform_driver *drv = to_platform_driver(dev->driver); struct platform_device *pdev; @@ -596,7 +600,7 @@ static int platform_resume_early(struct device *dev) return ret; } -static int platform_resume(struct device *dev) +static int platform_legacy_resume(struct device *dev) { int ret = 0; @@ -606,15 +610,291 @@ static int platform_resume(struct device *dev) return ret; } +static int platform_pm_prepare(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm && drv->pm->prepare) + ret = drv->pm->prepare(dev); + + return ret; +} + +static void platform_pm_complete(struct device *dev) +{ + struct device_driver *drv = dev->driver; + + if (drv && drv->pm && drv->pm->complete) + drv->pm->complete(dev); +} + +#ifdef CONFIG_SUSPEND + +static int platform_pm_suspend(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm) { + if (drv->pm->suspend) + ret = drv->pm->suspend(dev); + } else { + ret = platform_legacy_suspend(dev, PMSG_SUSPEND); + } + + return ret; +} + +static int platform_pm_suspend_noirq(struct device *dev) +{ + struct platform_driver *pdrv; + int ret = 0; + + if (!dev->driver) + return 0; + + pdrv = to_platform_driver(dev->driver); + if (pdrv->pm) { + if (pdrv->pm->suspend_noirq) + ret = pdrv->pm->suspend_noirq(dev); + } else { + ret = platform_legacy_suspend_late(dev, PMSG_SUSPEND); + } + + return ret; +} + +static int platform_pm_resume(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm) { + if (drv->pm->resume) + ret = drv->pm->resume(dev); + } else { + ret = platform_legacy_resume(dev); + } + + return ret; +} + +static int platform_pm_resume_noirq(struct device *dev) +{ + struct platform_driver *pdrv; + int ret = 0; + + if (!dev->driver) + return 0; + + pdrv = to_platform_driver(dev->driver); + if (pdrv->pm) { + if (pdrv->pm->resume_noirq) + ret = pdrv->pm->resume_noirq(dev); + } else { + ret = platform_legacy_resume_early(dev); + } + + return ret; +} + +#else /* !CONFIG_SUSPEND */ + +#define platform_pm_suspend NULL +#define platform_pm_resume NULL +#define platform_pm_suspend_noirq NULL +#define platform_pm_resume_noirq NULL + +#endif /* !CONFIG_SUSPEND */ + +#ifdef CONFIG_HIBERNATION + +static int platform_pm_freeze(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (!drv) + return 0; + + if (drv->pm) { + if (drv->pm->freeze) + ret = drv->pm->freeze(dev); + } else { + ret = platform_legacy_suspend(dev, PMSG_FREEZE); + } + + return ret; +} + +static int platform_pm_freeze_noirq(struct device *dev) +{ + struct platform_driver *pdrv; + int ret = 0; + + if (!dev->driver) + return 0; + + pdrv = to_platform_driver(dev->driver); + if (pdrv->pm) { + if (pdrv->pm->freeze_noirq) + ret = pdrv->pm->freeze_noirq(dev); + } else { + ret = platform_legacy_suspend_late(dev, PMSG_FREEZE); + } + + return ret; +} + +static int platform_pm_thaw(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm) { + if (drv->pm->thaw) + ret = drv->pm->thaw(dev); + } else { + ret = platform_legacy_resume(dev); + } + + return ret; +} + +static int platform_pm_thaw_noirq(struct device *dev) +{ + struct platform_driver *pdrv; + int ret = 0; + + if (!dev->driver) + return 0; + + pdrv = to_platform_driver(dev->driver); + if (pdrv->pm) { + if (pdrv->pm->thaw_noirq) + ret = pdrv->pm->thaw_noirq(dev); + } else { + ret = platform_legacy_resume_early(dev); + } + + return ret; +} + +static int platform_pm_poweroff(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm) { + if (drv->pm->poweroff) + ret = drv->pm->poweroff(dev); + } else { + ret = platform_legacy_suspend(dev, PMSG_HIBERNATE); + } + + return ret; +} + +static int platform_pm_poweroff_noirq(struct device *dev) +{ + struct platform_driver *pdrv; + int ret = 0; + + if (!dev->driver) + return 0; + + pdrv = to_platform_driver(dev->driver); + if (pdrv->pm) { + if (pdrv->pm->poweroff_noirq) + ret = pdrv->pm->poweroff_noirq(dev); + } else { + ret = platform_legacy_suspend_late(dev, PMSG_HIBERNATE); + } + + return ret; +} + +static int platform_pm_restore(struct device *dev) +{ + struct device_driver *drv = dev->driver; + int ret = 0; + + if (drv && drv->pm) { + if (drv->pm->restore) + ret = drv->pm->restore(dev); + } else { + ret = platform_legacy_resume(dev); + } + + return ret; +} + +static int platform_pm_restore_noirq(struct device *dev) +{ + struct platform_driver *pdrv; + int ret = 0; + + if (!dev->driver) + return 0; + + pdrv = to_platform_driver(dev->driver); + if (pdrv->pm) { + if (pdrv->pm->restore_noirq) + ret = pdrv->pm->restore_noirq(dev); + } else { + ret = platform_legacy_resume_early(dev); + } + + return ret; +} + +#else /* !CONFIG_HIBERNATION */ + +#define platform_pm_freeze NULL +#define platform_pm_thaw NULL +#define platform_pm_poweroff NULL +#define platform_pm_restore NULL +#define platform_pm_freeze_noirq NULL +#define platform_pm_thaw_noirq NULL +#define platform_pm_poweroff_noirq NULL +#define platform_pm_restore_noirq NULL + +#endif /* !CONFIG_HIBERNATION */ + +struct pm_ext_ops platform_pm_ops = { + .base = { + .prepare = platform_pm_prepare, + .complete = platform_pm_complete, + .suspend = platform_pm_suspend, + .resume = platform_pm_resume, + .freeze = platform_pm_freeze, + .thaw = platform_pm_thaw, + .poweroff = platform_pm_poweroff, + .restore = platform_pm_restore, + }, + .suspend_noirq = platform_pm_suspend_noirq, + .resume_noirq = platform_pm_resume_noirq, + .freeze_noirq = platform_pm_freeze_noirq, + .thaw_noirq = platform_pm_thaw_noirq, + .poweroff_noirq = platform_pm_poweroff_noirq, + .restore_noirq = platform_pm_restore_noirq, +}; + +#define PLATFORM_PM_OPS_PTR &platform_pm_ops + +#else /* !CONFIG_PM_SLEEP */ + +#define PLATFORM_PM_OPS_PTR NULL + +#endif /* !CONFIG_PM_SLEEP */ + struct bus_type platform_bus_type = { .name = "platform", .dev_attrs = platform_dev_attrs, .match = platform_match, .uevent = platform_uevent, - .suspend = platform_suspend, - .suspend_late = platform_suspend_late, - .resume_early = platform_resume_early, - .resume = platform_resume, + .pm = PLATFORM_PM_OPS_PTR, }; EXPORT_SYMBOL_GPL(platform_bus_type); diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 3261681c82a4..95ac21ab3a09 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -53,6 +53,7 @@ struct platform_driver { int (*suspend_late)(struct platform_device *, pm_message_t state); int (*resume_early)(struct platform_device *); int (*resume)(struct platform_device *); + struct pm_ext_ops *pm; struct device_driver driver; }; -- cgit v1.2.3 From d8f3de0d2412bb91639cfefc5b3c79dbf3812212 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Jun 2008 23:24:06 +0200 Subject: Suspend-related patches for 2.6.27 ACPI PM: Add possibility to change suspend sequence There are some systems out there that don't work correctly with our current suspend/hibernation code ordering. Provide a workaround for these systems allowing them to pass 'acpi_sleep=old_ordering' in the kernel command line so that it will use the pre-ACPI 2.0 ("old") suspend code ordering. Unfortunately, this requires us to add a platform hook to the resuming of devices for recovering the platform in case one of the device drivers' .suspend() routines returns error code. Namely, ACPI 1.0 specifies that _PTS should be called before suspending devices, but _WAK still should be called before resuming them in order to undo the changes made by _PTS. However, if there is an error during suspending devices, they are automatically resumed without returning control to the PM core, so the _WAK has to be called from within device_resume() in that cases. The patch also reorders and refactors the ACPI suspend/hibernation code to avoid duplication as far as reasonably possible. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 6 +- arch/x86/kernel/acpi/sleep.c | 2 + drivers/acpi/sleep/main.c | 276 +++++++++++++++++++++--------------- drivers/base/power/main.c | 2 - include/linux/acpi.h | 3 + include/linux/suspend.h | 14 +- kernel/power/disk.c | 28 +++- kernel/power/main.c | 10 +- 8 files changed, 215 insertions(+), 126 deletions(-) (limited to 'drivers/base') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9cf7b34f2db0..18d793ea0dd3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -147,10 +147,14 @@ and is between 256 and 4096 characters. It is defined in the file default: 0 acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep } + Format: { s3_bios, s3_mode, s3_beep, old_ordering } See Documentation/power/video.txt for s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. + old_ordering causes the ACPI 1.0 ordering of the _PTS + control method, wrt putting devices into low power + states, to be enforced (the ACPI 2.0 ordering of _PTS is + used by default). acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index afc25ee9964b..882e970032d5 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -124,6 +124,8 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 0f2caea2fc83..4addf8ad50ae 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -24,10 +24,6 @@ u8 sleep_states[ACPI_S_STATE_COUNT]; -#ifdef CONFIG_PM_SLEEP -static u32 acpi_target_sleep_state = ACPI_STATE_S0; -#endif - static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP @@ -50,9 +46,96 @@ static int acpi_sleep_prepare(u32 acpi_state) return 0; } -#ifdef CONFIG_SUSPEND -static struct platform_suspend_ops acpi_suspend_ops; +#ifdef CONFIG_PM_SLEEP +static u32 acpi_target_sleep_state = ACPI_STATE_S0; + +/* + * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the + * user to request that behavior by using the 'acpi_old_suspend_ordering' + * kernel command line option that causes the following variable to be set. + */ +static bool old_suspend_ordering; + +void __init acpi_old_suspend_ordering(void) +{ + old_suspend_ordering = true; +} + +/** + * acpi_pm_disable_gpes - Disable the GPEs. + */ +static int acpi_pm_disable_gpes(void) +{ + acpi_hw_disable_all_gpes(); + return 0; +} + +/** + * __acpi_pm_prepare - Prepare the platform to enter the target state. + * + * If necessary, set the firmware waking vector and do arch-specific + * nastiness to get the wakeup code to the waking vector. + */ +static int __acpi_pm_prepare(void) +{ + int error = acpi_sleep_prepare(acpi_target_sleep_state); + + if (error) + acpi_target_sleep_state = ACPI_STATE_S0; + return error; +} + +/** + * acpi_pm_prepare - Prepare the platform to enter the target sleep + * state and disable the GPEs. + */ +static int acpi_pm_prepare(void) +{ + int error = __acpi_pm_prepare(); + + if (!error) + acpi_hw_disable_all_gpes(); + return error; +} + +/** + * acpi_pm_finish - Instruct the platform to leave a sleep state. + * + * This is called after we wake back up (or if entering the sleep state + * failed). + */ +static void acpi_pm_finish(void) +{ + u32 acpi_state = acpi_target_sleep_state; + + if (acpi_state == ACPI_STATE_S0) + return; + printk(KERN_INFO PREFIX "Waking up from system sleep state S%d\n", + acpi_state); + acpi_disable_wakeup_device(acpi_state); + acpi_leave_sleep_state(acpi_state); + + /* reset firmware waking vector */ + acpi_set_firmware_waking_vector((acpi_physical_address) 0); + + acpi_target_sleep_state = ACPI_STATE_S0; +} + +/** + * acpi_pm_end - Finish up suspend sequence. + */ +static void acpi_pm_end(void) +{ + /* + * This is necessary in case acpi_pm_finish() is not called during a + * failing transition to a sleep state. + */ + acpi_target_sleep_state = ACPI_STATE_S0; +} +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_SUSPEND extern void do_suspend_lowlevel(void); static u32 acpi_suspend_states[] = { @@ -66,7 +149,6 @@ static u32 acpi_suspend_states[] = { * acpi_suspend_begin - Set the target system sleep state to the state * associated with given @pm_state, if supported. */ - static int acpi_suspend_begin(suspend_state_t pm_state) { u32 acpi_state = acpi_suspend_states[pm_state]; @@ -82,25 +164,6 @@ static int acpi_suspend_begin(suspend_state_t pm_state) return error; } -/** - * acpi_suspend_prepare - Do preliminary suspend work. - * - * If necessary, set the firmware waking vector and do arch-specific - * nastiness to get the wakeup code to the waking vector. - */ - -static int acpi_suspend_prepare(void) -{ - int error = acpi_sleep_prepare(acpi_target_sleep_state); - - if (error) { - acpi_target_sleep_state = ACPI_STATE_S0; - return error; - } - - return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT; -} - /** * acpi_suspend_enter - Actually enter a sleep state. * @pm_state: ignored @@ -109,7 +172,6 @@ static int acpi_suspend_prepare(void) * assembly, which in turn call acpi_enter_sleep_state(). * It's unfortunate, but it works. Please fix if you're feeling frisky. */ - static int acpi_suspend_enter(suspend_state_t pm_state) { acpi_status status = AE_OK; @@ -166,39 +228,6 @@ static int acpi_suspend_enter(suspend_state_t pm_state) return ACPI_SUCCESS(status) ? 0 : -EFAULT; } -/** - * acpi_suspend_finish - Instruct the platform to leave a sleep state. - * - * This is called after we wake back up (or if entering the sleep state - * failed). - */ - -static void acpi_suspend_finish(void) -{ - u32 acpi_state = acpi_target_sleep_state; - - acpi_disable_wakeup_device(acpi_state); - acpi_leave_sleep_state(acpi_state); - - /* reset firmware waking vector */ - acpi_set_firmware_waking_vector((acpi_physical_address) 0); - - acpi_target_sleep_state = ACPI_STATE_S0; -} - -/** - * acpi_suspend_end - Finish up suspend sequence. - */ - -static void acpi_suspend_end(void) -{ - /* - * This is necessary in case acpi_suspend_finish() is not called during a - * failing transition to a sleep state. - */ - acpi_target_sleep_state = ACPI_STATE_S0; -} - static int acpi_suspend_state_valid(suspend_state_t pm_state) { u32 acpi_state; @@ -218,10 +247,39 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state) static struct platform_suspend_ops acpi_suspend_ops = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin, - .prepare = acpi_suspend_prepare, + .prepare = acpi_pm_prepare, + .enter = acpi_suspend_enter, + .finish = acpi_pm_finish, + .end = acpi_pm_end, +}; + +/** + * acpi_suspend_begin_old - Set the target system sleep state to the + * state associated with given @pm_state, if supported, and + * execute the _PTS control method. This function is used if the + * pre-ACPI 2.0 suspend ordering has been requested. + */ +static int acpi_suspend_begin_old(suspend_state_t pm_state) +{ + int error = acpi_suspend_begin(pm_state); + + if (!error) + error = __acpi_pm_prepare(); + return error; +} + +/* + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. + */ +static struct platform_suspend_ops acpi_suspend_ops_old = { + .valid = acpi_suspend_state_valid, + .begin = acpi_suspend_begin_old, + .prepare = acpi_pm_disable_gpes, .enter = acpi_suspend_enter, - .finish = acpi_suspend_finish, - .end = acpi_suspend_end, + .finish = acpi_pm_finish, + .end = acpi_pm_end, + .recover = acpi_pm_finish, }; #endif /* CONFIG_SUSPEND */ @@ -229,22 +287,9 @@ static struct platform_suspend_ops acpi_suspend_ops = { static int acpi_hibernation_begin(void) { acpi_target_sleep_state = ACPI_STATE_S4; - return 0; } -static int acpi_hibernation_prepare(void) -{ - int error = acpi_sleep_prepare(ACPI_STATE_S4); - - if (error) { - acpi_target_sleep_state = ACPI_STATE_S0; - return error; - } - - return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT; -} - static int acpi_hibernation_enter(void) { acpi_status status = AE_OK; @@ -274,52 +319,55 @@ static void acpi_hibernation_leave(void) acpi_leave_sleep_state_prep(ACPI_STATE_S4); } -static void acpi_hibernation_finish(void) +static void acpi_pm_enable_gpes(void) { - acpi_disable_wakeup_device(ACPI_STATE_S4); - acpi_leave_sleep_state(ACPI_STATE_S4); - - /* reset firmware waking vector */ - acpi_set_firmware_waking_vector((acpi_physical_address) 0); - - acpi_target_sleep_state = ACPI_STATE_S0; + acpi_hw_enable_all_runtime_gpes(); } -static void acpi_hibernation_end(void) -{ - /* - * This is necessary in case acpi_hibernation_finish() is not called - * during a failing transition to the sleep state. - */ - acpi_target_sleep_state = ACPI_STATE_S0; -} +static struct platform_hibernation_ops acpi_hibernation_ops = { + .begin = acpi_hibernation_begin, + .end = acpi_pm_end, + .pre_snapshot = acpi_pm_prepare, + .finish = acpi_pm_finish, + .prepare = acpi_pm_prepare, + .enter = acpi_hibernation_enter, + .leave = acpi_hibernation_leave, + .pre_restore = acpi_pm_disable_gpes, + .restore_cleanup = acpi_pm_enable_gpes, +}; -static int acpi_hibernation_pre_restore(void) +/** + * acpi_hibernation_begin_old - Set the target system sleep state to + * ACPI_STATE_S4 and execute the _PTS control method. This + * function is used if the pre-ACPI 2.0 suspend ordering has been + * requested. + */ +static int acpi_hibernation_begin_old(void) { - acpi_status status; - - status = acpi_hw_disable_all_gpes(); - - return ACPI_SUCCESS(status) ? 0 : -EFAULT; -} + int error = acpi_sleep_prepare(ACPI_STATE_S4); -static void acpi_hibernation_restore_cleanup(void) -{ - acpi_hw_enable_all_runtime_gpes(); + if (!error) + acpi_target_sleep_state = ACPI_STATE_S4; + return error; } -static struct platform_hibernation_ops acpi_hibernation_ops = { - .begin = acpi_hibernation_begin, - .end = acpi_hibernation_end, - .pre_snapshot = acpi_hibernation_prepare, - .finish = acpi_hibernation_finish, - .prepare = acpi_hibernation_prepare, +/* + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. + */ +static struct platform_hibernation_ops acpi_hibernation_ops_old = { + .begin = acpi_hibernation_begin_old, + .end = acpi_pm_end, + .pre_snapshot = acpi_pm_disable_gpes, + .finish = acpi_pm_finish, + .prepare = acpi_pm_disable_gpes, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, - .pre_restore = acpi_hibernation_pre_restore, - .restore_cleanup = acpi_hibernation_restore_cleanup, + .pre_restore = acpi_pm_disable_gpes, + .restore_cleanup = acpi_pm_enable_gpes, + .recover = acpi_pm_finish, }; -#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_HIBERNATION */ int acpi_suspend(u32 acpi_state) { @@ -461,13 +509,15 @@ int __init acpi_sleep_init(void) } } - suspend_set_ops(&acpi_suspend_ops); + suspend_set_ops(old_suspend_ordering ? + &acpi_suspend_ops_old : &acpi_suspend_ops); #endif #ifdef CONFIG_HIBERNATION status = acpi_get_sleep_type_data(ACPI_STATE_S4, &type_a, &type_b); if (ACPI_SUCCESS(status)) { - hibernation_set_ops(&acpi_hibernation_ops); + hibernation_set_ops(old_suspend_ordering ? + &acpi_hibernation_ops_old : &acpi_hibernation_ops); sleep_states[ACPI_STATE_S4] = 1; printk(" S4"); } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d571204aaff7..3250c5257b74 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -781,8 +781,6 @@ int device_suspend(pm_message_t state) error = dpm_prepare(state); if (!error) error = dpm_suspend(state); - if (error) - device_resume(resume_event(state)); return error; } EXPORT_SYMBOL_GPL(device_suspend); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 41f7ce7edd7a..33adcf91ef41 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -234,6 +234,9 @@ int acpi_check_region(resource_size_t start, resource_size_t n, int acpi_check_mem_region(resource_size_t start, resource_size_t n, const char *name); +#ifdef CONFIG_PM_SLEEP +void __init acpi_old_suspend_ordering(void); +#endif /* CONFIG_PM_SLEEP */ #else /* CONFIG_ACPI */ static inline int early_acpi_boot_init(void) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index a6977423baf7..e8e69159af71 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -86,6 +86,11 @@ typedef int __bitwise suspend_state_t; * that implement @begin(), but platforms implementing @begin() should * also provide a @end() which cleans up transitions aborted before * @enter(). + * + * @recover: Recover the platform from a suspend failure. + * Called by the PM core if the suspending of devices fails. + * This callback is optional and should only be implemented by platforms + * which require special recovery actions in that situation. */ struct platform_suspend_ops { int (*valid)(suspend_state_t state); @@ -94,6 +99,7 @@ struct platform_suspend_ops { int (*enter)(suspend_state_t state); void (*finish)(void); void (*end)(void); + void (*recover)(void); }; #ifdef CONFIG_SUSPEND @@ -149,7 +155,7 @@ extern void mark_free_pages(struct zone *zone); * The methods in this structure allow a platform to carry out special * operations required by it during a hibernation transition. * - * All the methods below must be implemented. + * All the methods below, except for @recover(), must be implemented. * * @begin: Tell the platform driver that we're starting hibernation. * Called right after shrinking memory and before freezing devices. @@ -189,6 +195,11 @@ extern void mark_free_pages(struct zone *zone); * @restore_cleanup: Clean up after a failing image restoration. * Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). + * + * @recover: Recover the platform from a failure to suspend devices. + * Called by the PM core if the suspending of devices during hibernation + * fails. This callback is optional and should only be implemented by + * platforms which require special recovery actions in that situation. */ struct platform_hibernation_ops { int (*begin)(void); @@ -200,6 +211,7 @@ struct platform_hibernation_ops { void (*leave)(void); int (*pre_restore)(void); void (*restore_cleanup)(void); + void (*recover)(void); }; #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d416be0efa8a..f011e0870b52 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -179,6 +179,17 @@ static void platform_restore_cleanup(int platform_mode) hibernation_ops->restore_cleanup(); } +/** + * platform_recover - recover the platform from a failure to suspend + * devices. + */ + +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + /** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control @@ -258,10 +269,10 @@ int hibernation_snapshot(int platform_mode) suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) - goto Resume_console; + goto Recover_platform; if (hibernation_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) @@ -285,11 +296,14 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Resume_console: resume_console(); Close: platform_end(platform_mode); return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; } /** @@ -398,8 +412,11 @@ int hibernation_platform_enter(void) suspend_console(); error = device_suspend(PMSG_HIBERNATE); - if (error) - goto Resume_console; + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } error = hibernation_ops->prepare(); if (error) @@ -428,7 +445,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - Resume_console: resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index d023b6b584e5..3398f4651aa1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state) error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_console; + goto Recover_platform; } if (suspend_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; if (suspend_ops->prepare) { error = suspend_ops->prepare(); @@ -294,12 +294,16 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_ops->finish(); Resume_devices: device_resume(PMSG_RESUME); - Resume_console: resume_console(); Close: if (suspend_ops->end) suspend_ops->end(); return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; } /** -- cgit v1.2.3 From c50cbb05a05cf1f9ca3592272eff053c847727d8 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 4 Jun 2008 21:47:29 -0700 Subject: cpu topology: always define CPU topology information This can result in an empty topology directory in sysfs, and requires in-kernel users to protect all uses with #ifdef - see . The documentation of CPU topology specifies what the defaults should be if only partial information is available from the hardware. So we can provide these defaults as a fallback. This patch: - Adds default definitions of the 4 topology macros to - Changes drivers/base/topology.c to use the topology macros unconditionally and to cope with definitions that aren't lvalues - Updates documentation accordingly [ From: Andrew Morton - fold now-duplicated code - fix layout ] Signed-off-by: Ben Hutchings Cc: Vegard Nossum Cc: Nick Piggin Cc: Chandra Seetharaman Cc: Suresh Siddha Cc: Mike Travis Cc: Christoph Lameter Cc: John Hawkes Cc: Zhang, Yanmin Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- Documentation/cputopology.txt | 26 +++++++++----------------- drivers/base/topology.c | 38 ++++++++++---------------------------- include/linux/topology.h | 13 +++++++++++++ 3 files changed, 32 insertions(+), 45 deletions(-) (limited to 'drivers/base') diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index b61cb9564023..bd699da24666 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -14,9 +14,8 @@ represent the thread siblings to cpu X in the same physical package; To implement it in an architecture-neutral way, a new source file, drivers/base/topology.c, is to export the 4 attributes. -If one architecture wants to support this feature, it just needs to -implement 4 defines, typically in file include/asm-XXX/topology.h. -The 4 defines are: +For an architecture to support this feature, it must define some of +these macros in include/asm-XXX/topology.h: #define topology_physical_package_id(cpu) #define topology_core_id(cpu) #define topology_thread_siblings(cpu) @@ -25,17 +24,10 @@ The 4 defines are: The type of **_id is int. The type of siblings is cpumask_t. -To be consistent on all architectures, the 4 attributes should have -default values if their values are unavailable. Below is the rule. -1) physical_package_id: If cpu has no physical package id, -1 is the -default value. -2) core_id: If cpu doesn't support multi-core, its core id is 0. -3) thread_siblings: Just include itself, if the cpu doesn't support -HT/multi-thread. -4) core_siblings: Just include itself, if the cpu doesn't support -multi-core and HT/Multi-thread. - -So be careful when declaring the 4 defines in include/asm-XXX/topology.h. - -If an attribute isn't defined on an architecture, it won't be exported. - +To be consistent on all architectures, include/linux/topology.h +provides default definitions for any of the above macros that are +not defined by include/asm-XXX/topology.h: +1) physical_package_id: -1 +2) core_id: 0 +3) thread_siblings: just the given CPU +4) core_siblings: just the given CPU diff --git a/drivers/base/topology.c b/drivers/base/topology.c index fdf4044d2e74..24d29a9fc25b 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -59,60 +59,42 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - return show_cpumap(0, &(topology_##name(cpu)), buf); \ + cpumask_t siblings = topology_##name(cpu); \ + return show_cpumap(0, &siblings, buf); \ } #define define_siblings_show_list(name) \ static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - return show_cpumap(1, &(topology_##name(cpu)), buf); \ + cpumask_t siblings = topology_##name(cpu); \ + return show_cpumap(1, &siblings, buf); \ } #define define_siblings_show_func(name) \ define_siblings_show_map(name); define_siblings_show_list(name) -#ifdef topology_physical_package_id define_id_show_func(physical_package_id); define_one_ro(physical_package_id); -#define ref_physical_package_id_attr &attr_physical_package_id.attr, -#else -#define ref_physical_package_id_attr -#endif -#ifdef topology_core_id define_id_show_func(core_id); define_one_ro(core_id); -#define ref_core_id_attr &attr_core_id.attr, -#else -#define ref_core_id_attr -#endif -#ifdef topology_thread_siblings define_siblings_show_func(thread_siblings); define_one_ro(thread_siblings); define_one_ro(thread_siblings_list); -#define ref_thread_siblings_attr \ - &attr_thread_siblings.attr, &attr_thread_siblings_list.attr, -#else -#define ref_thread_siblings_attr -#endif -#ifdef topology_core_siblings define_siblings_show_func(core_siblings); define_one_ro(core_siblings); define_one_ro(core_siblings_list); -#define ref_core_siblings_attr \ - &attr_core_siblings.attr, &attr_core_siblings_list.attr, -#else -#define ref_core_siblings_attr -#endif static struct attribute *default_attrs[] = { - ref_physical_package_id_attr - ref_core_id_attr - ref_thread_siblings_attr - ref_core_siblings_attr + &attr_physical_package_id.attr, + &attr_core_id.attr, + &attr_thread_siblings.attr, + &attr_thread_siblings_list.attr, + &attr_core_siblings.attr, + &attr_core_siblings_list.attr, NULL }; diff --git a/include/linux/topology.h b/include/linux/topology.h index 24f3d2282e11..2158fc0d5a56 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -179,4 +179,17 @@ void arch_update_cpu_topology(void); #endif #endif /* CONFIG_NUMA */ +#ifndef topology_physical_package_id +#define topology_physical_package_id(cpu) ((void)(cpu), -1) +#endif +#ifndef topology_core_id +#define topology_core_id(cpu) ((void)(cpu), 0) +#endif +#ifndef topology_thread_siblings +#define topology_thread_siblings(cpu) cpumask_of_cpu(cpu) +#endif +#ifndef topology_core_siblings +#define topology_core_siblings(cpu) cpumask_of_cpu(cpu) +#endif + #endif /* _LINUX_TOPOLOGY_H */ -- cgit v1.2.3 From 131b943ae643b1ad6febd67cdbb31d955706ecf4 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 5 Jun 2008 17:37:15 +0100 Subject: cputopology: always define CPU topology information, clean up simplify drivers/base/topology.c a bit. Signed-off-by: Ben Hutchings Signed-off-by: Ingo Molnar --- drivers/base/topology.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/topology.c b/drivers/base/topology.c index 24d29a9fc25b..f0cb27011930 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -59,16 +59,14 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - cpumask_t siblings = topology_##name(cpu); \ - return show_cpumap(0, &siblings, buf); \ + return show_cpumap(0, &(topology_##name(cpu)), buf); \ } #define define_siblings_show_list(name) \ static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - cpumask_t siblings = topology_##name(cpu); \ - return show_cpumap(1, &siblings, buf); \ + return show_cpumap(1, &(topology_##name(cpu)), buf); \ } #define define_siblings_show_func(name) \ -- cgit v1.2.3 From 2d5c1be8870383622809c25935fff00d2630c7a5 Mon Sep 17 00:00:00 2001 From: John Blackwood Date: Fri, 4 Jul 2008 10:00:05 -0700 Subject: mm: switch node meminfo Active & Inactive pages to Kbytes There is a bug in the output of /sys/devices/system/node/node[n]/meminfo where the Active and Inactive values are in pages instead of Kbytes. Looks like this occurred back in 2.6.20 when the code was changed over to use node_page_state(). Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/node.c b/drivers/base/node.c index 39f3d1b3a213..0f867a083338 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -84,8 +84,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), - nid, node_page_state(nid, NR_ACTIVE), - nid, node_page_state(nid, NR_INACTIVE), + nid, K(node_page_state(nid, NR_ACTIVE)), + nid, K(node_page_state(nid, NR_INACTIVE)), #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), -- cgit v1.2.3 From eb9d0fe40e313c0a74115ef456a2e43a6c8da72f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 7 Jul 2008 03:34:48 +0200 Subject: PCI ACPI: Rework PCI handling of wake-up * Introduce function acpi_pm_device_sleep_wake() for enabling and disabling the system wake-up capability of devices that are power manageable by ACPI. * Introduce function acpi_bus_can_wakeup() allowing other (dependent) subsystems to check if ACPI is able to enable the system wake-up capability of given device. * Introduce callback .sleep_wake() in struct pci_platform_pm_ops and for the ACPI PCI 'driver' make it use acpi_pm_device_sleep_wake(). * Introduce callback .can_wakeup() in struct pci_platform_pm_ops and for the ACPI 'driver' make it use acpi_bus_can_wakeup(). * Move the PME# handlig code out of pci_enable_wake() and split it into two functions, pci_pme_capable() and pci_pme_active(), allowing the caller to check if given device is capable of generating PME# from given power state and to enable/disable the device's PME# functionality, respectively. * Modify pci_enable_wake() to use the new ACPI callbacks and the new PME#-related functions. * Drop the generic .platform_enable_wakeup() callback that is not used any more. * Introduce device_set_wakeup_capable() that will set the power.can_wakeup flag of given device. * Rework PCI device PM initialization so that, if given device is capable of generating wake-up events, either natively through the PME# mechanism, or with the help of the platform, its power.can_wakeup flag is set and its power.should_wakeup flag is unset as appropriate. * Make ACPI set the power.can_wakeup flag for devices found to be wake-up capable by it. * Make the ACPI wake-up code enable/disable GPEs for devices that have the wakeup.flags.prepared flag set (which means that their wake-up power has been enabled). Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/acpi/bus.c | 11 +++ drivers/acpi/glue.c | 2 + drivers/acpi/sleep/main.c | 25 +++++++ drivers/acpi/sleep/wakeup.c | 11 +-- drivers/base/power/sysfs.c | 3 - drivers/pci/pci-acpi.c | 20 ++++++ drivers/pci/pci.c | 169 ++++++++++++++++++++++++++++++++------------ drivers/pci/pci.h | 8 +++ drivers/pci/probe.c | 47 +----------- include/acpi/acpi_bus.h | 2 + include/linux/pm_wakeup.h | 26 ++----- 11 files changed, 208 insertions(+), 116 deletions(-) (limited to 'drivers/base') diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index b9b69d9629b5..fc1110d6a078 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -306,6 +306,17 @@ bool acpi_bus_power_manageable(acpi_handle handle) EXPORT_SYMBOL(acpi_bus_power_manageable); +bool acpi_bus_can_wakeup(acpi_handle handle) +{ + struct acpi_device *device; + int result; + + result = acpi_bus_get_device(handle, &device); + return result ? false : device->wakeup.flags.valid; +} + +EXPORT_SYMBOL(acpi_bus_can_wakeup); + /* -------------------------------------------------------------------------- Event Management -------------------------------------------------------------------------- */ diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 06f8634fe58b..87c5d456e180 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -166,6 +166,8 @@ static int acpi_bind_one(struct device *dev, acpi_handle handle) "firmware_node"); ret = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, "physical_node"); + if (acpi_dev->wakeup.flags.valid) + device_set_wakeup_capable(dev, true); } return 0; diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 4addf8ad50ae..af7f4663deaa 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -468,6 +468,31 @@ int acpi_pm_device_sleep_state(struct device *dev, int *d_min_p) *d_min_p = d_min; return d_max; } + +/** + * acpi_pm_device_sleep_wake - enable or disable the system wake-up + * capability of given device + * @dev: device to handle + * @enable: 'true' - enable, 'false' - disable the wake-up capability + */ +int acpi_pm_device_sleep_wake(struct device *dev, bool enable) +{ + acpi_handle handle; + struct acpi_device *adev; + + if (!device_may_wakeup(dev)) + return -EINVAL; + + handle = DEVICE_ACPI_HANDLE(dev); + if (!handle || ACPI_FAILURE(acpi_bus_get_device(handle, &adev))) { + printk(KERN_DEBUG "ACPI handle has no context!\n"); + return -ENODEV; + } + + return enable ? + acpi_enable_wakeup_device_power(adev, acpi_target_sleep_state) : + acpi_disable_wakeup_device_power(adev); +} #endif static void acpi_power_off_prepare(void) diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/sleep/wakeup.c index 7422a2213944..38655eb132dc 100644 --- a/drivers/acpi/sleep/wakeup.c +++ b/drivers/acpi/sleep/wakeup.c @@ -66,13 +66,15 @@ void acpi_enable_wakeup_device(u8 sleep_state) list_for_each_safe(node, next, &acpi_wakeup_device_list) { struct acpi_device *dev = container_of(node, struct acpi_device, wakeup_list); + if (!dev->wakeup.flags.valid) continue; + /* If users want to disable run-wake GPE, * we only disable it for wake and leave it for runtime */ - if (!dev->wakeup.state.enabled || - sleep_state > (u32) dev->wakeup.sleep_state) { + if ((!dev->wakeup.state.enabled && !dev->wakeup.flags.prepared) + || sleep_state > (u32) dev->wakeup.sleep_state) { if (dev->wakeup.flags.run_wake) { spin_unlock(&acpi_device_lock); /* set_gpe_type will disable GPE, leave it like that */ @@ -110,8 +112,9 @@ void acpi_disable_wakeup_device(u8 sleep_state) if (!dev->wakeup.flags.valid) continue; - if (!dev->wakeup.state.enabled || - sleep_state > (u32) dev->wakeup.sleep_state) { + + if ((!dev->wakeup.state.enabled && !dev->wakeup.flags.prepared) + || sleep_state > (u32) dev->wakeup.sleep_state) { if (dev->wakeup.flags.run_wake) { spin_unlock(&acpi_device_lock); acpi_set_gpe_type(dev->wakeup.gpe_device, diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index d11f74b038db..596aeecfdffe 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -6,9 +6,6 @@ #include #include "power.h" -int (*platform_enable_wakeup)(struct device *dev, int is_on); - - /* * wakeup - Report/change current wakeup option for device * diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 6bc0d8c870af..7764768b6a0e 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -299,10 +299,30 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) return error; } +static bool acpi_pci_can_wakeup(struct pci_dev *dev) +{ + acpi_handle handle = DEVICE_ACPI_HANDLE(&dev->dev); + + return handle ? acpi_bus_can_wakeup(handle) : false; +} + +static int acpi_pci_sleep_wake(struct pci_dev *dev, bool enable) +{ + int error = acpi_pm_device_sleep_wake(&dev->dev, enable); + + if (!error) + dev_printk(KERN_INFO, &dev->dev, + "wake-up capability %s by ACPI\n", + enable ? "enabled" : "disabled"); + return error; +} + static struct pci_platform_pm_ops acpi_pci_platform_pm = { .is_manageable = acpi_pci_power_manageable, .set_state = acpi_pci_set_power_state, .choose_state = acpi_pci_choose_state, + .can_wakeup = acpi_pci_can_wakeup, + .sleep_wake = acpi_pci_sleep_wake, }; /* ACPI bus type */ diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 20e28077b96d..a6b1b6f96abc 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -380,7 +380,8 @@ static struct pci_platform_pm_ops *pci_platform_pm; int pci_set_platform_pm(struct pci_platform_pm_ops *ops) { - if (!ops->is_manageable || !ops->set_state || !ops->choose_state) + if (!ops->is_manageable || !ops->set_state || !ops->choose_state + || !ops->sleep_wake || !ops->can_wakeup) return -EINVAL; pci_platform_pm = ops; return 0; @@ -403,6 +404,17 @@ static inline pci_power_t platform_pci_choose_state(struct pci_dev *dev) pci_platform_pm->choose_state(dev) : PCI_POWER_ERROR; } +static inline bool platform_pci_can_wakeup(struct pci_dev *dev) +{ + return pci_platform_pm ? pci_platform_pm->can_wakeup(dev) : false; +} + +static inline int platform_pci_sleep_wake(struct pci_dev *dev, bool enable) +{ + return pci_platform_pm ? + pci_platform_pm->sleep_wake(dev, enable) : -ENODEV; +} + /** * pci_raw_set_power_state - Use PCI PM registers to set the power state of * given PCI device @@ -1035,6 +1047,56 @@ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) return pcibios_set_pcie_reset_state(dev, state); } +/** + * pci_pme_capable - check the capability of PCI device to generate PME# + * @dev: PCI device to handle. + * @pm: PCI PM capability offset of the device. + * @state: PCI state from which device will issue PME#. + */ +static bool pci_pme_capable(struct pci_dev *dev, int pm, pci_power_t state) +{ + u16 pmc; + + if (!pm) + return false; + + /* Check device's ability to generate PME# from given state */ + pci_read_config_word(dev, pm + PCI_PM_PMC, &pmc); + + pmc &= PCI_PM_CAP_PME_MASK; + pmc >>= ffs(PCI_PM_CAP_PME_MASK) - 1; /* First bit of mask */ + + return !!(pmc & (1 << state)); +} + +/** + * pci_pme_active - enable or disable PCI device's PME# function + * @dev: PCI device to handle. + * @pm: PCI PM capability offset of the device. + * @enable: 'true' to enable PME# generation; 'false' to disable it. + * + * The caller must verify that the device is capable of generating PME# before + * calling this function with @enable equal to 'true'. + */ +static void pci_pme_active(struct pci_dev *dev, int pm, bool enable) +{ + u16 pmcsr; + + if (!pm) + return; + + pci_read_config_word(dev, pm + PCI_PM_CTRL, &pmcsr); + /* Clear PME_Status by writing 1 to it and enable PME# */ + pmcsr |= PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE; + if (!enable) + pmcsr &= ~PCI_PM_CTRL_PME_ENABLE; + + pci_write_config_word(dev, pm + PCI_PM_CTRL, pmcsr); + + dev_printk(KERN_INFO, &dev->dev, "PME# %s\n", + enable ? "enabled" : "disabled"); +} + /** * pci_enable_wake - enable PCI device as wakeup event source * @dev: PCI device affected @@ -1046,66 +1108,83 @@ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) * called automatically by this routine. * * Devices with legacy power management (no standard PCI PM capabilities) - * always require such platform hooks. Depending on the platform, devices - * supporting the standard PCI PME# signal may require such platform hooks; - * they always update bits in config space to allow PME# generation. + * always require such platform hooks. * - * -EIO is returned if the device can't ever be a wakeup event source. - * -EINVAL is returned if the device can't generate wakeup events from - * the specified PCI state. Returns zero if the operation is successful. + * RETURN VALUE: + * 0 is returned on success + * -EINVAL is returned if device is not supposed to wake up the system + * Error code depending on the platform is returned if both the platform and + * the native mechanism fail to enable the generation of wake-up events */ int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable) { int pm; - int status; - u16 value; - - /* Note that drivers should verify device_may_wakeup(&dev->dev) - * before calling this function. Platform code should report - * errors when drivers try to enable wakeup on devices that - * can't issue wakeups, or on which wakeups were disabled by - * userspace updating the /sys/devices.../power/wakeup file. - */ + int error = 0; + bool pme_done = false; - status = call_platform_enable_wakeup(&dev->dev, enable); - - /* find PCI PM capability in list */ - pm = pci_find_capability(dev, PCI_CAP_ID_PM); + if (!device_may_wakeup(&dev->dev)) + return -EINVAL; - /* If device doesn't support PM Capabilities, but caller wants to - * disable wake events, it's a NOP. Otherwise fail unless the - * platform hooks handled this legacy device already. + /* + * According to "PCI System Architecture" 4th ed. by Tom Shanley & Don + * Anderson we should be doing PME# wake enable followed by ACPI wake + * enable. To disable wake-up we call the platform first, for symmetry. */ - if (!pm) - return enable ? status : 0; - /* Check device's ability to generate PME# */ - pci_read_config_word(dev,pm+PCI_PM_PMC,&value); + if (!enable && platform_pci_can_wakeup(dev)) + error = platform_pci_sleep_wake(dev, false); - value &= PCI_PM_CAP_PME_MASK; - value >>= ffs(PCI_PM_CAP_PME_MASK) - 1; /* First bit of mask */ - - /* Check if it can generate PME# from requested state. */ - if (!value || !(value & (1 << state))) { - /* if it can't, revert what the platform hook changed, - * always reporting the base "EINVAL, can't PME#" error - */ - if (enable) - call_platform_enable_wakeup(&dev->dev, 0); - return enable ? -EINVAL : 0; + pm = pci_find_capability(dev, PCI_CAP_ID_PM); + if (!enable || pci_pme_capable(dev, pm, state)) { + pci_pme_active(dev, pm, enable); + pme_done = true; } - pci_read_config_word(dev, pm + PCI_PM_CTRL, &value); + if (enable && platform_pci_can_wakeup(dev)) + error = platform_pci_sleep_wake(dev, true); - /* Clear PME_Status by writing 1 to it and enable PME# */ - value |= PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE; + return pme_done ? 0 : error; +} - if (!enable) - value &= ~PCI_PM_CTRL_PME_ENABLE; +/** + * pci_pm_init - Initialize PM functions of given PCI device + * @dev: PCI device to handle. + */ +void pci_pm_init(struct pci_dev *dev) +{ + int pm; + u16 pmc; - pci_write_config_word(dev, pm + PCI_PM_CTRL, value); + /* find PCI PM capability in list */ + pm = pci_find_capability(dev, PCI_CAP_ID_PM); + if (!pm) + return; + /* Check device's ability to generate PME# */ + pci_read_config_word(dev, pm + PCI_PM_PMC, &pmc); - return 0; + if ((pmc & PCI_PM_CAP_VER_MASK) > 3) { + dev_err(&dev->dev, "unsupported PM cap regs version (%u)\n", + pmc & PCI_PM_CAP_VER_MASK); + return; + } + + if (pmc & PCI_PM_CAP_PME_MASK) { + dev_printk(KERN_INFO, &dev->dev, + "PME# supported from%s%s%s%s%s\n", + (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", + (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", + (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", + (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", + (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); + /* + * Make device's PM flags reflect the wake-up capability, but + * let the user space enable it to wake up the system as needed. + */ + device_set_wakeup_capable(&dev->dev, true); + device_set_wakeup_enable(&dev->dev, false); + /* Disable the PME# generation functionality */ + pci_pme_active(dev, pm, false); + } } int diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 0cd2e719933b..b08dfc9746af 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -17,6 +17,11 @@ extern void pci_cleanup_rom(struct pci_dev *dev); * platform; to be used during system-wide transitions from a * sleeping state to the working state and vice versa * + * @can_wakeup - returns 'true' if given device is capable of waking up the + * system from a sleeping state + * + * @sleep_wake - enables/disables the system wake up capability of given device + * * If given platform is generally capable of power managing PCI devices, all of * these callbacks are mandatory. */ @@ -24,9 +29,12 @@ struct pci_platform_pm_ops { bool (*is_manageable)(struct pci_dev *dev); int (*set_state)(struct pci_dev *dev, pci_power_t state); pci_power_t (*choose_state)(struct pci_dev *dev); + bool (*can_wakeup)(struct pci_dev *dev); + int (*sleep_wake)(struct pci_dev *dev, bool enable); }; extern int pci_set_platform_pm(struct pci_platform_pm_ops *ops); +extern void pci_pm_init(struct pci_dev *dev); extern int pci_user_read_config_byte(struct pci_dev *dev, int where, u8 *val); extern int pci_user_read_config_word(struct pci_dev *dev, int where, u16 *val); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 2f0ae70710d6..b1724cf31b66 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -860,49 +860,6 @@ int pci_cfg_space_size_ext(struct pci_dev *dev) return PCI_CFG_SPACE_SIZE; } -/** - * pci_disable_pme - Disable the PME function of PCI device - * @dev: PCI device affected - * -EINVAL is returned if PCI device doesn't support PME. - * Zero is returned if the PME is supported and can be disabled. - */ -static int pci_disable_pme(struct pci_dev *dev) -{ - int pm; - u16 value; - - /* find PCI PM capability in list */ - pm = pci_find_capability(dev, PCI_CAP_ID_PM); - - /* If device doesn't support PM Capabilities, it means that PME is - * not supported. - */ - if (!pm) - return -EINVAL; - /* Check device's ability to generate PME# */ - pci_read_config_word(dev, pm + PCI_PM_PMC, &value); - - value &= PCI_PM_CAP_PME_MASK; - /* Check if it can generate PME# */ - if (!value) { - /* - * If it is zero, it means that PME is still unsupported - * although there exists the PM capability. - */ - return -EINVAL; - } - - pci_read_config_word(dev, pm + PCI_PM_CTRL, &value); - - /* Clear PME_Status by writing 1 to it */ - value |= PCI_PM_CTRL_PME_STATUS ; - /* Disable PME enable bit */ - value &= ~PCI_PM_CTRL_PME_ENABLE; - pci_write_config_word(dev, pm + PCI_PM_CTRL, value); - - return 0; -} - int pci_cfg_space_size(struct pci_dev *dev) { int pos; @@ -1010,7 +967,6 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) } pci_vpd_pci22_init(dev); - pci_disable_pme(dev); return dev; } @@ -1031,6 +987,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) /* Fix up broken headers */ pci_fixup_device(pci_fixup_header, dev); + /* Initialize power management of the device */ + pci_pm_init(dev); + /* * Add the device to our list of discovered devices * and the bus list for fixup functions, etc. diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 071daf8db600..7ab5a611e43f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -337,6 +337,7 @@ int acpi_bus_get_status(struct acpi_device *device); int acpi_bus_get_power(acpi_handle handle, int *state); int acpi_bus_set_power(acpi_handle handle, int state); bool acpi_bus_power_manageable(acpi_handle handle); +bool acpi_bus_can_wakeup(acpi_handle handle); #ifdef CONFIG_ACPI_PROC_EVENT int acpi_bus_generate_proc_event(struct acpi_device *device, u8 type, int data); int acpi_bus_generate_proc_event4(const char *class, const char *bid, u8 type, int data); @@ -379,6 +380,7 @@ acpi_handle acpi_get_pci_rootbridge_handle(unsigned int, unsigned int); #ifdef CONFIG_PM_SLEEP int acpi_pm_device_sleep_state(struct device *, int *); +int acpi_pm_device_sleep_wake(struct device *, bool); #else /* !CONFIG_PM_SLEEP */ static inline int acpi_pm_device_sleep_state(struct device *d, int *p) { diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index f0d0b2cb8d20..3af0c8d05cdc 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -35,6 +35,11 @@ static inline void device_init_wakeup(struct device *dev, int val) dev->power.can_wakeup = dev->power.should_wakeup = !!val; } +static inline void device_set_wakeup_capable(struct device *dev, int val) +{ + dev->power.can_wakeup = !!val; +} + static inline int device_can_wakeup(struct device *dev) { return dev->power.can_wakeup; @@ -47,21 +52,7 @@ static inline void device_set_wakeup_enable(struct device *dev, int val) static inline int device_may_wakeup(struct device *dev) { - return dev->power.can_wakeup & dev->power.should_wakeup; -} - -/* - * Platform hook to activate device wakeup capability, if that's not already - * handled by enable_irq_wake() etc. - * Returns zero on success, else negative errno - */ -extern int (*platform_enable_wakeup)(struct device *dev, int is_on); - -static inline int call_platform_enable_wakeup(struct device *dev, int is_on) -{ - if (platform_enable_wakeup) - return (*platform_enable_wakeup)(dev, is_on); - return 0; + return dev->power.can_wakeup && dev->power.should_wakeup; } #else /* !CONFIG_PM */ @@ -80,11 +71,6 @@ static inline int device_can_wakeup(struct device *dev) #define device_set_wakeup_enable(dev, val) do {} while (0) #define device_may_wakeup(dev) 0 -static inline int call_platform_enable_wakeup(struct device *dev, int is_on) -{ - return 0; -} - #endif /* !CONFIG_PM */ #endif /* _LINUX_PM_WAKEUP_H */ -- cgit v1.2.3 From 23ca4bba3e20c6c3cb11c1bb0ab4770b724d39ac Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:12 +0200 Subject: x86: cleanup early per cpu variables/accesses v4 * Introduce a new PER_CPU macro called "EARLY_PER_CPU". This is used by some per_cpu variables that are initialized and accessed before there are per_cpu areas allocated. ["Early" in respect to per_cpu variables is "earlier than the per_cpu areas have been setup".] This patchset adds these new macros: DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) EXPORT_EARLY_PER_CPU_SYMBOL(_name) DECLARE_EARLY_PER_CPU(_type, _name) early_per_cpu_ptr(_name) early_per_cpu_map(_name, _idx) early_per_cpu(_name, _cpu) The DEFINE macro defines the per_cpu variable as well as the early map and pointer. It also initializes the per_cpu variable and map elements to "_initvalue". The early_* macros provide access to the initial map (usually setup during system init) and the early pointer. This pointer is initialized to point to the early map but is then NULL'ed when the actual per_cpu areas are setup. After that the per_cpu variable is the correct access to the variable. The early_per_cpu() macro is not very efficient but does show how to access the variable if you have a function that can be called both "early" and "late". It tests the early ptr to be NULL, and if not then it's still valid. Otherwise, the per_cpu variable is used instead: #define early_per_cpu(_name, _cpu) \ (early_per_cpu_ptr(_name) ? \ early_per_cpu_ptr(_name)[_cpu] : \ per_cpu(_name, _cpu)) A better method is to actually check the pointer manually. In the case below, numa_set_node can be called both "early" and "late": void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); if (cpu_to_node_map) cpu_to_node_map[cpu] = node; else per_cpu(x86_cpu_to_node_map, cpu) = node; } * Add a flag "arch_provides_topology_pointers" that indicates pointers to topology cpumask_t maps are available. Otherwise, use the function returning the cpumask_t value. This is useful if cpumask_t set size is very large to avoid copying data on to/off of the stack. * The coverage of CONFIG_DEBUG_PER_CPU_MAPS has been increased while the non-debug case has been optimized a bit. * Remove an unreferenced compiler warning in drivers/base/topology.c * Clean up #ifdef in setup.c For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/apic_32.c | 9 +-- arch/x86/kernel/apic_64.c | 11 ++-- arch/x86/kernel/setup.c | 96 ++++++++++++++++++++++++++---- arch/x86/kernel/setup_32.c | 24 -------- arch/x86/kernel/setup_64.c | 9 --- arch/x86/kernel/smpboot.c | 20 +------ arch/x86/mm/numa_64.c | 43 ++++---------- arch/x86/mm/srat_64.c | 2 +- drivers/base/topology.c | 25 +++++++- include/asm-x86/numa_64.h | 19 +++--- include/asm-x86/percpu.h | 46 +++++++++++++++ include/asm-x86/smp.h | 15 +---- include/asm-x86/topology.h | 143 ++++++++++++++++++++++++++------------------- 15 files changed, 270 insertions(+), 196 deletions(-) (limited to 'drivers/base') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2e325521e5e9..4469a0db1ae1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -121,7 +121,7 @@ config ARCH_HAS_CACHE_LINE_SIZE def_bool y config HAVE_SETUP_PER_CPU_AREA - def_bool X86_64 || (X86_SMP && !X86_VOYAGER) + def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER) config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 18363374d51a..24ca95a0ba54 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -60,7 +60,7 @@ config DEBUG_PAGEALLOC config DEBUG_PER_CPU_MAPS bool "Debug access to per_cpu maps" depends on DEBUG_KERNEL - depends on X86_64_SMP + depends on X86_SMP default n help Say Y to verify that the per_cpu map being accessed has diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1bdeb6c..f17c1c1bc384 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -52,9 +52,6 @@ unsigned long mp_lapic_addr; -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - /* * Knob to control our willingness to enable the local APIC. * @@ -1534,9 +1531,9 @@ void __cpuinit generic_processor_info(int apicid, int version) } #ifdef CONFIG_SMP /* are we being called early in kernel startup? */ - if (x86_cpu_to_apicid_early_ptr) { - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); cpu_to_apicid[cpu] = apicid; bios_cpu_apicid[cpu] = apicid; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0633cfd0dc29..4fd21f7d698c 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -87,9 +87,6 @@ static unsigned long apic_phys; unsigned long mp_lapic_addr; -DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - unsigned int __cpuinitdata maxcpus = NR_CPUS; /* * Get the LAPIC version @@ -1091,9 +1088,9 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu = 0; } /* are we being called early in kernel startup? */ - if (x86_cpu_to_apicid_early_ptr) { - u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; - u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); cpu_to_apicid[cpu] = apicid; bios_cpu_apicid[cpu] = apicid; @@ -1269,7 +1266,7 @@ __cpuinit int apic_is_clustered_box(void) if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) return 0; - bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6f80b852a196..03caa8e4351f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -19,13 +19,23 @@ unsigned disabled_cpus __cpuinitdata; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL(boot_cpu_physical_apicid); -DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); - /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; #endif +/* map cpu index to physical APIC ID */ +DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#define X86_64_NUMA 1 + +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); +#endif + #if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) /* * Copy data used in early init routines from the initial arrays to the @@ -37,20 +47,21 @@ static void __init setup_per_cpu_maps(void) int cpu; for_each_possible_cpu(cpu) { - per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = - x86_bios_cpu_apicid_init[cpu]; -#ifdef CONFIG_NUMA + early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#ifdef X86_64_NUMA per_cpu(x86_cpu_to_node_map, cpu) = - x86_cpu_to_node_map_init[cpu]; + early_per_cpu_map(x86_cpu_to_node_map, cpu); #endif } /* indicate the early static arrays will soon be gone */ - x86_cpu_to_apicid_early_ptr = NULL; - x86_bios_cpu_apicid_early_ptr = NULL; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = NULL; + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#ifdef X86_64_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif } @@ -109,7 +120,8 @@ void __init setup_per_cpu_areas(void) if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO - "cpu %d has no node or node-local memory\n", i); + "cpu %d has no node %d or node-local memory\n", + i, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); @@ -137,3 +149,63 @@ void __init setup_per_cpu_areas(void) } #endif + +#ifdef X86_64_NUMA +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + if (cpu_to_node_map) + cpu_to_node_map[cpu] = node; + + else if (per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); +} +#endif /* CONFIG_NUMA */ + +#if defined(CONFIG_DEBUG_PER_CPU_MAPS) && defined(CONFIG_X86_64) + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +#endif diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5a2f8e063887..ccd5f5cdbbe6 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -737,18 +737,6 @@ char * __init __attribute__((weak)) memory_setup(void) return machine_specific_memory_setup(); } -#ifdef CONFIG_NUMA -/* - * In the golden day, when everything among i386 and x86_64 will be - * integrated, this will not live here - */ -void *x86_cpu_to_node_map_early_ptr; -int x86_cpu_to_node_map_init[NR_CPUS] = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; -#endif - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -887,18 +875,6 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); -#ifdef CONFIG_X86_SMP - /* - * setup to use the early static init tables during kernel startup - * X86_SMP will exclude sub-arches that don't deal well with it. - */ - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; -#endif -#endif - #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); #endif diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff1286ad8a..e8df64fad540 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -406,15 +406,6 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif -#ifdef CONFIG_SMP - /* setup to use the early static init tables during kernel startup */ - x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; - x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; -#ifdef CONFIG_NUMA - x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; -#endif -#endif - #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e1cecedde42..036604d3daed 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,22 +68,6 @@ #include #include -/* - * FIXME: For x86_64, those are defined in other files. But moving them here, - * would make the setup areas dependent on smp, which is a loss. When we - * integrate apic between arches, we can probably do a better job, but - * right now, they'll stay here -- glommer - */ - -/* which logical CPU number maps to which CPU (physical APIC ID) */ -u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = - { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_early_ptr; - -u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata - = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_bios_cpu_apicid_early_ptr; - #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; static int low_mappings; @@ -992,7 +976,7 @@ do_rest: /* Try to put things back the way they were before ... */ unmap_cpu_to_logical_apicid(cpu); #ifdef CONFIG_X86_64 - clear_node_cpumask(cpu); /* was set by numa_add_cpu */ + numa_remove_cpu(cpu); /* was set by numa_add_cpu */ #endif cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ @@ -1373,7 +1357,7 @@ static void __ref remove_cpu_from_maps(int cpu) cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ clear_bit(cpu, (unsigned long *)&cpu_initialized); - clear_node_cpumask(cpu); + numa_remove_cpu(cpu); #endif } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d519e5d..970f86775c41 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -31,16 +31,6 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -#ifdef CONFIG_SMP -int x86_cpu_to_node_map_init[NR_CPUS] = { - [0 ... NR_CPUS-1] = NUMA_NO_NODE -}; -void *x86_cpu_to_node_map_early_ptr; -EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); -#endif -DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; -EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); - s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; @@ -577,24 +567,6 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -__cpuinit void numa_add_cpu(int cpu) -{ - set_bit(cpu, - (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - - if(cpu_to_node_map) - cpu_to_node_map[cpu] = node; - else if(per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_node_map, cpu) = node; - else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); -} - unsigned long __init numa_free_all_bootmem(void) { unsigned long pages = 0; @@ -641,6 +613,7 @@ static __init int numa_setup(char *opt) } early_param("numa", numa_setup); +#ifdef CONFIG_NUMA /* * Setup early cpu_to_node. * @@ -652,14 +625,19 @@ early_param("numa", numa_setup); * is already initialized in a round robin manner at numa_init_array, * prior to this call, and this initialization is good enough * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. */ void __init init_cpu_to_node(void) { - int i; + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - for (i = 0; i < NR_CPUS; i++) { + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { int node; - u16 apicid = x86_cpu_to_apicid_init[i]; + u16 apicid = cpu_to_apicid[cpu]; if (apicid == BAD_APICID) continue; @@ -668,8 +646,9 @@ void __init init_cpu_to_node(void) continue; if (!node_online(node)) continue; - numa_set_node(i, node); + numa_set_node(cpu, node); } } +#endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 99649dccad28..012220e31c99 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; if (!node_isset(node, node_possible_map)) - numa_set_node(i, NUMA_NO_NODE); + numa_clear_node(i); } numa_init_array(); return 0; diff --git a/drivers/base/topology.c b/drivers/base/topology.c index fdf4044d2e74..1efe162e16d7 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -40,6 +40,7 @@ static ssize_t show_##name(struct sys_device *dev, char *buf) \ return sprintf(buf, "%d\n", topology_##name(cpu)); \ } +#if defined(topology_thread_siblings) || defined(topology_core_siblings) static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) { ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; @@ -54,21 +55,41 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) } return n; } +#endif +#ifdef arch_provides_topology_pointers #define define_siblings_show_map(name) \ -static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ +static ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ return show_cpumap(0, &(topology_##name(cpu)), buf); \ } #define define_siblings_show_list(name) \ -static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ +static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ return show_cpumap(1, &(topology_##name(cpu)), buf); \ } +#else +#define define_siblings_show_map(name) \ +static ssize_t show_##name(struct sys_device *dev, char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + cpumask_t mask = topology_##name(cpu); \ + return show_cpumap(0, &mask, buf); \ +} + +#define define_siblings_show_list(name) \ +static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + cpumask_t mask = topology_##name(cpu); \ + return show_cpumap(1, &mask, buf); \ +} +#endif + #define define_siblings_show_func(name) \ define_siblings_show_map(name); define_siblings_show_list(name) diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h index 22e87c9f6a80..b510daf4f4d8 100644 --- a/include/asm-x86/numa_64.h +++ b/include/asm-x86/numa_64.h @@ -14,11 +14,9 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) -extern void numa_add_cpu(int cpu); extern void numa_init_array(void); extern int numa_off; -extern void numa_set_node(int cpu, int node); extern void srat_reserve_add_area(int nodeid); extern int hotadd_percent; @@ -31,15 +29,16 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, #ifdef CONFIG_NUMA extern void __init init_cpu_to_node(void); - -static inline void clear_node_cpumask(int cpu) -{ - clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]); -} - +extern void __cpuinit numa_set_node(int cpu, int node); +extern void __cpuinit numa_clear_node(int cpu); +extern void __cpuinit numa_add_cpu(int cpu); +extern void __cpuinit numa_remove_cpu(int cpu); #else -#define init_cpu_to_node() do {} while (0) -#define clear_node_cpumask(cpu) do {} while (0) +static inline void init_cpu_to_node(void) { } +static inline void numa_set_node(int cpu, int node) { } +static inline void numa_clear_node(int cpu) { } +static inline void numa_add_cpu(int cpu, int node) { } +static inline void numa_remove_cpu(int cpu) { } #endif #endif diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h index 736fc3bb8e1e..912a3a17b9db 100644 --- a/include/asm-x86/percpu.h +++ b/include/asm-x86/percpu.h @@ -143,4 +143,50 @@ do { \ #define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) #endif /* !__ASSEMBLY__ */ #endif /* !CONFIG_X86_64 */ + +#ifdef CONFIG_SMP + +/* + * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu + * variables that are initialized and accessed before there are per_cpu + * areas allocated. + */ + +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue; \ + __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \ + { [0 ... NR_CPUS-1] = _initvalue }; \ + __typeof__(_type) *_name##_early_ptr = _name##_early_map + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ + extern __typeof__(_type) _name##_early_map[] + +#define early_per_cpu_ptr(_name) (_name##_early_ptr) +#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) +#define early_per_cpu(_name, _cpu) \ + (early_per_cpu_ptr(_name) ? \ + early_per_cpu_ptr(_name)[_cpu] : \ + per_cpu(_name, _cpu)) + +#else /* !CONFIG_SMP */ +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ + DEFINE_PER_CPU(_type, _name) = _initvalue + +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ + EXPORT_PER_CPU_SYMBOL(_name) + +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name) + +#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) +#define early_per_cpu_ptr(_name) NULL +/* no early_per_cpu_map() */ + +#endif /* !CONFIG_SMP */ + #endif /* _ASM_X86_PERCPU_H_ */ diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 1ebaa5cd3112..ec841639fb44 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -29,21 +29,12 @@ extern int smp_num_siblings; extern unsigned int num_processors; extern cpumask_t cpu_initialized; -#ifdef CONFIG_SMP -extern u16 x86_cpu_to_apicid_init[]; -extern u16 x86_bios_cpu_apicid_init[]; -extern void *x86_cpu_to_apicid_early_ptr; -extern void *x86_bios_cpu_apicid_early_ptr; -#else -#define x86_cpu_to_apicid_early_ptr NULL -#define x86_bios_cpu_apicid_early_ptr NULL -#endif - DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); -DECLARE_PER_CPU(u16, x86_cpu_to_apicid); -DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); + +DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); /* Static state in head.S used to set up a CPU */ extern struct { diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index dcf3f8131d6b..dac09cb66dca 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -35,87 +35,67 @@ # endif #endif +/* Node not present */ +#define NUMA_NO_NODE (-1) + #ifdef CONFIG_NUMA #include #include -/* Mappings between logical cpu number and node number */ #ifdef CONFIG_X86_32 -extern int cpu_to_node_map[]; -#else -/* Returns the number of the current Node. */ -#define numa_node_id() (early_cpu_to_node(raw_smp_processor_id())) -#endif - -DECLARE_PER_CPU(int, x86_cpu_to_node_map); - -#ifdef CONFIG_SMP -extern int x86_cpu_to_node_map_init[]; -extern void *x86_cpu_to_node_map_early_ptr; -#else -#define x86_cpu_to_node_map_early_ptr NULL -#endif +/* Mappings between node number and cpus on that node. */ extern cpumask_t node_to_cpumask_map[]; -#define NUMA_NO_NODE (-1) +/* Mappings between logical cpu number and node number */ +extern int cpu_to_node_map[]; /* Returns the number of the node containing CPU 'cpu' */ -#ifdef CONFIG_X86_32 -#define early_cpu_to_node(cpu) cpu_to_node(cpu) static inline int cpu_to_node(int cpu) { return cpu_to_node_map[cpu]; } +#define early_cpu_to_node(cpu) cpu_to_node(cpu) #else /* CONFIG_X86_64 */ -#ifdef CONFIG_SMP -static inline int early_cpu_to_node(int cpu) -{ - int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; - - if (cpu_to_node_map) - return cpu_to_node_map[cpu]; - else if (per_cpu_offset(cpu)) - return per_cpu(x86_cpu_to_node_map, cpu); - else - return NUMA_NO_NODE; -} -#else -#define early_cpu_to_node(cpu) cpu_to_node(cpu) -#endif +/* Mappings between node number and cpus on that node. */ +extern cpumask_t node_to_cpumask_map[]; + +/* Mappings between logical cpu number and node number */ +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); + +/* Returns the number of the current Node. */ +#define numa_node_id() (per_cpu(x86_cpu_to_node_map, raw_smp_processor_id())) + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +extern int cpu_to_node(int cpu); +extern int early_cpu_to_node(int cpu); +extern cpumask_t *_node_to_cpumask_ptr(int node); +extern cpumask_t node_to_cpumask(int node); +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +/* Returns the number of the node containing CPU 'cpu' */ static inline int cpu_to_node(int cpu) { -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (x86_cpu_to_node_map_early_ptr) { - printk("KERN_NOTICE cpu_to_node(%d): usage too early!\n", - (int)cpu); - dump_stack(); - return ((int *)x86_cpu_to_node_map_early_ptr)[cpu]; - } -#endif return per_cpu(x86_cpu_to_node_map, cpu); } -#ifdef CONFIG_NUMA - -/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ -#define node_to_cpumask_ptr(v, node) \ - cpumask_t *v = &(node_to_cpumask_map[node]) - -#define node_to_cpumask_ptr_next(v, node) \ - v = &(node_to_cpumask_map[node]) -#endif +/* Same function but used if called before per_cpu areas are setup */ +static inline int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; -#endif /* CONFIG_X86_64 */ + return per_cpu(x86_cpu_to_node_map, cpu); +} -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +static inline cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &node_to_cpumask_map[node]; +} /* Returns a bitmask of CPUs on Node 'node'. */ static inline cpumask_t node_to_cpumask(int node) @@ -123,14 +103,29 @@ static inline cpumask_t node_to_cpumask(int node) return node_to_cpumask_map[node]; } +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +#endif /* CONFIG_X86_64 */ + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + cpumask_t *v = _node_to_cpumask_ptr(node) + +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) + /* Returns the number of the first CPU on Node 'node'. */ static inline int node_to_first_cpu(int node) { - cpumask_t mask = node_to_cpumask(node); - - return first_cpu(mask); + node_to_cpumask_ptr(mask, node); + return first_cpu(*mask); } +/* + * Returns the number of the node containing Node 'node'. This + * architecture is flat, so it is a pretty simple function! + */ +#define parent_node(node) (node) + #define pcibus_to_node(bus) __pcibus_to_node(bus) #define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus) @@ -180,8 +175,31 @@ extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) #endif -#else /* CONFIG_NUMA */ +#else /* !CONFIG_NUMA */ + +#define numa_node_id() 0 +#define cpu_to_node(cpu) 0 +#define early_cpu_to_node(cpu) 0 + +static inline cpumask_t *_node_to_cpumask_ptr(int node) +{ + return &cpu_online_map; +} +static inline cpumask_t node_to_cpumask(int node) +{ + return cpu_online_map; +} +static inline int node_to_first_cpu(int node) +{ + return first_cpu(cpu_online_map); +} + +/* Replace default node_to_cpumask_ptr with optimized version */ +#define node_to_cpumask_ptr(v, node) \ + cpumask_t *v = _node_to_cpumask_ptr(node) +#define node_to_cpumask_ptr_next(v, node) \ + v = _node_to_cpumask_ptr(node) #endif #include @@ -193,6 +211,9 @@ extern cpumask_t cpu_coregroup_map(int cpu); #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) + +/* indicates that pointers to the topology cpumask_t maps are valid */ +#define arch_provides_topology_pointers yes #endif static inline void arch_fix_phys_package_id(int num, u32 slot) @@ -220,4 +241,4 @@ static inline void set_mp_bus_to_node(int busnum, int node) } #endif -#endif +#endif /* _ASM_X86_TOPOLOGY_H */ -- cgit v1.2.3 From b7a39bd0afc4021e8ad2b1189e884551e147427f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 23 May 2008 18:38:49 +0100 Subject: firmware: make fw->data const In preparation for supporting firmware files linked into the static kernel, make fw->data const to ensure that users aren't modifying it (so that we can pass a pointer to the original in-kernel copy, rather than having to copy it). Signed-off-by: David Woodhouse --- drivers/base/firmware_class.c | 2 +- include/linux/firmware.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 9fd4a8534146..264b3a2cd860 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -257,7 +257,7 @@ firmware_data_write(struct kobject *kobj, struct bin_attribute *bin_attr, if (retval) goto out; - memcpy(fw->data + offset, buffer, count); + memcpy((u8 *)fw->data + offset, buffer, count); fw->size = max_t(size_t, offset + count, fw->size); retval = count; diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 6c7eff2ebada..88718d60153c 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -8,7 +8,7 @@ struct firmware { size_t size; - u8 *data; + const u8 *data; }; struct device; -- cgit v1.2.3 From 5658c769443d543728b6c5c673dffc2df8676317 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 23 May 2008 13:52:42 +0100 Subject: firmware: allow firmware files to be built into kernel image Some drivers have their own hacks to bypass the kernel's firmware loader and build their firmware into the kernel; this renders those unnecessary. Other drivers don't use the firmware loader at all, because they always want the firmware to be available. This allows them to start using the firmware loader. A third set of drivers already use the firmware loader, but can't be used without help from userspace, which sometimes requires an initrd. This allows them to work in a static kernel. Signed-off-by: David Woodhouse --- drivers/base/firmware_class.c | 33 +++++++++++++++++++++++++++++++-- include/asm-generic/vmlinux.lds.h | 7 +++++++ include/linux/firmware.h | 21 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-) (limited to 'drivers/base') diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 264b3a2cd860..b0be1d18fee2 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -49,6 +49,14 @@ struct firmware_priv { struct timer_list timeout; }; +#ifdef CONFIG_FW_LOADER +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; +#else /* Module case. Avoid ifdefs later; it'll all optimise out */ +static struct builtin_fw *__start_builtin_fw; +static struct builtin_fw *__end_builtin_fw; +#endif + static void fw_load_abort(struct firmware_priv *fw_priv) { @@ -391,13 +399,12 @@ _request_firmware(const struct firmware **firmware_p, const char *name, struct device *f_dev; struct firmware_priv *fw_priv; struct firmware *firmware; + struct builtin_fw *builtin; int retval; if (!firmware_p) return -EINVAL; - printk(KERN_INFO "firmware: requesting %s\n", name); - *firmware_p = firmware = kzalloc(sizeof(*firmware), GFP_KERNEL); if (!firmware) { printk(KERN_ERR "%s: kmalloc(struct firmware) failed\n", @@ -406,6 +413,20 @@ _request_firmware(const struct firmware **firmware_p, const char *name, goto out; } + for (builtin = __start_builtin_fw; builtin != __end_builtin_fw; + builtin++) { + if (strcmp(name, builtin->name)) + continue; + printk(KERN_INFO "firmware: using built-in firmware %s\n", + name); + firmware->size = builtin->size; + firmware->data = builtin->data; + return 0; + } + + if (uevent) + printk(KERN_INFO "firmware: requesting %s\n", name); + retval = fw_setup_device(firmware, &f_dev, name, device, uevent); if (retval) goto error_kfree_fw; @@ -473,8 +494,16 @@ request_firmware(const struct firmware **firmware_p, const char *name, void release_firmware(const struct firmware *fw) { + struct builtin_fw *builtin; + if (fw) { + for (builtin = __start_builtin_fw; builtin != __end_builtin_fw; + builtin++) { + if (fw->data == builtin->data) + goto free_fw; + } vfree(fw->data); + free_fw: kfree(fw); } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916c..8d71a40625f3 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -86,6 +86,13 @@ VMLINUX_SYMBOL(__end_pci_fixups_resume) = .; \ } \ \ + /* Built-in firmware blobs */ \ + .builtin_fw : AT(ADDR(.builtin_fw) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_builtin_fw) = .; \ + *(.builtin_fw) \ + VMLINUX_SYMBOL(__end_builtin_fw) = .; \ + } \ + \ /* RapidIO route ops */ \ .rio_route : AT(ADDR(.rio_route) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rio_route_ops) = .; \ diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 88718d60153c..c8ecf5b2a207 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -1,7 +1,10 @@ #ifndef _LINUX_FIRMWARE_H #define _LINUX_FIRMWARE_H + #include #include +#include + #define FIRMWARE_NAME_MAX 30 #define FW_ACTION_NOHOTPLUG 0 #define FW_ACTION_HOTPLUG 1 @@ -13,6 +16,24 @@ struct firmware { struct device; +struct builtin_fw { + char *name; + void *data; + unsigned long size; +}; + +/* We have to play tricks here much like stringify() to get the + __COUNTER__ macro to be expanded as we want it */ +#define __fw_concat1(x, y) x##y +#define __fw_concat(x, y) __fw_concat1(x, y) + +#define DECLARE_BUILTIN_FIRMWARE(name, blob) \ + DECLARE_BUILTIN_FIRMWARE_SIZE(name, &(blob), sizeof(blob)) + +#define DECLARE_BUILTIN_FIRMWARE_SIZE(name, blob, size) \ + static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \ + __used __section(.builtin_fw) = { name, blob, size } + #if defined(CONFIG_FW_LOADER) || (defined(CONFIG_FW_LOADER_MODULE) && defined(MODULE)) int request_firmware(const struct firmware **fw, const char *name, struct device *device); -- cgit v1.2.3 From 4d2acfbfdf68257e846aaa355edd10fc35ba0feb Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 23 May 2008 13:58:12 +0100 Subject: firmware: Add CONFIG_EXTRA_FIRMWARE option This allows arbitrary firmware files to be included in the static kernel where the firmware loader can find them without requiring userspace to be alive. (Updated and CONFIG_EXTRA_FIRMWARE_DIR added with lots of help from Johannes Berg). Signed-off-by: David Woodhouse Signed-off-by: Johannes Berg --- Makefile | 2 +- drivers/base/Kconfig | 39 +++++++++++++++++++++++ firmware/Makefile | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 firmware/Makefile (limited to 'drivers/base') diff --git a/Makefile b/Makefile index 6315424a00b9..f398cffa6c07 100644 --- a/Makefile +++ b/Makefile @@ -450,7 +450,7 @@ scripts: scripts_basic include/config/auto.conf # Objects we will link into vmlinux / subdirs we need to visit init-y := init/ -drivers-y := drivers/ sound/ +drivers-y := drivers/ sound/ firmware/ net-y := net/ libs-y := lib/ core-y := usr/ diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index d7da109c24fd..13cfcb435f7d 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -34,6 +34,45 @@ config FW_LOADER require userspace firmware loading support, but a module built outside the kernel tree does. +config EXTRA_FIRMWARE + string "External firmware blobs to build into the kernel binary" + depends on FW_LOADER + help + This option allows firmware to be built into the kernel, for the + cases where the user either cannot or doesn't want to provide it from + userspace at runtime (for example, when the firmware in question is + required for accessing the boot device, and the user doesn't want to + use an initrd). + + This option is a string, and takes the (space-separated) names of the + firmware files -- the same names which appear in MODULE_FIRMWARE() + and request_firmware() in the source. These files should exist under + the directory specified by the EXTRA_FIRMWARE_DIR option, which is + by default the firmware/ subdirectory of the kernel source tree. + + So, for example, you might set CONFIG_EXTRA_FIRMWARE="usb8388.bin", + copy the usb8388.bin file into the firmware/ directory, and build the + kernel. Then any request_firmware("usb8388.bin") will be + satisfied internally without needing to call out to userspace. + + WARNING: If you include additional firmware files into your binary + kernel image which are not available under the terms of the GPL, + then it may be a violation of the GPL to distribute the resulting + image -- since it combines both GPL and non-GPL work. You should + consult a lawyer of your own before distributing such an image. + +config EXTRA_FIRMWARE_DIR + string "Firmware blobs root directory" + depends on EXTRA_FIRMWARE != "" + default "firmware" + help + This option controls the directory in which the kernel build system + looks for the firmware files listed in the EXTRA_FIRMWARE option. + The default is the firmware/ directory in the kernel source tree, + but by changing this option you can point it elsewhere, such as + the /lib/firmware/ directory or another separate directory + containing firmware files. + config DEBUG_DRIVER bool "Driver Core verbose debug messages" depends on DEBUG_KERNEL diff --git a/firmware/Makefile b/firmware/Makefile new file mode 100644 index 000000000000..e69461f9362b --- /dev/null +++ b/firmware/Makefile @@ -0,0 +1,88 @@ +# +# kbuild file for firmware/ +# + +# Create $(fwabs) from $(CONFIG_EXTRA_FIRMWARE_DIR) -- if it doesn't have a +# leading /, it's relative to $(srctree). +fwdir := $(subst ",,$(CONFIG_EXTRA_FIRMWARE_DIR)) +fwabs := $(addprefix $(srctree)/,$(filter-out /%,$(fwdir)))$(filter /%,$(fwdir)) + +fw-external-y := $(subst ",,$(CONFIG_EXTRA_FIRMWARE)) + +firmware-y := $(fw-external-y) $(fw-shipped-y) +firmware-dirs := $(sort $(patsubst %,$(objtree)/$(obj)/%/,$(dir $(firmware-y) $(fw-shipped-)))) + +quiet_cmd_mkdir = MKDIR $(patsubst $(objtree)/%,%,$@) + cmd_mkdir = mkdir -p $@ + +quiet_cmd_ihex = IHEX $@ + cmd_ihex = $(OBJCOPY) -Iihex -Obinary $< $@ + +quiet_cmd_fwbin = MK_FW $@ + cmd_fwbin = FWNAME="$(patsubst firmware/%.gen.S,%,$@)"; \ + FWSTR="$(subst /,_,$(subst .,_,$(subst -,_,$(patsubst \ + firmware/%.gen.S,%,$@))))"; \ + ASM_WORD=$(if $(CONFIG_64BIT),.quad,.long); \ + ASM_ALIGN=$(if $(CONFIG_64BIT),3,2); \ + PROGBITS=$(if $(CONFIG_ARM),%,@)progbits; \ + echo "/* Generated by firmware/Makefile */" > $@;\ + echo " .section .rodata" >>$@;\ + echo " .p2align $${ASM_ALIGN}" >>$@;\ + echo "_fw_$${FWSTR}_bin:" >>$@;\ + echo " .incbin \"$(2)\"" >>$@;\ + echo "_fw_end:" >>$@;\ + echo " .section .rodata.str,\"aMS\",$${PROGBITS},1" >>$@;\ + echo " .p2align $${ASM_ALIGN}" >>$@;\ + echo "_fw_$${FWSTR}_name:" >>$@;\ + echo " .string \"$$FWNAME\"" >>$@;\ + echo " .section .builtin_fw,\"a\",$${PROGBITS}" >>$@;\ + echo " .p2align $${ASM_ALIGN}" >>$@;\ + echo " $${ASM_WORD} _fw_$${FWSTR}_name" >>$@;\ + echo " $${ASM_WORD} _fw_$${FWSTR}_bin" >>$@;\ + echo " $${ASM_WORD} _fw_end - _fw_$${FWSTR}_bin" >>$@; + +# One of these files will change, or come into existence, whenever +# the configuration changes between 32-bit and 64-bit. The .S files +# need to change when that happens. +wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \ + include/config/ppc32.h include/config/ppc64.h \ + include/config/superh32.h include/config/superh64.h \ + include/config/x86_32.h include/config/x86_64.h) + +# Workaround for make < 3.81, where .SECONDEXPANSION doesn't work. +# It'll end up depending on these targets, so make them a PHONY rule which +# depends on _all_ the directories in $(firmware-dirs), and it'll work out OK. +PHONY += $(objtree)/$$(%) $(objtree)/$(obj)/$$(%) +$(objtree)/$$(%) $(objtree)/$(obj)/$$(%): $(firmware-dirs) + @true + +# For the $$(dir %) trick, where we need % to be expanded first. +.SECONDEXPANSION: + +$(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps) \ + | $(objtree)/$$(dir %) + $(call cmd,fwbin,$(patsubst %.gen.S,%,$@)) +$(patsubst %,$(obj)/%.gen.S, $(fw-external-y)): %: $(wordsize_deps) \ + include/config/builtin/firmware/dir.h | $(objtree)/$$(dir %) + $(call cmd,fwbin,$(fwabs)/$(patsubst $(obj)/%.gen.S,%,$@)) + +# The .o files depend on the binaries directly; the .S files don't. +$(patsubst %,$(obj)/%.gen.o, $(fw-shipped-y)): %.gen.o: % +$(patsubst %,$(obj)/%.gen.o, $(fw-external-y)): $(obj)/%.gen.o: $(fwdir)/% + +$(obj)/%: $(obj)/%.ihex | $(objtree)/$(obj)/$$(dir %) + $(call cmd,ihex) + +$(firmware-dirs): + $(call cmd,mkdir) + +obj-y := $(patsubst %,%.gen.o, $(firmware-y)) + +# Remove .S files and binaries created from ihex +# (during 'make clean' .config isn't included so they're all in $(fw-shipped-)) +targets := $(fw-shipped-) $(patsubst $(obj)/%,%, \ + $(shell find $(obj) -name \*.gen.S 2>/dev/null)) + +# Without this, built-in.o won't be created when it's empty, and the +# final vmlinux link will fail. +obj-n := dummy -- cgit v1.2.3 From d172e7f5c67f2d41f453c7aa83d3bdb405ef8ba5 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 25 Jun 2008 13:56:07 +0100 Subject: firmware: Add CONFIG_FIRMWARE_IN_KERNEL option. This will control whether we build firmware into the kernel image for _every_ driver which we convert to request_firmware(), to avoid a proliferation of 'CONFIG_XXX_FIRMWARE' options for each one. Default to 'y' for now, which is the wrong thing to do but people seem to be insisting on it and refusing to even review patches until it's done. And it does preserve the existing behaviour for built-in drivers. Signed-off-by: David Woodhouse --- drivers/base/Kconfig | 25 +++++++++++++++++++++++++ firmware/Makefile | 5 +++++ 2 files changed, 30 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 13cfcb435f7d..d47482fa1d21 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -34,6 +34,31 @@ config FW_LOADER require userspace firmware loading support, but a module built outside the kernel tree does. +config FIRMWARE_IN_KERNEL + bool "Include in-kernel firmware blobs in kernel binary" + depends on FW_LOADER + default y + help + The kernel source tree includes a number of firmware 'blobs' + which are used by various drivers. The recommended way to + use these is to run "make firmware_install" and to copy the + resulting binary files created in usr/lib/firmware directory + of the kernel tree to the /lib/firmware on your system so + that they can be loaded by userspace helpers on request. + + Enabling this option will build each required firmware blob + into the kernel directly, where request_firmware() will find + them without having to call out to userspace. This may be + useful if your root file system requires a device which uses + such firmware, and do not wish to use an initrd. + + This single option controls the inclusion of firmware for + every driver which usees request_firmare() and ships its + firmware in the kernel source tree, to avoid a proliferation + of 'Include firmware for xxx device' options. + + Say 'N' and let firmware be loaded from userspace. + config EXTRA_FIRMWARE string "External firmware blobs to build into the kernel binary" depends on FW_LOADER diff --git a/firmware/Makefile b/firmware/Makefile index e69461f9362b..cc25f5600d5d 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -9,6 +9,11 @@ fwabs := $(addprefix $(srctree)/,$(filter-out /%,$(fwdir)))$(filter /%,$(fwdir)) fw-external-y := $(subst ",,$(CONFIG_EXTRA_FIRMWARE)) +# If CONFIG_FIRMWARE_IN_KERNEL is not set, then don't include any firmware +ifneq ($(CONFIG_FIRMWARE_IN_KERNEL),y) +fw-shipped-y := +endif + firmware-y := $(fw-external-y) $(fw-shipped-y) firmware-dirs := $(sort $(patsubst %,$(objtree)/$(obj)/%/,$(dir $(firmware-y) $(fw-shipped-)))) -- cgit v1.2.3