diff options
280 files changed, 8190 insertions, 5280 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable new file mode 100644 index 000000000000..175bb4f70512 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-cache_disable @@ -0,0 +1,18 @@ +What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X +Date: August 2008 +KernelVersion: 2.6.27 +Contact: mark.langsdorf@amd.com +Description: These files exist in every cpu's cache index directories. + There are currently 2 cache_disable_# files in each + directory. Reading from these files on a supported + processor will return that cache disable index value + for that processor and node. Writing to one of these + files will cause the specificed cache index to be disabled. + + Currently, only AMD Family 10h Processors support cache index + disable, and only for their L3 caches. See the BIOS and + Kernel Developer's Guide at + http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf + for formatting information and other details on the + cache index disable. +Users: joachim.deguara@amd.com diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index d9aa43d78bcc..25fb8bcf32a2 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -704,12 +704,24 @@ this directory the following files can currently be found: The current number of free dma_debug_entries in the allocator. + dma-api/driver-filter + You can write a name of a driver into this file + to limit the debug output to requests from that + particular driver. Write an empty string to + that file to disable the filter and see + all errors again. + If you have this code compiled into your kernel it will be enabled by default. If you want to boot without the bookkeeping anyway you can provide 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. Notice that you can not enable it again at runtime. You have to reboot to do so. +If you want to see debug messages only for a special device driver you can +specify the dma_debug_driver=<drivername> parameter. This will enable the +driver filter at boot time. The debug code will only print errors for that +driver afterwards. This filter can be disabled or changed later using debugfs. + When the code disables itself at runtime this is most likely because it ran out of dma_debug_entries. These entries are preallocated at boot. The number of preallocated entries is defined per architecture. If it is too low for you diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt new file mode 100644 index 000000000000..9dc1ff4fd536 --- /dev/null +++ b/Documentation/futex-requeue-pi.txt @@ -0,0 +1,131 @@ +Futex Requeue PI +---------------- + +Requeueing of tasks from a non-PI futex to a PI futex requires +special handling in order to ensure the underlying rt_mutex is never +left without an owner if it has waiters; doing so would break the PI +boosting logic [see rt-mutex-desgin.txt] For the purposes of +brevity, this action will be referred to as "requeue_pi" throughout +this document. Priority inheritance is abbreviated throughout as +"PI". + +Motivation +---------- + +Without requeue_pi, the glibc implementation of +pthread_cond_broadcast() must resort to waking all the tasks waiting +on a pthread_condvar and letting them try to sort out which task +gets to run first in classic thundering-herd formation. An ideal +implementation would wake the highest-priority waiter, and leave the +rest to the natural wakeup inherent in unlocking the mutex +associated with the condvar. + +Consider the simplified glibc calls: + +/* caller must lock mutex */ +pthread_cond_wait(cond, mutex) +{ + lock(cond->__data.__lock); + unlock(mutex); + do { + unlock(cond->__data.__lock); + futex_wait(cond->__data.__futex); + lock(cond->__data.__lock); + } while(...) + unlock(cond->__data.__lock); + lock(mutex); +} + +pthread_cond_broadcast(cond) +{ + lock(cond->__data.__lock); + unlock(cond->__data.__lock); + futex_requeue(cond->data.__futex, cond->mutex); +} + +Once pthread_cond_broadcast() requeues the tasks, the cond->mutex +has waiters. Note that pthread_cond_wait() attempts to lock the +mutex only after it has returned to user space. This will leave the +underlying rt_mutex with waiters, and no owner, breaking the +previously mentioned PI-boosting algorithms. + +In order to support PI-aware pthread_condvar's, the kernel needs to +be able to requeue tasks to PI futexes. This support implies that +upon a successful futex_wait system call, the caller would return to +user space already holding the PI futex. The glibc implementation +would be modified as follows: + + +/* caller must lock mutex */ +pthread_cond_wait_pi(cond, mutex) +{ + lock(cond->__data.__lock); + unlock(mutex); + do { + unlock(cond->__data.__lock); + futex_wait_requeue_pi(cond->__data.__futex); + lock(cond->__data.__lock); + } while(...) + unlock(cond->__data.__lock); + /* the kernel acquired the the mutex for us */ +} + +pthread_cond_broadcast_pi(cond) +{ + lock(cond->__data.__lock); + unlock(cond->__data.__lock); + futex_requeue_pi(cond->data.__futex, cond->mutex); +} + +The actual glibc implementation will likely test for PI and make the +necessary changes inside the existing calls rather than creating new +calls for the PI cases. Similar changes are needed for +pthread_cond_timedwait() and pthread_cond_signal(). + +Implementation +-------------- + +In order to ensure the rt_mutex has an owner if it has waiters, it +is necessary for both the requeue code, as well as the waiting code, +to be able to acquire the rt_mutex before returning to user space. +The requeue code cannot simply wake the waiter and leave it to +acquire the rt_mutex as it would open a race window between the +requeue call returning to user space and the waiter waking and +starting to run. This is especially true in the uncontended case. + +The solution involves two new rt_mutex helper routines, +rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which +allow the requeue code to acquire an uncontended rt_mutex on behalf +of the waiter and to enqueue the waiter on a contended rt_mutex. +Two new system calls provide the kernel<->user interface to +requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI. + +FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait() +and pthread_cond_timedwait()) to block on the initial futex and wait +to be requeued to a PI-aware futex. The implementation is the +result of a high-speed collision between futex_wait() and +futex_lock_pi(), with some extra logic to check for the additional +wake-up scenarios. + +FUTEX_REQUEUE_CMP_PI is called by the waker +(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and +possibly wake the waiting tasks. Internally, this system call is +still handled by futex_requeue (by passing requeue_pi=1). Before +requeueing, futex_requeue() attempts to acquire the requeue target +PI futex on behalf of the top waiter. If it can, this waiter is +woken. futex_requeue() then proceeds to requeue the remaining +nr_wake+nr_requeue tasks to the PI futex, calling +rt_mutex_start_proxy_lock() prior to each requeue to prepare the +task as a waiter on the underlying rt_mutex. It is possible that +the lock can be acquired at this stage as well, if so, the next +waiter is woken to finish the acquisition of the lock. + +FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but +their sum is all that really matters. futex_requeue() will wake or +requeue up to nr_wake + nr_requeue tasks. It will wake only as many +tasks as it can acquire the lock for, which in the majority of cases +should be 0 as good programming practice dictates that the caller of +either pthread_cond_broadcast() or pthread_cond_signal() acquire the +mutex prior to making the call. FUTEX_REQUEUE_PI requires that +nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for +signal. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 8041e51f8860..a5253f6d01af 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -329,11 +329,6 @@ and is between 256 and 4096 characters. It is defined in the file flushed before they will be reused, which is a lot of faster - amd_iommu_size= [HW,X86-64] - Define the size of the aperture for the AMD IOMMU - driver. Possible values are: - '32M', '64M' (default), '128M', '256M', '512M', '1G' - amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: <a>,<b> @@ -646,6 +641,13 @@ and is between 256 and 4096 characters. It is defined in the file DMA-API debugging code disables itself because the architectural default is too low. + dma_debug_driver=<driver_name> + With this option the DMA-API debugging driver + filter feature can be enabled at boot time. Just + pass the driver to filter for as the parameter. + The filter can be disabled or changed to another + driver later using sysfs. + dscc4.setup= [NET] dtc3181e= [HW,SCSI] @@ -1581,6 +1583,9 @@ and is between 256 and 4096 characters. It is defined in the file noinitrd [RAM] Tells the kernel not to load any configured initial RAM disk. + nointremap [X86-64, Intel-IOMMU] Do not enable interrupt + remapping. + nointroute [IA-64] nojitter [IA64] Disables jitter checking for ITC timers. diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f5b7127f54ac..7f5809eddee6 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -31,6 +31,7 @@ Contents: - Locking functions. - Interrupt disabling functions. + - Sleep and wake-up functions. - Miscellaneous functions. (*) Inter-CPU locking barrier effects. @@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some other means. +SLEEP AND WAKE-UP FUNCTIONS +--------------------------- + +Sleeping and waking on an event flagged in global data can be viewed as an +interaction between two pieces of data: the task state of the task waiting for +the event and the global data used to indicate the event. To make sure that +these appear to happen in the right order, the primitives to begin the process +of going to sleep, and the primitives to initiate a wake up imply certain +barriers. + +Firstly, the sleeper normally follows something like this sequence of events: + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (event_indicated) + break; + schedule(); + } + +A general memory barrier is interpolated automatically by set_current_state() +after it has altered the task state: + + CPU 1 + =============================== + set_current_state(); + set_mb(); + STORE current->state + <general barrier> + LOAD event_indicated + +set_current_state() may be wrapped by: + + prepare_to_wait(); + prepare_to_wait_exclusive(); + +which therefore also imply a general memory barrier after setting the state. +The whole sequence above is available in various canned forms, all of which +interpolate the memory barrier in the right place: + + wait_event(); + wait_event_interruptible(); + wait_event_interruptible_exclusive(); + wait_event_interruptible_timeout(); + wait_event_killable(); + wait_event_timeout(); + wait_on_bit(); + wait_on_bit_lock(); + + +Secondly, code that performs a wake up normally follows something like this: + + event_indicated = 1; + wake_up(&event_wait_queue); + +or: + + event_indicated = 1; + wake_up_process(event_daemon); + +A write memory barrier is implied by wake_up() and co. if and only if they wake +something up. The barrier occurs before the task state is cleared, and so sits +between the STORE to indicate the event and the STORE to set TASK_RUNNING: + + CPU 1 CPU 2 + =============================== =============================== + set_current_state(); STORE event_indicated + set_mb(); wake_up(); + STORE current->state <write barrier> + <general barrier> STORE current->state + LOAD event_indicated + +The available waker functions include: + + complete(); + wake_up(); + wake_up_all(); + wake_up_bit(); + wake_up_interruptible(); + wake_up_interruptible_all(); + wake_up_interruptible_nr(); + wake_up_interruptible_poll(); + wake_up_interruptible_sync(); + wake_up_interruptible_sync_poll(); + wake_up_locked(); + wake_up_locked_poll(); + wake_up_nr(); + wake_up_poll(); + wake_up_process(); + + +[!] Note that the memory barriers implied by the sleeper and the waker do _not_ +order multiple stores before the wake-up with respect to loads of those stored +values after the sleeper has called set_current_state(). For instance, if the +sleeper does: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) + break; + __set_current_state(TASK_RUNNING); + do_something(my_data); + +and the waker does: + + my_data = value; + event_indicated = 1; + wake_up(&event_wait_queue); + +there's no guarantee that the change to event_indicated will be perceived by +the sleeper as coming after the change to my_data. In such a circumstance, the +code on both sides must interpolate its own memory barriers between the +separate data accesses. Thus the above sleeper ought to do: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) { + smp_rmb(); + do_something(my_data); + } + +and the waker should do: + + my_data = value; + smp_wmb(); + event_indicated = 1; + wake_up(&event_wait_queue); + + MISCELLANEOUS FUNCTIONS ----------------------- @@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED? Under normal operation, memory operation reordering is generally not going to be a problem as a single-threaded linear piece of code will still appear to -work correctly, even if it's in an SMP kernel. There are, however, three +work correctly, even if it's in an SMP kernel. There are, however, four circumstances in which reordering definitely _could_ be a problem: (*) Interprocessor interaction. diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 5ba4d3fc625a..1df7f9cdab05 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -4,6 +4,7 @@ CONTENTS ======== +0. WARNING 1. Overview 1.1 The problem 1.2 The solution @@ -14,6 +15,23 @@ CONTENTS 3. Future plans +0. WARNING +========== + + Fiddling with these settings can result in an unstable system, the knobs are + root only and assumes root knows what he is doing. + +Most notable: + + * very small values in sched_rt_period_us can result in an unstable + system when the period is smaller than either the available hrtimer + resolution, or the time it takes to handle the budget refresh itself. + + * very small values in sched_rt_runtime_us can result in an unstable + system when the runtime is so small the system has difficulty making + forward progress (NOTE: the migration thread and kstopmachine both + are real-time processes). + 1. Overview =========== @@ -169,7 +187,7 @@ get their allocated time. Implementing SCHED_EDF might take a while to complete. Priority Inheritance is the biggest challenge as the current linux PI infrastructure is geared towards -the limited static priority levels 0-139. With deadline scheduling you need to +the limited static priority levels 0-99. With deadline scheduling you need to do deadline inheritance (since priority is inversely proportional to the deadline delta (deadline - now). diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e693813..e362f50c496f 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice values starting at 100 (nice -20). Below is a quick chart to map the kernel priority to user land priorities. - Kernel priority: 0 to 99 ==> user RT priority 99 to 0 - Kernel priority: 100 to 139 ==> user nice -20 to 19 - Kernel priority: 140 ==> idle task priority + Kernel Space User Space + =============================================================== + 0(high) to 98(low) user RT priority 99(high) to 1(low) + with SCHED_RR or SCHED_FIFO + --------------------------------------------------------------- + 99 sched_priority is not used in scheduling + decisions(it must be specified as 0) + --------------------------------------------------------------- + 100(high) to 139(low) user nice -20(high) to 19(low) + --------------------------------------------------------------- + 140 idle task priority + --------------------------------------------------------------- The task states are: diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index e0203662f9e9..8da3a795083f 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -50,6 +50,10 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical pointer to single linked list of struct setup_data. +Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment + beyond the kernel_alignment added, new init_size and + pref_address fields. Added extended boot loader IDs. + **** MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -168,12 +172,13 @@ Offset Proto Name Meaning 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only 0224/2 2.01+ heap_end_ptr Free memory after setup end -0226/2 N/A pad1 Unused +0226/1 2.02+(3 ext_loader_ver Extended boot loader version +0227/1 2.02+(3 ext_loader_type Extended boot loader ID 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 022C/4 2.03+ ramdisk_max Highest legal initrd address 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not -0235/1 N/A pad2 Unused +0235/1 2.10+ min_alignment Minimum alignment, as a power of two 0236/2 N/A pad3 Unused 0238/4 2.06+ cmdline_size Maximum size of the kernel command line 023C/4 2.07+ hardware_subarch Hardware subarchitecture @@ -182,6 +187,8 @@ Offset Proto Name Meaning 024C/4 2.08+ payload_length Length of kernel payload 0250/8 2.09+ setup_data 64-bit physical pointer to linked list of struct setup_data +0258/8 2.10+ pref_address Preferred loading address +0260/4 2.10+ init_size Linear memory required during initialization (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -190,6 +197,8 @@ Offset Proto Name Meaning field are unusable, which means the size of a bzImage kernel cannot be determined. +(3) Ignored, but safe to set, for boot protocols 2.02-2.09. + If the "HdrS" (0x53726448) magic number is not found at offset 0x202, the boot protocol version is "old". Loading an old kernel, the following parameters should be assumed: @@ -343,18 +352,32 @@ Protocol: 2.00+ 0xTV here, where T is an identifier for the boot loader and V is a version number. Otherwise, enter 0xFF here. + For boot loader IDs above T = 0xD, write T = 0xE to this field and + write the extended ID minus 0x10 to the ext_loader_type field. + Similarly, the ext_loader_ver field can be used to provide more than + four bits for the bootloader version. + + For example, for T = 0x15, V = 0x234, write: + + type_of_loader <- 0xE4 + ext_loader_type <- 0x05 + ext_loader_ver <- 0x23 + Assigned boot loader ids: 0 LILO (0x00 reserved for pre-2.00 bootloader) 1 Loadlin 2 bootsect-loader (0x20, all other values reserved) - 3 SYSLINUX - 4 EtherBoot + 3 Syslinux + 4 Etherboot/gPXE 5 ELILO 7 GRUB - 8 U-BOOT + 8 U-Boot 9 Xen A Gujin B Qemu + C Arcturus Networks uCbootloader + E Extended (see ext_loader_type) + F Special (0xFF = undefined) Please contact <hpa@zytor.com> if you need a bootloader ID value assigned. @@ -453,6 +476,35 @@ Protocol: 2.01+ Set this field to the offset (from the beginning of the real-mode code) of the end of the setup stack/heap, minus 0x0200. +Field name: ext_loader_ver +Type: write (optional) +Offset/size: 0x226/1 +Protocol: 2.02+ + + This field is used as an extension of the version number in the + type_of_loader field. The total version number is considered to be + (type_of_loader & 0x0f) + (ext_loader_ver << 4). + + The use of this field is boot loader specific. If not written, it + is zero. + + Kernels prior to 2.6.31 did not recognize this field, but it is safe + to write for protocol version 2.02 or higher. + +Field name: ext_loader_type +Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0) +Offset/size: 0x227/1 +Protocol: 2.02+ + + This field is used as an extension of the type number in + type_of_loader field. If the type in type_of_loader is 0xE, then + the actual type is (ext_loader_type + 0x10). + + This field is ignored if the type in type_of_loader is not 0xE. + + Kernels prior to 2.6.31 did not recognize this field, but it is safe + to write for protocol version 2.02 or higher. + Field name: cmd_line_ptr Type: write (obligatory) Offset/size: 0x228/4 @@ -482,11 +534,19 @@ Protocol: 2.03+ 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) Field name: kernel_alignment -Type: read (reloc) +Type: read/modify (reloc) Offset/size: 0x230/4 -Protocol: 2.05+ +Protocol: 2.05+ (read), 2.10+ (modify) + + Alignment unit required by the kernel (if relocatable_kernel is + true.) A relocatable kernel that is loaded at an alignment + incompatible with the value in this field will be realigned during + kernel initialization. - Alignment unit required by the kernel (if relocatable_kernel is true.) + Starting with protocol version 2.10, this reflects the kernel + alignment preferred for optimal performance; it is possible for the + loader to modify this field to permit a lesser alignment. See the + min_alignment and pref_address field below. Field name: relocatable_kernel Type: read (reloc) @@ -498,6 +558,22 @@ Protocol: 2.05+ After loading, the boot loader must set the code32_start field to point to the loaded code, or to a boot loader hook. +Field name: min_alignment +Type: read (reloc) +Offset/size: 0x235/1 +Protocol: 2.10+ + + This field, if nonzero, indicates as a power of two the minimum + alignment required, as opposed to preferred, by the kernel to boot. + If a boot loader makes use of this field, it should update the + kernel_alignment field with the alignment unit desired; typically: + + kernel_alignment = 1 << min_alignment + + There may be a considerable performance cost with an excessively + misaligned kernel. Therefore, a loader should typically try each + power-of-two alignment from kernel_alignment down to this alignment. + Field name: cmdline_size Type: read Offset/size: 0x238/4 @@ -582,6 +658,36 @@ Protocol: 2.09+ sure to consider the case where the linked list already contains entries. +Field name: pref_address +Type: read (reloc) +Offset/size: 0x258/8 +Protocol: 2.10+ + + This field, if nonzero, represents a preferred load address for the + kernel. A relocating bootloader should attempt to load at this + address if possible. + + A non-relocatable kernel will unconditionally move itself and to run + at this address. + +Field name: init_size +Type: read +Offset/size: 0x25c/4 + + This field indicates the amount of linear contiguous memory starting + at the kernel runtime start address that the kernel needs before it + is capable of examining its memory map. This is not the same thing + as the total amount of memory the kernel needs to boot, but it can + be used by a relocating boot loader to help select a safe load + address for the kernel. + + The kernel runtime start address is determined by the following algorithm: + + if (relocatable_kernel) + runtime_start = align_up(load_address, kernel_alignment) + else + runtime_start = pref_address + **** THE IMAGE CHECKSUM diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 34c13040a718..2db5893d6c97 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -150,11 +150,6 @@ NUMA Otherwise, the remaining system RAM is allocated to an additional node. - numa=hotadd=percent - Only allow hotadd memory to preallocate page structures upto - percent of already available memory. - numa=hotadd=0 will disable hotadd memory. - ACPI acpi=off Don't enable ACPI diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 29b52b14d0b4..d6498e3cd713 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole -ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory -ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole -ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space -ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) +ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory +ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole +ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space +ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole +ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 30 -EXTRAVERSION = -rc8 +EXTRAVERSION = NAME = Man-Eating Seals of Antiquity # *DOCUMENTATION* diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index 9c9d1fd4155f..5bd5259324b7 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } } -static void +static int dp264_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); cpu_set_irq_affinity(irq, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); + + return 0; } -static void +static int clipper_set_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&dp264_irq_lock); cpu_set_irq_affinity(irq - 16, *affinity); tsunami_update_irq_hw(cached_irq_mask); spin_unlock(&dp264_irq_lock); + + return 0; } static struct hw_interrupt_type dp264_irq_type = { diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index 27f840a4ad3d..8dd239ebdb9e 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity) } -static void +static int titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { spin_lock(&titan_irq_lock); titan_cpu_set_irq_affinity(irq - 16, *affinity); titan_update_irq_hw(titan_cached_irq_mask); spin_unlock(&titan_irq_lock); + + return 0; } static void diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index 3e1714c6523f..664c7b8b1ba8 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) +static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) { void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3); unsigned int shift = (irq % 4) * 8; @@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val) val |= 1 << (cpu + shift); writel(val, reg); spin_unlock(&irq_controller_lock); + + return 0; } #endif diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c index df3925cb1c7f..d70b445f4a8f 100644 --- a/arch/cris/arch-v32/kernel/irq.c +++ b/arch/cris/arch-v32/kernel/irq.c @@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq) { } -void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) +int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest) { unsigned long flags; spin_lock_irqsave(&irq_lock, flags); irq_allocations[irq - FIRST_IRQ].mask = *dest; spin_unlock_irqrestore(&irq_lock, flags); + + return 0; } static struct irq_chip crisv32_irq_type = { diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c index cc0a3182db3c..acb5047ab573 100644 --- a/arch/ia64/hp/sim/hpsim_irq.c +++ b/arch/ia64/hp/sim/hpsim_irq.c @@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq) { } -static void +static int hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b) { + return 0; } static struct hw_interrupt_type irq_type_hp_sim = { diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 5510317db37b..baec6f00f7f3 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) { if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) return gsi; @@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) fadt = (struct acpi_table_fadt *)fadt_header; - acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); + acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, + ACPI_ACTIVE_LOW); return 0; } diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 166e0d839fa0..f92cef47bf86 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -329,7 +329,7 @@ unmask_irq (unsigned int irq) } -static void +static int iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) { #ifdef CONFIG_SMP @@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) cpu = cpumask_first_and(cpu_online_mask, mask); if (cpu >= nr_cpu_ids) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; dest = cpu_physical_id(cpu); if (!iosapic_intr_info[irq].count) - return; /* not an IOSAPIC interrupt */ + return -1; /* not an IOSAPIC interrupt */ set_irq_affinity_info(irq, dest, redir); @@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask) iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32); } + #endif + return 0; } /* diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 2b15e233f7fe..0f8ade9331ba 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -12,7 +12,7 @@ static struct irq_chip ia64_msi_chip; #ifdef CONFIG_SMP -static void ia64_set_msi_irq_affinity(unsigned int irq, +static int ia64_set_msi_irq_affinity(unsigned int irq, const cpumask_t *cpu_mask) { struct msi_msg msg; @@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq, int cpu = first_cpu(*cpu_mask); if (!cpu_online(cpu)) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; read_msi_msg(irq, &msg); @@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq, write_msi_msg(irq, &msg); cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu)); + + return 0; } #endif /* CONFIG_SMP */ @@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_cfg *cfg = irq_cfg + irq; struct msi_msg msg; int cpu = cpumask_first(mask); if (!cpu_online(cpu)) - return; + return -1; if (irq_prepare_move(irq, cpu)) - return; + return -1; dmar_msi_read(irq, &msg); @@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dmar_msi_write(irq, &msg); cpumask_copy(irq_desc[irq].affinity, mask); + + return 0; } #endif /* CONFIG_SMP */ diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index 66fd705e82c0..764f26abac05 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -227,7 +227,7 @@ finish_up: return new_irq_info; } -static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) +static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct sn_irq_info *sn_irq_info, *sn_irq_info_safe; nasid_t nasid; @@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask) list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe, sn_irq_lh[irq], list) (void)sn_retarget_vector(sn_irq_info, nasid, slice); + + return 0; } #ifdef CONFIG_SMP diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c index 81e428943d73..fbbfb9701201 100644 --- a/arch/ia64/sn/kernel/msi_sn.c +++ b/arch/ia64/sn/kernel/msi_sn.c @@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry) } #ifdef CONFIG_SMP -static void sn_set_msi_irq_affinity(unsigned int irq, +static int sn_set_msi_irq_affinity(unsigned int irq, const struct cpumask *cpu_mask) { struct msi_msg msg; @@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpu = cpumask_first(cpu_mask); sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) - return; + return -1; /* * Release XIO resources for the old MSI PCI address @@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); sn_msi_info[irq].sn_irq_info = new_irq_info; if (new_irq_info == NULL) - return; + return -1; /* * Map the xio address into bus space @@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq, write_msi_msg(irq, &msg); cpumask_copy(irq_desc[irq].affinity, cpu_mask); + + return 0; } #endif /* CONFIG_SMP */ diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index 1c19af8daa62..d3a0c8154bec 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq) } #ifdef CONFIG_SMP -static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest) +static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest) { int cpu; int bit = irq - OCTEON_IRQ_WORKQ0; /* Bit 0-63 of EN0 */ @@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask */ cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2)); write_unlock(&octeon_irq_ciu0_rwlock); + + return 0; } #endif @@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq) } #ifdef CONFIG_SMP -static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest) +static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest) { int cpu; int bit = irq - OCTEON_IRQ_WDOG0; /* Bit 0-63 of EN1 */ @@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask */ cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1)); write_unlock(&octeon_irq_ciu1_rwlock); + + return 0; } #endif diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index 3214ade02d10..4f1eed107b08 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq) #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF #include <linux/cpumask.h> -extern void plat_set_irq_affinity(unsigned int irq, +extern int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity); extern void smtc_forward_irq(unsigned int irq); diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c index 87deb8f6c458..3f43c2e3aa5a 100644 --- a/arch/mips/kernel/irq-gic.c +++ b/arch/mips/kernel/irq-gic.c @@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq) static DEFINE_SPINLOCK(gic_lock); -static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) +static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { cpumask_t tmp = CPU_MASK_NONE; unsigned long flags; @@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_and(&tmp, cpumask, cpu_online_mask); if (cpus_empty(tmp)) - return; + return -1; /* Assumption : cpumask refers to a single CPU */ spin_lock_irqsave(&gic_lock, flags); @@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(irq_desc[irq].affinity, cpumask); spin_unlock_irqrestore(&gic_lock, flags); + return 0; } #endif diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c index 5ba31888fefb..499ffe5475df 100644 --- a/arch/mips/mti-malta/malta-smtc.c +++ b/arch/mips/mti-malta/malta-smtc.c @@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = { */ -void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) +int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) { cpumask_t tmask; int cpu = 0; @@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) /* Do any generic SMTC IRQ affinity setup */ smtc_set_irq_affinity(irq, tmask); + + return 0; } #endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */ diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c index c147c4b35d3f..690de06bde90 100644 --- a/arch/mips/sibyte/bcm1480/irq.c +++ b/arch/mips/sibyte/bcm1480/irq.c @@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq); static void disable_bcm1480_irq(unsigned int irq); static void ack_bcm1480_irq(unsigned int irq); #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); +static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_PCI @@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP -static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) +static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on, k; u64 cur_ints; @@ -118,7 +118,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) if (cpumask_weight(mask) != 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); - return; + return -1; } i = cpumask_first(mask); @@ -152,6 +152,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask) } } spin_unlock_irqrestore(&bcm1480_imr_lock, flags); + + return 0; } #endif diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c index 38cb998ade22..409dec798863 100644 --- a/arch/mips/sibyte/sb1250/irq.c +++ b/arch/mips/sibyte/sb1250/irq.c @@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq); static void disable_sb1250_irq(unsigned int irq); static void ack_sb1250_irq(unsigned int irq); #ifdef CONFIG_SMP -static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask); +static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask); #endif #ifdef CONFIG_SIBYTE_HAS_LDT @@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq) } #ifdef CONFIG_SMP -static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) +static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) { int i = 0, old_cpu, cpu, int_on; u64 cur_ints; @@ -113,7 +113,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) if (cpumask_weight(mask) > 1) { printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq); - return; + return -1; } /* Convert logical CPU to physical CPU */ @@ -143,6 +143,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask) R_IMR_INTERRUPT_MASK)); } spin_unlock_irqrestore(&sb1250_imr_lock, flags); + + return 0; } #endif diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 4ea4229d765c..8007f1e65729 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest) return cpu_dest; } -static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) +static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest) { int cpu_dest; cpu_dest = cpu_check_affinity(irq, dest); if (cpu_dest < 0) - return; + return -1; cpumask_copy(&irq_desc[irq].affinity, dest); + + return 0; } #endif diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index 80b513449f4c..be3581a8c294 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq) lpar_xirr_info_set((0xff << 24) | irq); } -static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) +static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) { unsigned int irq; int status; @@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) irq = (unsigned int)irq_map[virq].hwirq; if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) - return; + return -1; status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq); if (status) { printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n", __func__, irq, status); - return; + return -1; } /* @@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) printk(KERN_WARNING "%s: No online cpus in the mask %s for irq %d\n", __func__, cpulist, virq); - return; + return -1; } status = rtas_call(ibm_set_xive, 3, 1, NULL, @@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) if (status) { printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n", __func__, irq, status); - return; + return -1; } + + return 0; } static struct irq_chip xics_pic_direct = { diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 0efc12d1a3d7..352d8c3ef526 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq) #endif /* CONFIG_SMP */ -void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct mpic *mpic = mpic_from_irq(irq); unsigned int src = mpic_irq_to_hw(irq); @@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask) mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), mpic_physmask(cpus_addr(tmp)[0])); } + + return 0; } static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type) diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h index 3cef2af10f42..eff433c322a0 100644 --- a/arch/powerpc/sysdev/mpic.h +++ b/arch/powerpc/sysdev/mpic.h @@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic) extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type); extern void mpic_set_vector(unsigned int virq, unsigned int vector); -extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask); +extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask); #endif /* _POWERPC_SYSDEV_MPIC_H */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 639ac805448a..65865726b283 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -102,8 +102,8 @@ struct thread_info { #define TI_KERN_CNTD1 0x00000488 #define TI_PCR 0x00000490 #define TI_RESTART_BLOCK 0x00000498 -#define TI_KUNA_REGS 0x000004c0 -#define TI_KUNA_INSN 0x000004c8 +#define TI_KUNA_REGS 0x000004c8 +#define TI_KUNA_INSN 0x000004d0 #define TI_FPREGS 0x00000500 /* We embed this in the uppermost byte of thread_info->flags */ diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 5deabe921a47..e5e78f9cfc95 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq) } } -static void sun4u_set_affinity(unsigned int virt_irq, +static int sun4u_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { sun4u_irq_enable(virt_irq); + + return 0; } /* Don't do anything. The desc->status check for IRQ_DISABLED in @@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq) ino, err); } -static void sun4v_set_affinity(unsigned int virt_irq, +static int sun4v_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { unsigned int ino = virt_irq_table[virt_irq].dev_ino; @@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq, if (err != HV_EOK) printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): " "err(%d)\n", ino, cpuid, err); + + return 0; } static void sun4v_irq_disable(unsigned int virt_irq) @@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq) dev_handle, dev_ino, err); } -static void sun4v_virt_set_affinity(unsigned int virt_irq, +static int sun4v_virt_set_affinity(unsigned int virt_irq, const struct cpumask *mask) { unsigned long cpuid, dev_handle, dev_ino; @@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq, printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): " "err(%d)\n", dev_handle, dev_ino, cpuid, err); + + return 0; } static void sun4v_virq_disable(unsigned int virt_irq) diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild new file mode 100644 index 000000000000..ad8ec356fb36 --- /dev/null +++ b/arch/x86/Kbuild @@ -0,0 +1,16 @@ + +obj-$(CONFIG_KVM) += kvm/ + +# Xen paravirtualization support +obj-$(CONFIG_XEN) += xen/ + +# lguest paravirtualization support +obj-$(CONFIG_LGUEST_GUEST) += lguest/ + +obj-y += kernel/ +obj-y += mm/ + +obj-y += crypto/ +obj-y += vdso/ +obj-$(CONFIG_IA32_EMULATION) += ia32/ + diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a6efe0a2e9ae..aafae3b140de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,11 @@ config X86 select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA +config OUTPUT_FORMAT + string + default "elf32-i386" if X86_32 + default "elf64-x86-64" if X86_64 + config ARCH_DEFCONFIG string default "arch/x86/configs/i386_defconfig" if X86_32 @@ -274,15 +279,9 @@ config SPARSE_IRQ If you don't know what to do here, say N. -config NUMA_MIGRATE_IRQ_DESC - bool "Move irq desc when changing irq smp_affinity" +config NUMA_IRQ_DESC + def_bool y depends on SPARSE_IRQ && NUMA - depends on BROKEN - default n - ---help--- - This enables moving irq_desc to cpu/node that irq will use handled. - - If you don't know what to do here, say N. config X86_MPPARSE bool "Enable MPS table" if ACPI @@ -355,7 +354,7 @@ config X86_UV depends on X86_64 depends on X86_EXTENDED_PLATFORM depends on NUMA - select X86_X2APIC + depends on X86_X2APIC ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. @@ -1466,9 +1465,7 @@ config KEXEC_JUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if X86_NUMAQ - default "0x200000" if X86_64 - default "0x100000" + default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. @@ -1487,15 +1484,15 @@ config PHYSICAL_START to be specifically compiled to run from a specific memory area (normally a reserved region) and this option comes handy. - So if you are using bzImage for capturing the crash dump, leave - the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. - Otherwise if you plan to use vmlinux for capturing the crash dump - change this value to start of the reserved region (Typically 16MB - 0x1000000). In other words, it can be set based on the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel. Typically this parameter is set as - crashkernel=64M@16M. Please take a look at - Documentation/kdump/kdump.txt for more details about crash dumps. + So if you are using bzImage for capturing the crash dump, + leave the value here unchanged to 0x1000000 and set + CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux + for capturing the crash dump change this value to start of + the reserved region. In other words, it can be set based on + the "X" value as specified in the "crashkernel=YM@XM" + command line boot parameter passed to the panic-ed + kernel. Please take a look at Documentation/kdump/kdump.txt + for more details about crash dumps. Usage of bzImage for capturing the crash dump is recommended as one does not have to build two kernels. Same kernel can be used @@ -1508,8 +1505,8 @@ config PHYSICAL_START Don't change this unless you know what you are doing. config RELOCATABLE - bool "Build a relocatable kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Build a relocatable kernel" + default y ---help--- This builds a kernel image that retains relocation information so it can be loaded someplace besides the default 1MB. @@ -1524,12 +1521,16 @@ config RELOCATABLE it has been loaded at and the compile time physical address (CONFIG_PHYSICAL_START) is ignored. +# Relocation on x86-32 needs some additional build support +config X86_NEED_RELOCS + def_bool y + depends on X86_32 && RELOCATABLE + config PHYSICAL_ALIGN hex prompt "Alignment value to which kernel should be aligned" if X86_32 - default "0x100000" if X86_32 - default "0x200000" if X86_64 - range 0x2000 0x400000 + default "0x1000000" + range 0x2000 0x1000000 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8359e73317f..33fac6bbe1c2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -159,10 +159,17 @@ config IOMMU_DEBUG options. See Documentation/x86_64/boot-options.txt for more details. +config IOMMU_STRESS + bool "Enable IOMMU stress-test mode" + ---help--- + This option disables various optimizations in IOMMU related + code to do real stress testing of the IOMMU code. This option + will cause a performance drop and should only be enabled for + testing. + config IOMMU_LEAK bool "IOMMU leak tracing" - depends on DEBUG_KERNEL - depends on IOMMU_DEBUG + depends on IOMMU_DEBUG && DMA_API_DEBUG ---help--- Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8c86b72afdc2..edbd0ca62067 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -7,8 +7,6 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -core-$(CONFIG_KVM) += arch/x86/kvm/ - # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles. # e.g.: obj-y += foo_$(BITS).o @@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ -# Sub architecture files that needs linking first -core-y += $(fcore-y) - -# Xen paravirtualization support -core-$(CONFIG_XEN) += arch/x86/xen/ - -# lguest paravirtualization support -core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ - -core-y += arch/x86/kernel/ -core-y += arch/x86/mm/ - -core-y += arch/x86/crypto/ -core-y += arch/x86/vdso/ -core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ +# See arch/x86/Kbuild for content of core part of the kernel +core-y += arch/x86/ # drivers-y are linked after core-y drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 172cf8a98bdd..851fe936d242 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -3,6 +3,8 @@ bzImage cpustr.h mkcpustr offsets.h +voffset.h +zoffset.h setup setup.bin setup.elf diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 6633b6e7505a..8d16ada25048 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o string.o tty.o video.o video-mode.o version.o +setup-y += printf.o regs.o string.o tty.o video.o video-mode.o +setup-y += version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o # The link order of the video-*.o modules can matter. In particular, @@ -86,19 +87,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-offsets := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p' -quiet_cmd_offsets = OFFSETS $@ - cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ -$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE - $(call if_changed,offsets) +targets += voffset.h +$(obj)/voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' + +quiet_cmd_zoffset = ZOFFSET $@ + cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ + +targets += zoffset.h +$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE + $(call if_changed,zoffset) -targets += offsets.h AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/offsets.h +$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 7c19ce8c2442..64a31a6d751a 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -2,7 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. - All Rights Reserved - * Copyright 2009 Intel Corporation + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -90,8 +90,11 @@ static int a20_test_long(void) static void enable_a20_bios(void) { - asm volatile("pushfl; int $0x15; popfl" - : : "a" ((u16)0x2401)); + struct biosregs ireg; + + initregs(&ireg); + ireg.ax = 0x2401; + intcall(0x15, &ireg, NULL); } static void enable_a20_kbc(void) diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index 7aa6033001f9..ee274834ea8b 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * Original APM BIOS checking by Stephen Rothwell, May 1994 * (sfr@canb.auug.org.au) @@ -19,75 +20,56 @@ int query_apm_bios(void) { - u16 ax, bx, cx, dx, di; - u32 ebx, esi; - u8 err; + struct biosregs ireg, oreg; /* APM BIOS installation check */ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x53; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.flags & X86_EFLAGS_CF) return -1; /* No APM BIOS */ - if (bx != 0x504d) /* "PM" signature */ + if (oreg.bx != 0x504d) /* "PM" signature */ return -1; - if (!(cx & 0x02)) /* 32 bits supported? */ + if (!(oreg.cx & 0x02)) /* 32 bits supported? */ return -1; /* Disconnect first, just in case */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - /* Paranoia */ - ebx = esi = 0; - cx = dx = di = 0; + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); /* 32-bit connect */ - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" - : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), - "+S" (esi), "+D" (di), "=m" (err) - : "a" (0x5303)); - - boot_params.apm_bios_info.cseg = ax; - boot_params.apm_bios_info.offset = ebx; - boot_params.apm_bios_info.cseg_16 = cx; - boot_params.apm_bios_info.dseg = dx; - boot_params.apm_bios_info.cseg_len = (u16)esi; - boot_params.apm_bios_info.cseg_16_len = esi >> 16; - boot_params.apm_bios_info.dseg_len = di; - - if (err) + ireg.al = 0x03; + intcall(0x15, &ireg, &oreg); + + boot_params.apm_bios_info.cseg = oreg.ax; + boot_params.apm_bios_info.offset = oreg.ebx; + boot_params.apm_bios_info.cseg_16 = oreg.cx; + boot_params.apm_bios_info.dseg = oreg.dx; + boot_params.apm_bios_info.cseg_len = oreg.si; + boot_params.apm_bios_info.cseg_16_len = oreg.hsi; + boot_params.apm_bios_info.dseg_len = oreg.di; + + if (oreg.flags & X86_EFLAGS_CF) return -1; /* Redo the installation check as the 32-bit connect; some BIOSes return different flags this way... */ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + ireg.al = 0x00; + intcall(0x15, &ireg, &oreg); - if (err || bx != 0x504d) { + if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) { /* Failure with 32-bit connect, try to disconect and ignore */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); return -1; } - boot_params.apm_bios_info.version = ax; - boot_params.apm_bios_info.flags = cx; + boot_params.apm_bios_info.version = oreg.ax; + boot_params.apm_bios_info.flags = oreg.cx; return 0; } diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S new file mode 100644 index 000000000000..507793739ea5 --- /dev/null +++ b/arch/x86/boot/bioscall.S @@ -0,0 +1,82 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes + * touching registers they shouldn't be. + */ + + .code16 + .text + .globl intcall + .type intcall, @function +intcall: + /* Self-modify the INT instruction. Ugly, but works. */ + cmpb %al, 3f + je 1f + movb %al, 3f + jmp 1f /* Synchronize pipeline */ +1: + /* Save state */ + pushfl + pushw %fs + pushw %gs + pushal + + /* Copy input state to stack frame */ + subw $44, %sp + movw %dx, %si + movw %sp, %di + movw $11, %cx + rep; movsd + + /* Pop full state from the stack */ + popal + popw %gs + popw %fs + popw %es + popw %ds + popfl + + /* Actual INT */ + .byte 0xcd /* INT opcode */ +3: .byte 0 + + /* Push full state to the stack */ + pushfl + pushw %ds + pushw %es + pushw %fs + pushw %gs + pushal + + /* Re-establish C environment invariants */ + cld + movzwl %sp, %esp + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + + /* Copy output state from stack frame */ + movw 68(%esp), %di /* Original %cx == 3rd argument */ + andw %di, %di + jz 4f + movw %sp, %si + movw $11, %cx + rep; movsd +4: addw $44, %sp + + /* Restore state and return */ + popal + popw %gs + popw %fs + popfl + retl + .size intcall, .-intcall diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 7b2692e897e5..98239d2658f2 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -26,6 +27,7 @@ #include <asm/setup.h> #include "bitops.h" #include <asm/cpufeature.h> +#include <asm/processor-flags.h> /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -241,6 +243,49 @@ int enable_a20(void); /* apm.c */ int query_apm_bios(void); +/* bioscall.c */ +struct biosregs { + union { + struct { + u32 edi; + u32 esi; + u32 ebp; + u32 _esp; + u32 ebx; + u32 edx; + u32 ecx; + u32 eax; + u32 _fsgs; + u32 _dses; + u32 eflags; + }; + struct { + u16 di, hdi; + u16 si, hsi; + u16 bp, hbp; + u16 _sp, _hsp; + u16 bx, hbx; + u16 dx, hdx; + u16 cx, hcx; + u16 ax, hax; + u16 gs, fs; + u16 es, ds; + u16 flags, hflags; + }; + struct { + u8 dil, dih, edi2, edi3; + u8 sil, sih, esi2, esi3; + u8 bpl, bph, ebp2, ebp3; + u8 _spl, _sph, _esp2, _esp3; + u8 bl, bh, ebx2, ebx3; + u8 dl, dh, edx2, edx3; + u8 cl, ch, ecx2, ecx3; + u8 al, ah, eax2, eax3; + }; + }; +}; +void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); + /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); @@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...); int vsprintf(char *buf, const char *fmt, va_list args); int printf(const char *fmt, ...); +/* regs.c */ +void initregs(struct biosregs *regs); + /* string.c */ int strcmp(const char *str1, const char *str2); size_t strnlen(const char *s, size_t maxlen); diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index 63eff3b04d01..4a46fab7162e 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,3 +1,6 @@ relocs vmlinux.bin.all vmlinux.relocs +vmlinux.lds +mkpiggy +piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 65551c9f8571..49c8a4c37d7c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,7 +19,9 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T -$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE +hostprogs-y := mkpiggy + +$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -29,7 +31,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_32) += relocs +hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< @@ -37,46 +39,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs -quiet_cmd_relocbin = BUILD $@ - cmd_relocbin = cat $(filter-out FORCE,$^) > $@ -$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE - $(call if_changed,relocbin) - -ifeq ($(CONFIG_X86_32),y) +vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs -ifdef CONFIG_RELOCATABLE -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,lzma) -else -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) -endif -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T -else +suffix-$(CONFIG_KERNEL_GZIP) := gz +suffix-$(CONFIG_KERNEL_BZIP2) := bz2 +suffix-$(CONFIG_KERNEL_LZMA) := lzma -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzma) - -LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T -endif +quiet_cmd_mkpiggy = MKPIGGY $@ + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) -suffix_$(CONFIG_KERNEL_GZIP) = gz -suffix_$(CONFIG_KERNEL_BZIP2) = bz2 -suffix_$(CONFIG_KERNEL_LZMA) = lzma - -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE - $(call if_changed,ld) +targets += piggy.S +$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE + $(call if_changed,mkpiggy) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3a8a866fb2e2..75e4f001e706 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -12,16 +12,16 @@ * the page directory. [According to comments etc elsewhere on a compressed * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] * - * Page 0 is deliberately kept safe, since System Management Mode code in + * Page 0 is deliberately kept safe, since System Management Mode code in * laptops may need to access the BIOS data stored there. This is also - * useful for future device drivers that either access the BIOS via VM86 + * useful for future device drivers that either access the BIOS via VM86 * mode. */ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.text + .text #include <linux/linkage.h> #include <asm/segment.h> @@ -29,161 +29,151 @@ #include <asm/boot.h> #include <asm/asm-offsets.h> -.section ".text.head","ax",@progbits + .section ".text.head","ax",@progbits ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) - jnz 1f + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ + testb $(1<<6), BP_loadflags(%esi) + jnz 1f cli - movl $(__BOOT_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - movl %eax,%ss + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (0x1e4+4)(%esi), %esp - call 1f -1: popl %ebp - subl $1b, %ebp + leal (BP_scratch+4)(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp -/* %ebp contains the address we are loaded at by the boot loader and %ebx +/* + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. */ #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx + movl %ebp, %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else - movl $LOAD_PHYSICAL_ADDR, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack */ - addl $(32768 + 18), %ebx - /* Align on a 4K boundary */ - addl $4095, %ebx - andl $~4095, %ebx - -/* Copy the compressed kernel to the end of our buffer + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx + + /* Set up the stack */ + leal boot_stack_end(%ebx), %esp + + /* Zero EFLAGS */ + pushl $0 + popfl + +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - pushl %esi - leal _end(%ebp), %esi - leal _end(%ebx), %edi - movl $(_end - startup_32), %ecx + pushl %esi + leal (_bss-4)(%ebp), %esi + leal (_bss-4)(%ebx), %edi + movl $(_bss - startup_32), %ecx + shrl $2, %ecx std - rep - movsb + rep movsl cld - popl %esi - -/* Compute the kernel start address. - */ -#ifdef CONFIG_RELOCATABLE - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp -#else - movl $LOAD_PHYSICAL_ADDR, %ebp -#endif + popl %esi /* * Jump to the relocated address. */ - leal relocated(%ebx), %eax - jmp *%eax + leal relocated(%ebx), %eax + jmp *%eax ENDPROC(startup_32) -.section ".text" + .text relocated: /* - * Clear BSS - */ - xorl %eax,%eax - leal _edata(%ebx),%edi - leal _end(%ebx), %ecx - subl %edi,%ecx - cld - rep - stosb - -/* - * Setup the stack for the decompressor + * Clear BSS (stack is currently empty) */ - leal boot_stack_end(%ebx), %esp + xorl %eax, %eax + leal _bss(%ebx), %edi + leal _ebss(%ebx), %ecx + subl %edi, %ecx + shrl $2, %ecx + rep stosl /* * Do the decompression, and jump to the new kernel.. */ - movl output_len(%ebx), %eax - pushl %eax - # push arguments for decompress_kernel: - pushl %ebp # output address - movl input_len(%ebx), %eax - pushl %eax # input_len - leal input_data(%ebx), %eax - pushl %eax # input_data - leal boot_heap(%ebx), %eax - pushl %eax # heap area - pushl %esi # real mode pointer - call decompress_kernel - addl $20, %esp - popl %ecx + leal z_extract_offset_negative(%ebx), %ebp + /* push arguments for decompress_kernel: */ + pushl %ebp /* output address */ + pushl $z_input_len /* input_len */ + leal input_data(%ebx), %eax + pushl %eax /* input_data */ + leal boot_heap(%ebx), %eax + pushl %eax /* heap area */ + pushl %esi /* real mode pointer */ + call decompress_kernel + addl $20, %esp #if CONFIG_RELOCATABLE -/* Find the address of the relocations. +/* + * Find the address of the relocations. */ - movl %ebp, %edi - addl %ecx, %edi + leal z_output_len(%ebp), %edi -/* Calculate the delta between where vmlinux was compiled to run +/* + * Calculate the delta between where vmlinux was compiled to run * and where it was actually loaded. */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ + movl %ebp, %ebx + subl $LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ /* * Process relocations. */ -1: subl $4, %edi - movl 0(%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b +1: subl $4, %edi + movl (%edi), %ecx + testl %ecx, %ecx + jz 2f + addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) + jmp 1b 2: #endif /* * Jump to the decompressed kernel. */ - xorl %ebx,%ebx - jmp *%ebp + xorl %ebx, %ebx + jmp *%ebp -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index ed4a82948002..f62c284db9eb 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -21,8 +21,8 @@ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.code32 -.text + .code32 + .text #include <linux/linkage.h> #include <asm/segment.h> @@ -33,12 +33,14 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> -.section ".text.head" + .section ".text.head" .code32 ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ testb $(1<<6), BP_loadflags(%esi) jnz 1f @@ -49,14 +51,15 @@ ENTRY(startup_32) movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (0x1e4+4)(%esi), %esp + leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp @@ -70,32 +73,28 @@ ENTRY(startup_32) testl %eax, %eax jnz no_longmode -/* Compute the delta between where we were compiled to run at +/* + * Compute the delta between where we were compiled to run at * and where the code will actually run at. - */ -/* %ebp contains the address we are loaded at by the boot loader and %ebx + * + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. */ #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(PMD_PAGE_SIZE -1), %ebx - andl $PMD_PAGE_MASK, %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else - movl $CONFIG_PHYSICAL_START, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addl $(32768 + 18 + 4095), %ebx - andl $~4095, %ebx + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx /* * Prepare for entering 64 bit mode @@ -114,7 +113,7 @@ ENTRY(startup_32) /* * Build early 4G boot pagetable */ - /* Initialize Page tables to 0*/ + /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax movl $((4096*6)/4), %ecx @@ -155,7 +154,8 @@ ENTRY(startup_32) btsl $_EFER_LME, %eax wrmsr - /* Setup for the jump to 64bit mode + /* + * Setup for the jump to 64bit mode * * When the jump is performend we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 @@ -184,7 +184,8 @@ no_longmode: #include "../../kernel/verify_cpu_64.S" - /* Be careful here startup_64 needs to be at a predictable + /* + * Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders * should look at the ELF header to find this address, as * it may change in the future. @@ -192,7 +193,8 @@ no_longmode: .code64 .org 0x200 ENTRY(startup_64) - /* We come here either from startup_32 or directly from a + /* + * We come here either from startup_32 or directly from a * 64bit bootloader. If we come here from a bootloader we depend on * an identity mapped page table being provied that maps our * entire text+data+bss and hopefully all of memory. @@ -209,50 +211,54 @@ ENTRY(startup_64) movl $0x20, %eax ltr %ax - /* Compute the decompressed kernel start address. It is where + /* + * Compute the decompressed kernel start address. It is where * we were loaded at aligned to a 2M boundary. %rbp contains the * decompressed kernel start address. * * If it is a relocatable kernel then decompress and run the kernel * from load address aligned to 2MB addr, otherwise decompress and - * run the kernel from CONFIG_PHYSICAL_START + * run the kernel from LOAD_PHYSICAL_ADDR + * + * We cannot rely on the calculation done in 32-bit mode, since we + * may have been invoked via the 64-bit entry point. */ /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(PMD_PAGE_SIZE - 1), %rbp - andq $PMD_PAGE_MASK, %rbp - movq %rbp, %rbx + movl BP_kernel_alignment(%rsi), %eax + decl %eax + addq %rax, %rbp + notq %rax + andq %rax, %rbp #else - movq $CONFIG_PHYSICAL_START, %rbp - movq %rbp, %rbx + movq $LOAD_PHYSICAL_ADDR, %rbp #endif - /* Replace the compressed data size with the uncompressed size */ - movl input_len(%rip), %eax - subq %rax, %rbx - movl output_len(%rip), %eax - addq %rax, %rbx - /* Add 8 bytes for every 32K input block */ - shrq $12, %rax - addq %rax, %rbx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addq $(32768 + 18 + 4095), %rbx - andq $~4095, %rbx - -/* Copy the compressed kernel to the end of our buffer + /* Target address to relocate to for decompression */ + leaq z_extract_offset(%rbp), %rbx + + /* Set up the stack */ + leaq boot_stack_end(%rbx), %rsp + + /* Zero EFLAGS */ + pushq $0 + popfq + +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - leaq _end_before_pgt(%rip), %r8 - leaq _end_before_pgt(%rbx), %r9 - movq $_end_before_pgt /* - $startup_32 */, %rcx -1: subq $8, %r8 - subq $8, %r9 - movq 0(%r8), %rax - movq %rax, 0(%r9) - subq $8, %rcx - jnz 1b + pushq %rsi + leaq (_bss-8)(%rip), %rsi + leaq (_bss-8)(%rbx), %rdi + movq $_bss /* - $startup_32 */, %rcx + shrq $3, %rcx + std + rep movsq + cld + popq %rsi /* * Jump to the relocated address. @@ -260,37 +266,28 @@ ENTRY(startup_64) leaq relocated(%rbx), %rax jmp *%rax -.section ".text" + .text relocated: /* - * Clear BSS + * Clear BSS (stack is currently empty) */ - xorq %rax, %rax - leaq _edata(%rbx), %rdi - leaq _end_before_pgt(%rbx), %rcx + xorl %eax, %eax + leaq _bss(%rip), %rdi + leaq _ebss(%rip), %rcx subq %rdi, %rcx - cld - rep - stosb - - /* Setup the stack */ - leaq boot_stack_end(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq + shrq $3, %rcx + rep stosq /* * Do the decompression, and jump to the new kernel.. */ - pushq %rsi # Save the real mode argument - movq %rsi, %rdi # real mode address - leaq boot_heap(%rip), %rsi # malloc area for uncompression - leaq input_data(%rip), %rdx # input_data - movl input_len(%rip), %eax - movq %rax, %rcx # input_len - movq %rbp, %r8 # output + pushq %rsi /* Save the real mode argument */ + movq %rsi, %rdi /* real mode address */ + leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ + leaq input_data(%rip), %rdx /* input_data */ + movl $z_input_len, %ecx /* input_len */ + movq %rbp, %r8 /* output target address */ call decompress_kernel popq %rsi @@ -311,11 +308,21 @@ gdt: .quad 0x0000000000000000 /* TS continued */ gdt_end: -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: + +/* + * Space for page tables (not in .bss so not zeroed) + */ + .section ".pgtable","a",@nobits + .balign 4096 +pgtable: + .fill 6*4096, 1, 0 diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e45be73684ff..842b2a36174a 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) + error("Destination address inappropriately aligned"); #ifdef CONFIG_X86_64 - if ((unsigned long)output & (__KERNEL_ALIGN - 1)) - error("Destination address not 2M aligned"); - if ((unsigned long)output >= 0xffffffffffUL) + if (heap > 0x3fffffffffffUL) error("Destination address too large"); #else - if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1)) - error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) error("Destination address too large"); +#endif #ifndef CONFIG_RELOCATABLE - if ((u32)output != LOAD_PHYSICAL_ADDR) + if ((unsigned long)output != LOAD_PHYSICAL_ADDR) error("Wrong destination address"); #endif -#endif if (!quiet) putstr("\nDecompressing Linux... "); diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c new file mode 100644 index 000000000000..bcbd36c41432 --- /dev/null +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -0,0 +1,97 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright (C) 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * H. Peter Anvin <hpa@linux.intel.com> + * + * ----------------------------------------------------------------------- */ + +/* + * Compute the desired load offset from a compressed program; outputs + * a small assembly wrapper with the appropriate symbols defined. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <inttypes.h> + +static uint32_t getle32(const void *p) +{ + const uint8_t *cp = p; + + return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) + + ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24); +} + +int main(int argc, char *argv[]) +{ + uint32_t olen; + long ilen; + unsigned long offs; + FILE *f; + + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); + return 1; + } + + /* Get the information for the compressed kernel image first */ + + f = fopen(argv[1], "r"); + if (!f) { + perror(argv[1]); + return 1; + } + + + if (fseek(f, -4L, SEEK_END)) { + perror(argv[1]); + } + fread(&olen, sizeof olen, 1, f); + ilen = ftell(f); + olen = getle32(&olen); + fclose(f); + + /* + * Now we have the input (compressed) and output (uncompressed) + * sizes, compute the necessary decompression offset... + */ + + offs = (olen > ilen) ? olen - ilen : 0; + offs += olen >> 12; /* Add 8 bytes for each 32K block */ + offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ + offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + + printf(".section \".rodata.compressed\",\"a\",@progbits\n"); + printf(".globl z_input_len\n"); + printf("z_input_len = %lu\n", ilen); + printf(".globl z_output_len\n"); + printf("z_output_len = %lu\n", (unsigned long)olen); + printf(".globl z_extract_offset\n"); + printf("z_extract_offset = 0x%lx\n", offs); + /* z_extract_offset_negative allows simplification of head_32.S */ + printf(".globl z_extract_offset_negative\n"); + printf("z_extract_offset_negative = -0x%lx\n", offs); + + printf(".globl input_data, input_data_end\n"); + printf("input_data:\n"); + printf(".incbin \"%s\"\n", argv[1]); + printf("input_data_end:\n"); + + return 0; +} diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S index bef1ac891bce..cc353e1b3ffd 100644 --- a/arch/x86/boot/compressed/vmlinux_64.lds +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,6 +1,17 @@ -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#undef i386 + +#include <asm/page_types.h> + +#ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) +#else +OUTPUT_ARCH(i386) +ENTRY(startup_32) +#endif + SECTIONS { /* Be careful parts of head_64.S assume startup_32 is at @@ -33,16 +44,22 @@ SECTIONS *(.data.*) _edata = . ; } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .bss : { _bss = . ; *(.bss) *(.bss.*) *(COMMON) - . = ALIGN(8); - _end_before_pgt = . ; - . = ALIGN(4096); - pgtable = . ; - . = . + 4096 * 6; + . = ALIGN(8); /* For convenience during zeroing */ _ebss = .; } +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + .pgtable : { + _pgtable = . ; + *(.pgtable) + _epgtable = . ; + } +#endif + _end = .; } diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr deleted file mode 100644 index f02382ae5c48..000000000000 --- a/arch/x86/boot/compressed/vmlinux.scr +++ /dev/null @@ -1,10 +0,0 @@ -SECTIONS -{ - .rodata.compressed : { - input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - output_len = . - 4; - input_data_end = .; - } -} diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds deleted file mode 100644 index bb3c48379c40..000000000000 --- a/arch/x86/boot/compressed/vmlinux_32.lds +++ /dev/null @@ -1,43 +0,0 @@ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(startup_32) -SECTIONS -{ - /* Be careful parts of head_32.S assume startup_32 is at - * address 0. - */ - . = 0; - .text.head : { - _head = . ; - *(.text.head) - _ehead = . ; - } - .rodata.compressed : { - *(.rodata.compressed) - } - .text : { - _text = .; /* Text */ - *(.text) - *(.text.*) - _etext = . ; - } - .rodata : { - _rodata = . ; - *(.rodata) /* read-only data */ - *(.rodata.*) - _erodata = . ; - } - .data : { - _data = . ; - *(.data) - *(.data.*) - _edata = . ; - } - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - _end = . ; - } -} diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 1aae8f3e5ca1..c501a5b466f8 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -22,17 +23,17 @@ */ static int read_mbr(u8 devno, void *buf) { - u16 ax, bx, cx, dx; + struct biosregs ireg, oreg; - ax = 0x0201; /* Legacy Read, one sector */ - cx = 0x0001; /* Sector 0-0-1 */ - dx = devno; - bx = (size_t)buf; - asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) - : : "esi", "edi", "memory"); + initregs(&ireg); + ireg.ax = 0x0201; /* Legacy Read, one sector */ + ireg.cx = 0x0001; /* Sector 0-0-1 */ + ireg.dl = devno; + ireg.bx = (size_t)buf; - return -(u8)ax; /* 0 or -1 */ + intcall(0x13, &ireg, &oreg); + + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) @@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) static int get_edd_info(u8 devno, struct edd_info *ei) { - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; memset(ei, 0, sizeof *ei); /* Check Extensions Present */ - ax = 0x4100; - bx = EDDMAGIC1; - dx = devno; - asm("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x41; + ireg.bx = EDDMAGIC1; + ireg.dl = devno; + intcall(0x13, &ireg, &oreg); - if ((u8)ax) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No extended information */ - if (bx != EDDMAGIC2) + if (oreg.bx != EDDMAGIC2) return -1; ei->device = devno; - ei->version = ax >> 8; /* EDD version number */ - ei->interface_support = cx; /* EDD functionality subsets */ + ei->version = oreg.ah; /* EDD version number */ + ei->interface_support = oreg.cx; /* EDD functionality subsets */ /* Extended Get Device Parameters */ ei->params.length = sizeof(ei->params); - ax = 0x4800; - dx = devno; - asm("pushfl; int $0x13; popfl" - : "+a" (ax), "+d" (dx), "=m" (ei->params) - : "S" (&ei->params) - : "ebx", "ecx", "edi"); + ireg.ah = 0x48; + ireg.si = (size_t)&ei->params; + intcall(0x13, &ireg, &oreg); /* Get legacy CHS parameters */ /* Ralf Brown recommends setting ES:DI to 0:0 */ - ax = 0x0800; - dx = devno; - di = 0; - asm("pushw %%es; " - "movw %%di,%%es; " - "pushfl; stc; int $0x13; setc %%al; popfl; " - "popw %%es" - : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) - : : "esi"); - - if ((u8)ax == 0) { - ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2); - ei->legacy_max_head = dx >> 8; - ei->legacy_sectors_per_track = cx & 0x3f; + ireg.ah = 0x08; + ireg.es = 0; + intcall(0x13, &ireg, &oreg); + + if (!(oreg.eflags & X86_EFLAGS_CF)) { + ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2); + ei->legacy_max_head = oreg.dh; + ei->legacy_sectors_per_track = oreg.cl & 0x3f; } return 0; diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 5d84d1c74e4c..b31cc54b4641 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -22,7 +22,8 @@ #include <asm/page_types.h> #include <asm/setup.h> #include "boot.h" -#include "offsets.h" +#include "voffset.h" +#include "zoffset.h" BOOTSEG = 0x07C0 /* original address of boot-sector */ SYSSEG = 0x1000 /* historical load address >> 4 */ @@ -115,7 +116,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x0209 # header version number (>= 0x0105) + .word 0x020a # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512 # end of setup code can be used by setup # for local heap purposes. -pad1: .word 0 +ext_loader_ver: + .byte 0 # Extended boot loader version +ext_loader_type: + .byte 0 # Extended boot loader type + cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # If nonzero, a 32-bit pointer # to the kernel command line. @@ -200,7 +205,7 @@ relocatable_kernel: .byte 1 #else relocatable_kernel: .byte 0 #endif -pad2: .byte 0 +min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment pad3: .word 0 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, @@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07 hardware_subarch_data: .quad 0 -payload_offset: .long input_data -payload_length: .long input_data_end-input_data +payload_offset: .long ZO_input_data +payload_length: .long ZO_z_input_len setup_data: .quad 0 # 64-bit physical pointer to # single linked list of # struct setup_data +pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) +#define VO_INIT_SIZE (VO__end - VO__text) +#if ZO_INIT_SIZE > VO_INIT_SIZE +#define INIT_SIZE ZO_INIT_SIZE +#else +#define INIT_SIZE VO_INIT_SIZE +#endif +init_size: .long INIT_SIZE # kernel initialization size + # End of setup header ##################################################### - .section ".inittext", "ax" + .section ".entrytext", "ax" start_of_setup: #ifdef SAFE_RESET_DISK_CONTROLLER # Reset the disk controller. diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 58f0415d3ae0..140172b895bd 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -61,11 +62,10 @@ static void copy_boot_params(void) */ static void keyboard_set_repeat(void) { - u16 ax = 0x0305; - u16 bx = 0; - asm volatile("int $0x16" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + struct biosregs ireg; + initregs(&ireg); + ireg.ax = 0x0305; + intcall(0x16, &ireg, NULL); } /* @@ -73,18 +73,22 @@ static void keyboard_set_repeat(void) */ static void query_ist(void) { + struct biosregs ireg, oreg; + /* Some older BIOSes apparently crash on this call, so filter it from machines too old to have SpeedStep at all. */ if (cpu.level < 6) return; - asm("int $0x15" - : "=a" (boot_params.ist_info.signature), - "=b" (boot_params.ist_info.command), - "=c" (boot_params.ist_info.event), - "=d" (boot_params.ist_info.perf_level) - : "a" (0x0000e980), /* IST Support */ - "d" (0x47534943)); /* Request value */ + initregs(&ireg); + ireg.ax = 0xe980; /* IST Support */ + ireg.edx = 0x47534943; /* Request value */ + intcall(0x15, &ireg, &oreg); + + boot_params.ist_info.signature = oreg.eax; + boot_params.ist_info.command = oreg.ebx; + boot_params.ist_info.event = oreg.ecx; + boot_params.ist_info.perf_level = oreg.edx; } /* @@ -93,13 +97,12 @@ static void query_ist(void) static void set_bios_mode(void) { #ifdef CONFIG_X86_64 - u32 eax, ebx; + struct biosregs ireg; - eax = 0xec00; - ebx = 2; - asm volatile("int $0x15" - : "+a" (eax), "+b" (ebx) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ax = 0xec00; + ireg.bx = 2; + intcall(0x15, &ireg, NULL); #endif } diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c index 911eaae5d696..a95a531148ef 100644 --- a/arch/x86/boot/mca.c +++ b/arch/x86/boot/mca.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -16,26 +17,22 @@ int query_mca(void) { - u8 err; - u16 es, bx, len; - - asm("pushw %%es ; " - "int $0x15 ; " - "setc %0 ; " - "movw %%es, %1 ; " - "popw %%es" - : "=acd" (err), "=acdSD" (es), "=b" (bx) - : "a" (0xc000)); - - if (err) + struct biosregs ireg, oreg; + u16 len; + + initregs(&ireg); + ireg.ah = 0xc0; + intcall(0x15, &ireg, &oreg); + + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No MCA present */ - set_fs(es); - len = rdfs16(bx); + set_fs(oreg.es); + len = rdfs16(oreg.bx); if (len > sizeof(boot_params.sys_desc_table)) len = sizeof(boot_params.sys_desc_table); - copy_from_fs(&boot_params.sys_desc_table, bx, len); + copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len); return 0; } diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 74b3d2ba84e9..cae3feb1035e 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -20,12 +20,16 @@ static int detect_memory_e820(void) { int count = 0; - u32 next = 0; - u32 size, id, edi; - u8 err; + struct biosregs ireg, oreg; struct e820entry *desc = boot_params.e820_map; static struct e820entry buf; /* static so it is zeroed */ + initregs(&ireg); + ireg.ax = 0xe820; + ireg.cx = sizeof buf; + ireg.edx = SMAP; + ireg.di = (size_t)&buf; + /* * Note: at least one BIOS is known which assumes that the * buffer pointed to by one e820 call is the same one as @@ -41,22 +45,13 @@ static int detect_memory_e820(void) */ do { - size = sizeof buf; - - /* Important: %edx and %esi are clobbered by some BIOSes, - so they must be either used for the error output - or explicitly marked clobbered. Given that, assume there - is something out there clobbering %ebp and %edi, too. */ - asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" - : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=D" (edi), "+m" (buf) - : "D" (&buf), "d" (SMAP), "a" (0xe820) - : "esi"); + intcall(0x15, &ireg, &oreg); + ireg.ebx = oreg.ebx; /* for next iteration... */ /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on the final, failing, probe. */ - if (err) + if (oreg.eflags & X86_EFLAGS_CF) break; /* Some BIOSes stop returning SMAP in the middle of @@ -64,60 +59,64 @@ static int detect_memory_e820(void) screwed up the map at that point, we might have a partial map, the full map, or complete garbage, so just return failure. */ - if (id != SMAP) { + if (oreg.eax != SMAP) { count = 0; break; } *desc++ = buf; count++; - } while (next && count < ARRAY_SIZE(boot_params.e820_map)); + } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; } static int detect_memory_e801(void) { - u16 ax, bx, cx, dx; - u8 err; + struct biosregs ireg, oreg; - bx = cx = dx = 0; - ax = 0xe801; - asm("stc; int $0x15; setc %0" - : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx)); + initregs(&ireg); + ireg.ax = 0xe801; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* Do we really need to do this? */ - if (cx || dx) { - ax = cx; - bx = dx; + if (oreg.cx || oreg.dx) { + oreg.ax = oreg.cx; + oreg.bx = oreg.dx; } - if (ax > 15*1024) + if (oreg.ax > 15*1024) { return -1; /* Bogus! */ - - /* This ignores memory above 16MB if we have a memory hole - there. If someone actually finds a machine with a memory - hole at 16MB and no support for 0E820h they should probably - generate a fake e820 map. */ - boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; + } else if (oreg.ax == 15*1024) { + boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; + } else { + /* + * This ignores memory above 16MB if we have a memory + * hole there. If someone actually finds a machine + * with a memory hole at 16MB and no support for + * 0E820h they should probably generate a fake e820 + * map. + */ + boot_params.alt_mem_k = oreg.ax; + } return 0; } static int detect_memory_88(void) { - u16 ax; - u8 err; + struct biosregs ireg, oreg; - ax = 0x8800; - asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); + initregs(&ireg); + ireg.ah = 0x88; + intcall(0x15, &ireg, &oreg); - boot_params.screen_info.ext_mem_k = ax; + boot_params.screen_info.ext_mem_k = oreg.ax; - return -err; + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } int detect_memory(void) diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c new file mode 100644 index 000000000000..958019b1cfa5 --- /dev/null +++ b/arch/x86/boot/regs.c @@ -0,0 +1,29 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * Simple helper function for initializing a register set. + * + * Note that this sets EFLAGS_CF in the input register set; this + * makes it easier to catch functions which do nothing but don't + * explicitly set CF. + */ + +#include "boot.h" + +void initregs(struct biosregs *reg) +{ + memset(reg, 0, sizeof *reg); + reg->eflags |= X86_EFLAGS_CF; + reg->ds = ds(); + reg->es = ds(); + reg->fs = fs(); + reg->gs = gs(); +} diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index bb8dc2de7969..0f6ec455a2b1 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -15,8 +15,11 @@ SECTIONS . = 497; .header : { *(.header) } + .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } .initdata : { *(.initdata) } + __end_init = .; + .text : { *(.text) } .text32 : { *(.text32) } @@ -52,4 +55,7 @@ SECTIONS . = ASSERT(_end <= 0x8000, "Setup too big!"); . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); + /* Necessary for the very-old-loader check to work... */ + . = ASSERT(__end_init <= 5*512, "init sections too big!"); + } diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 7e8e8b25f5f6..01ec69c901c7 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -22,24 +23,23 @@ void __attribute__((section(".inittext"))) putchar(int ch) { - unsigned char c = ch; + struct biosregs ireg; - if (c == '\n') + if (ch == '\n') putchar('\r'); /* \n -> \r\n */ - /* int $0x10 is known to have bugs involving touching registers - it shouldn't. Be extra conservative... */ - asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" - : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); + initregs(&ireg); + ireg.bx = 0x0007; + ireg.cx = 0x0001; + ireg.ah = 0x0e; + ireg.al = ch; + intcall(0x10, &ireg, NULL); } void __attribute__((section(".inittext"))) puts(const char *str) { - int n = 0; - while (*str) { + while (*str) putchar(*str++); - n++; - } } /* @@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str) static u8 gettime(void) { - u16 ax = 0x0200; - u16 cx, dx; + struct biosregs ireg, oreg; - asm volatile("int $0x1a" - : "+a" (ax), "=c" (cx), "=d" (dx) - : : "ebx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; + intcall(0x1a, &ireg, &oreg); - return dx >> 8; + return oreg.dh; } /* @@ -64,19 +63,24 @@ static u8 gettime(void) */ int getchar(void) { - u16 ax = 0; - asm volatile("int $0x16" : "+a" (ax)); + struct biosregs ireg, oreg; + + initregs(&ireg); + /* ireg.ah = 0x00; */ + intcall(0x16, &ireg, &oreg); - return ax & 0xff; + return oreg.al; } static int kbd_pending(void) { - u8 pending; - asm volatile("int $0x16; setnz %0" - : "=qm" (pending) - : "a" (0x0100)); - return pending; + struct biosregs ireg, oreg; + + initregs(&ireg); + ireg.ah = 0x01; + intcall(0x16, &ireg, &oreg); + + return !(oreg.eflags & X86_EFLAGS_ZF); } void kbd_flush(void) diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 3fa979c9c363..d660be492363 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi) static int set_bios_mode(u8 mode) { - u16 ax; + struct biosregs ireg, oreg; u8 new_mode; - ax = mode; /* AH=0x00 Set Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.al = mode; /* AH=0x00 Set Video Mode */ + intcall(0x10, &ireg, NULL); - ax = 0x0f00; /* Get Current Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + + ireg.ah = 0x0f; /* Get Current Video Mode */ + intcall(0x10, &ireg, &oreg); do_restore = 1; /* Assume video contents were lost */ - new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ + + /* Not all BIOSes are clean with the top bit */ + new_mode = ireg.al & 0x7f; if (new_mode == mode) return 0; /* Mode change OK */ @@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode) /* Mode setting failed, but we didn't end up where we started. That's bad. Try to revert to the original video mode. */ - ax = boot_params.screen_info.orig_video_mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = boot_params.screen_info.orig_video_mode; + intcall(0x10, &ireg, NULL); } #endif return -1; diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 4a58c8ce3f69..c700147d6ffb 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) - u16 ax, cx, di; + struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; struct mode_info *mi; @@ -39,13 +40,12 @@ static int vesa_probe(void) video_vesa.modes = GET_HEAP(struct mode_info, 0); - ax = 0x4f00; - di = (size_t)&vginfo; - asm(INT10 - : "+a" (ax), "+D" (di), "=m" (vginfo) - : : "ebx", "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f00; + ireg.di = (size_t)&vginfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f || + if (ireg.ax != 0x004f || vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ @@ -65,14 +65,12 @@ static int vesa_probe(void) memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ - ax = 0x4f01; - cx = mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + ireg.ax = 0x4f01; + ireg.cx = mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (ireg.ax != 0x004f) continue; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -111,20 +109,19 @@ static int vesa_probe(void) static int vesa_set_mode(struct mode_info *mode) { - u16 ax, bx, cx, di; + struct biosregs ireg, oreg; int is_graphic; u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ - ax = 0x4f01; - cx = vesa_mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f01; + ireg.cx = vesa_mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode) } - ax = 0x4f02; - bx = vesa_mode; - di = 0; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f02; + ireg.bx = vesa_mode; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; graphic_mode = is_graphic; @@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode) /* Switch DAC to 8-bit mode */ static void vesa_dac_set_8bits(void) { + struct biosregs ireg, oreg; u8 dac_size = 6; /* If possible, switch the DAC to 8-bit mode */ if (vginfo.capabilities & 1) { - u16 ax, bx; - - ax = 0x4f08; - bx = 0x0800; - asm volatile(INT10 - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - if (ax == 0x004f) - dac_size = bx >> 8; + initregs(&ireg); + ireg.ax = 0x4f08; + ireg.bh = 0x08; + intcall(0x10, &ireg, &oreg); + if (oreg.ax == 0x004f) + dac_size = oreg.bh; } /* Set the color sizes to the DAC size, and offsets to 0 */ - boot_params.screen_info.red_size = dac_size; + boot_params.screen_info.red_size = dac_size; boot_params.screen_info.green_size = dac_size; - boot_params.screen_info.blue_size = dac_size; - boot_params.screen_info.rsvd_size = dac_size; + boot_params.screen_info.blue_size = dac_size; + boot_params.screen_info.rsvd_size = dac_size; - boot_params.screen_info.red_pos = 0; - boot_params.screen_info.green_pos = 0; - boot_params.screen_info.blue_pos = 0; - boot_params.screen_info.rsvd_pos = 0; + boot_params.screen_info.red_pos = 0; + boot_params.screen_info.green_pos = 0; + boot_params.screen_info.blue_pos = 0; + boot_params.screen_info.rsvd_pos = 0; } /* Save the VESA protected mode info */ static void vesa_store_pm_info(void) { - u16 ax, bx, di, es; + struct biosregs ireg, oreg; - ax = 0x4f0a; - bx = di = 0; - asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" - : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f0a; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; - boot_params.screen_info.vesapm_seg = es; - boot_params.screen_info.vesapm_off = di; + boot_params.screen_info.vesapm_seg = oreg.es; + boot_params.screen_info.vesapm_off = oreg.di; } /* @@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void) void vesa_store_edid(void) { #ifdef CONFIG_FIRMWARE_EDID - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; /* Apparently used as a nonsense token... */ memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); @@ -260,33 +250,26 @@ void vesa_store_edid(void) if (vginfo.version < 0x0200) return; /* EDID requires VBE 2.0+ */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0000; /* Report DDC capabilities */ - cx = 0; /* Controller 0 */ - di = 0; /* ES:DI must be 0 by spec */ - - /* Note: The VBE DDC spec is different from the main VESA spec; - we genuinely have to assume all registers are destroyed here. */ - - asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es" - : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di) - : : "esi", "edx"); + initregs(&ireg); + ireg.ax = 0x4f15; /* VBE DDC */ + /* ireg.bx = 0x0000; */ /* Report DDC capabilities */ + /* ireg.cx = 0; */ /* Controller 0 */ + ireg.es = 0; /* ES:DI must be 0 by spec */ + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; /* No EDID */ /* BH = time in seconds to transfer EDD information */ /* BL = DDC level supported */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0001; /* Read EDID */ - cx = 0; /* Controller 0 */ - dx = 0; /* EDID block number */ - di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ - asm(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), - "+c" (cx), "+D" (di) - : : "esi"); + ireg.ax = 0x4f15; /* VBE DDC */ + ireg.bx = 0x0001; /* Read EDID */ + /* ireg.cx = 0; */ /* Controller 0 */ + /* ireg.dx = 0; */ /* EDID block number */ + ireg.es = ds(); + ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */ + intcall(0x10, &ireg, &oreg); #endif /* CONFIG_FIRMWARE_EDID */ } diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 9e0587a37768..8f8d827e254d 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -39,30 +40,30 @@ static __videocard video_vga; /* Set basic 80x25 mode */ static u8 vga_set_basic_mode(void) { + struct biosregs ireg, oreg; u16 ax; u8 rows; u8 mode; + initregs(&ireg); + #ifdef CONFIG_VIDEO_400_HACK if (adapter >= ADAPTER_VGA) { - asm volatile(INT10 - : : "a" (0x1202), "b" (0x0030) - : "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1202; + ireg.bx = 0x0030; + intcall(0x10, &ireg, NULL); } #endif ax = 0x0f00; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); - - mode = (u8)ax; + intcall(0x10, &ireg, &oreg); + mode = oreg.al; set_fs(0); rows = rdfs8(0x484); /* rows minus one */ #ifndef CONFIG_VIDEO_400_HACK - if ((ax == 0x5003 || ax == 0x5007) && + if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && (rows == 0 || rows == 24)) return mode; #endif @@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void) mode = 3; /* Set the mode */ - ax = mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = mode; /* AH=0: set mode */ + intcall(0x10, &ireg, NULL); do_restore = 1; return mode; } @@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void) static void vga_set_8font(void) { /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 8x8 font */ - asm volatile(INT10 : : "a" (0x1112), "b" (0)); + ireg.ax = 0x1112; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Use alternate print screen */ - asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); + ireg.ax = 0x1200; + ireg.bl = 0x20; + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 6-7 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); + ireg.ax = 0x0100; + ireg.cx = 0x0607; + intcall(0x10, &ireg, NULL); } static void vga_set_14font(void) { /* Set 9x14 font - 80x28 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 9x14 font */ - asm volatile(INT10 : : "a" (0x1111), "b" (0)); + ireg.ax = 0x1111; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 11-12 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); + ireg.ax = 0x0100; + ireg.cx = 0x0b0c; + intcall(0x10, &ireg, NULL); } static void vga_set_80x43(void) { /* Set 80x43 mode on VGA (not EGA) */ + struct biosregs ireg; + + initregs(&ireg); /* Set 350 scans */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); + ireg.ax = 0x1201; + ireg.bl = 0x30; + intcall(0x10, &ireg, NULL); /* Reset video mode */ - asm volatile(INT10 : : "a" (0x0003)); + ireg.ax = 0x0003; + intcall(0x10, &ireg, NULL); vga_set_8font(); } @@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode) */ static int vga_probe(void) { - u16 ega_bx; - static const char *card_name[] = { "CGA/MDA/HGC", "EGA", "VGA" }; @@ -240,26 +263,26 @@ static int vga_probe(void) sizeof(ega_modes)/sizeof(struct mode_info), sizeof(vga_modes)/sizeof(struct mode_info), }; - u8 vga_flag; - asm(INT10 - : "=b" (ega_bx) - : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ - : "ecx", "edx", "esi", "edi"); + struct biosregs ireg, oreg; + + initregs(&ireg); + + ireg.ax = 0x1200; + ireg.bl = 0x10; /* Check EGA/VGA */ + intcall(0x10, &ireg, &oreg); #ifndef _WAKEUP - boot_params.screen_info.orig_video_ega_bx = ega_bx; + boot_params.screen_info.orig_video_ega_bx = oreg.bx; #endif /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ - if ((u8)ega_bx != 0x10) { + if (oreg.bl != 0x10) { /* EGA/VGA */ - asm(INT10 - : "=a" (vga_flag) - : "a" (0x1a00) - : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1a00; + intcall(0x10, &ireg, &oreg); - if (vga_flag == 0x1a) { + if (oreg.al == 0x1a) { adapter = ADAPTER_VGA; #ifndef _WAKEUP boot_params.screen_info.orig_video_isVGA = 1; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 3bef2c1febe9..bad728b76fc2 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -18,33 +19,29 @@ static void store_cursor_position(void) { - u16 curpos; - u16 ax, bx; + struct biosregs ireg, oreg; - ax = 0x0300; - bx = 0; - asm(INT10 - : "=d" (curpos), "+a" (ax), "+b" (bx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x03; + intcall(0x10, &ireg, &oreg); - boot_params.screen_info.orig_x = curpos; - boot_params.screen_info.orig_y = curpos >> 8; + boot_params.screen_info.orig_x = oreg.dl; + boot_params.screen_info.orig_y = oreg.dh; } static void store_video_mode(void) { - u16 ax, page; + struct biosregs ireg, oreg; /* N.B.: the saving of the video page here is a bit silly, since we pretty much assume page 0 everywhere. */ - ax = 0x0f00; - asm(INT10 - : "+a" (ax), "=b" (page) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x0f; + intcall(0x10, &ireg, &oreg); /* Not all BIOSes are clean with respect to the top bit */ - boot_params.screen_info.orig_video_mode = ax & 0x7f; - boot_params.screen_info.orig_video_page = page >> 8; + boot_params.screen_info.orig_video_mode = oreg.al & 0x7f; + boot_params.screen_info.orig_video_page = oreg.bh; } /* @@ -257,7 +254,7 @@ static void restore_screen(void) int y; addr_t dst = 0; u16 *src = saved.data; - u16 ax, bx, dx; + struct biosregs ireg; if (graphic_mode) return; /* Can't restore onto a graphic mode */ @@ -296,12 +293,11 @@ static void restore_screen(void) } /* Restore cursor position */ - ax = 0x0200; /* Set cursor position */ - bx = 0; /* Page number (<< 8) */ - dx = (saved.cury << 8)+saved.curx; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; /* Set cursor position */ + ireg.dh = saved.cury; + ireg.dl = saved.curx; + intcall(0x10, &ireg, NULL); } #else #define save_screen() ((void)0) diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index ee63f5d14461..5bb174a997fc 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ extern int do_restore; /* Restore screen contents */ extern int graphic_mode; /* Graphics mode with linear frame buffer */ -/* - * int $0x10 is notorious for touching registers it shouldn't. - * gcc doesn't like %ebp being clobbered, so define it as a push/pop - * sequence here. - * - * A number of systems, including the original PC can clobber %bp in - * certain circumstances, like when scrolling. There exists at least - * one Trident video card which could clobber DS under a set of - * circumstances that we are unlikely to encounter (scrolling when - * using an extended graphics mode of more than 800x600 pixels), but - * it's cheap insurance to deal with that here. - */ -#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp" - /* Accessing VGA indexed registers */ static inline u8 in_idx(u16 port, u8 index) { diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 235b81d0f6f2..edb992ebef92 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:50:58 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:21:55 2009 # # CONFIG_64BIT is not set CONFIG_X86_32=y # CONFIG_X86_64 is not set CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf32-i386" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y CONFIG_ARCH_POPULATES_NODE_MAP=y # CONFIG_AUDIT_ARCH is not set CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_32_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y +CONFIG_X86_32_LAZY_GS=y CONFIG_KTIME_SCALAR=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -113,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -139,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_LBD is not set -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set @@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y +# CONFIG_X86_BIGSMP is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_VSMP is not set # CONFIG_X86_RDC321X is not set +# CONFIG_X86_32_NON_STANDARD is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,8 +245,10 @@ CONFIG_M686=y # CONFIG_GENERIC_CPU is not set CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_X86_XADD=y # CONFIG_X86_PPRO_FENCE is not set CONFIG_X86_WP_WORKS_OK=y @@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_CYRIX_32=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_32=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_CPU_SUP_TRANSMETA_32=y CONFIG_CPU_SUP_UMC_32=y CONFIG_X86_DS=y @@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_HIGHPTE=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y @@ -312,6 +332,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -322,8 +343,9 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_RELOCATABLE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set @@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_PCI_DOMAINS=y +# CONFIG_DMAR is not set CONFIG_PCIEPORTBUS=y # CONFIG_HOTPLUG_PCI_PCIE is not set CONFIG_PCIEAER=y @@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -481,7 +504,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -639,6 +661,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +1002,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1053,7 @@ CONFIG_E1000=y CONFIG_E1000E=y # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1068,7 @@ CONFIG_BNX2=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_IBMLS is not set @@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set CONFIG_HW_RANDOM_INTEL=y CONFIG_HW_RANDOM_AMD=y CONFIG_HW_RANDOM_GEODE=y @@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1424,6 +1457,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_I5K_AMB is not set @@ -1433,6 +1467,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1448,11 +1483,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y # CONFIG_FB_3DFX is not set # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_VT8623 is not set -# CONFIG_FB_CYBLA is not set # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set @@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2071,6 +2120,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y CONFIG_GENERIC_ACL=y # +# Caches +# +# CONFIG_FSCACHE is not set + +# # CD-ROM/DVD Filesystems # CONFIG_ISO9660_FS=y @@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y @@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y @@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y CONFIG_DEBUG_NX_TEST=m # CONFIG_4KSTACKS is not set CONFIG_DOUBLEFAULT=y -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_GEODE is not set # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_LGUEST is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set +CONFIG_BINARY_PRINTF=y # # Library routines @@ -2489,7 +2566,10 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_AUDIT_GENERIC=y CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9fe5d212ab4c..cee1dd2e69b2 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:44:16 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:22:00 2009 # CONFIG_64BIT=y # CONFIG_X86_32 is not set CONFIG_X86_64=y CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y CONFIG_ARCH_POPULATES_NODE_MAP=y CONFIG_AUDIT_ARCH=y CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_64_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y # CONFIG_KTIME_SCALAR is not set CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -114,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -140,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set CONFIG_BLOCK_COMPAT=y @@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y -# CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_VSMP is not set +# CONFIG_X86_UV is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_MCORE2 is not set CONFIG_GENERIC_CPU=y CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_TSC=y CONFIG_X86_CMPXCHG64=y @@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_64=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_X86_DS=y CONFIG_X86_PTRACE_BTS=y CONFIG_HPET_TIMER=y @@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y # CONFIG_I8K is not set CONFIG_MICROCODE=y CONFIG_MICROCODE_INTEL=y @@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set CONFIG_ARCH_PHYS_ADDR_T_64BIT=y CONFIG_DIRECT_GBPAGES=y CONFIG_NUMA=y @@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y CONFIG_X86_RESERVE_LOW_64K=y @@ -317,6 +333,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -325,9 +342,10 @@ CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +# CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_RELOCATABLE=y +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set @@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y CONFIG_K8_NB=y CONFIG_PCCARD=y @@ -481,7 +499,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -639,6 +656,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y # CONFIG_TIFM_CORE is not set # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set -# CONFIG_SGI_XP is not set # CONFIG_HP_ILO is not set -# CONFIG_SGI_GRU is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +995,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1046,7 @@ CONFIG_E1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_3C359 is not set @@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set # CONFIG_HW_RANDOM_INTEL is not set # CONFIG_HW_RANDOM_AMD is not set CONFIG_NVRAM=y @@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1416,6 +1442,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_I5K_AMB is not set @@ -1425,6 +1452,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1440,11 +1468,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2051,6 +2094,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2082,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y CONFIG_GENERIC_ACL=y # +# Caches +# +# CONFIG_FSCACHE is not set + +# # CD-ROM/DVD Filesystems # CONFIG_ISO9660_FS=y @@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set @@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m # CONFIG_IOMMU_DEBUG is not set -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_AES=y # CONFIG_CRYPTO_AES_X86_64 is not set +# CONFIG_CRYPTO_AES_NI_INTEL is not set # CONFIG_CRYPTO_ANUBIS is not set CONFIG_CRYPTO_ARC4=y # CONFIG_CRYPTO_BLOWFISH is not set @@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set +CONFIG_BINARY_PRINTF=y # # Library routines @@ -2464,7 +2536,10 @@ CONFIG_CRC32=y # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index f6aa18eadf71..1a37bcdc8606 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/stddef.h> +#include <linux/stringify.h> #include <asm/asm.h> /* @@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {} const unsigned char *const *find_nop_table(void); +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, newinstr, feature) \ + \ + "661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" + /* * Alternative instructions for different CPU types or capabilities. * @@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void); * without volatile and memory clobber. */ #define alternative(oldinstr, newinstr, feature) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature) : "memory") + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") /* * Alternative inline assembly with input. @@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void); * Best is to use constraints that are fixed size (like (%1) ... "r") * If you use variable sized constraints like "m" or "g" in the * replacement make sure to pad to the worst case length. + * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, feature, output, input...) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c[feat]\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" : output : [feat] "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : output : "i" (0), ## input) /* * use this macro(s) if you need more than one output parameter diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index f712344329bc..262e02820049 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -27,6 +27,8 @@ extern int amd_iommu_init(void); extern int amd_iommu_init_dma_ops(void); extern void amd_iommu_detect(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +extern void amd_iommu_flush_all_domains(void); +extern void amd_iommu_flush_all_devices(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 95c8cd9d22b5..0c878caaa0a2 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -194,6 +194,27 @@ #define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops domain for an IOMMU */ +extern bool amd_iommu_dump; +#define DUMP_printk(format, arg...) \ + do { \ + if (amd_iommu_dump) \ + printk(KERN_INFO "AMD IOMMU: " format, ## arg); \ + } while(0); + +/* + * Make iterating over all IOMMUs easier + */ +#define for_each_iommu(iommu) \ + list_for_each_entry((iommu), &amd_iommu_list, list) +#define for_each_iommu_safe(iommu, next) \ + list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list) + +#define APERTURE_RANGE_SHIFT 27 /* 128 MB */ +#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT) +#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT) +#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */ +#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) +#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) /* * This structure contains generic data for IOMMU protection domains @@ -210,6 +231,26 @@ struct protection_domain { }; /* + * For dynamic growth the aperture size is split into ranges of 128MB of + * DMA address space each. This struct represents one such range. + */ +struct aperture_range { + + /* address allocation bitmap */ + unsigned long *bitmap; + + /* + * Array of PTE pages for the aperture. In this array we save all the + * leaf pages of the domain page table used for the aperture. This way + * we don't need to walk the page table to find a specific PTE. We can + * just calculate its address in constant time. + */ + u64 *pte_pages[64]; + + unsigned long offset; +}; + +/* * Data container for a dma_ops specific protection domain */ struct dma_ops_domain { @@ -222,18 +263,10 @@ struct dma_ops_domain { unsigned long aperture_size; /* address we start to search for free addresses */ - unsigned long next_bit; - - /* address allocation bitmap */ - unsigned long *bitmap; + unsigned long next_address; - /* - * Array of PTE pages for the aperture. In this array we save all the - * leaf pages of the domain page table used for the aperture. This way - * we don't need to walk the page table to find a specific PTE. We can - * just calculate its address in constant time. - */ - u64 **pte_pages; + /* address space relevant data */ + struct aperture_range *aperture[APERTURE_MAX_RANGES]; /* This will be set to true when TLB needs to be flushed */ bool need_flush; diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 42f2f8377422..bb7d47925847 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); -#define EIM_8BIT_APIC_ID 0 -#define EIM_32BIT_APIC_ID 1 +extern int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* @@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void) return val; } -extern int x2apic, x2apic_phys; +extern int x2apic_phys; extern void check_x2apic(void); extern void enable_x2apic(void); -extern void enable_IR_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); static inline int x2apic_enabled(void) { @@ -183,6 +181,8 @@ static inline int x2apic_enabled(void) return 1; return 0; } + +#define x2apic_supported() (cpu_has_x2apic) #else static inline void check_x2apic(void) { @@ -190,28 +190,20 @@ static inline void check_x2apic(void) static inline void enable_x2apic(void) { } -static inline void enable_IR_x2apic(void) -{ -} static inline int x2apic_enabled(void) { return 0; } -#define x2apic 0 - +#define x2apic_preenabled 0 +#define x2apic_supported() 0 #endif -extern int get_physical_broadcast(void); +extern void enable_IR_x2apic(void); -#ifdef CONFIG_X86_X2APIC -static inline void ack_x2APIC_irq(void) -{ - /* Docs say use 0 for future compatibility */ - native_apic_msr_write(APIC_EOI, 0); -} -#endif +extern int get_physical_broadcast(void); +extern void apic_disable(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC(void); @@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } - +static inline void apic_disable(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_64 @@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_XAPIC(ver)) + if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) return (x >> 24) & 0xFF; else return (x >> 24) & 0x0F; @@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void) extern void default_setup_apic_routing(void); #ifdef CONFIG_X86_32 + +extern struct apic apic_default; + /* * Set up the logical destination ID. * diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index bc9514fb3b13..7ddb36ab933b 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -22,6 +22,7 @@ # define APIC_INTEGRATED(x) (1) #endif #define APIC_XAPIC(x) ((x) >= 0x14) +#define APIC_EXT_SPACE(x) ((x) & 0x80000000) #define APIC_TASKPRI 0x80 #define APIC_TPRI_MASK 0xFFu #define APIC_ARBPRI 0x90 @@ -116,7 +117,9 @@ #define APIC_TDR_DIV_32 0x8 #define APIC_TDR_DIV_64 0x9 #define APIC_TDR_DIV_128 0xA -#define APIC_EILVT0 0x500 +#define APIC_EFEAT 0x400 +#define APIC_ECTRL 0x410 +#define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) @@ -125,9 +128,6 @@ #define APIC_EILVT_MSG_NMI 0x4 #define APIC_EILVT_MSG_EXT 0x7 #define APIC_EILVT_MASKED (1 << 16) -#define APIC_EILVT1 0x510 -#define APIC_EILVT2 0x520 -#define APIC_EILVT3 0x530 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) #define APIC_BASE_MSR 0x800 diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6ba23dd9fc92..418e632d4a80 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -8,11 +8,26 @@ #ifdef __KERNEL__ +#include <asm/page_types.h> + /* Physical address where kernel should be loaded. */ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + (CONFIG_PHYSICAL_ALIGN - 1)) \ & ~(CONFIG_PHYSICAL_ALIGN - 1)) +/* Minimum kernel alignment, as a power of two */ +#ifdef CONFIG_x86_64 +#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +#else +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) +#endif +#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) + +#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ + (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)) +#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +#endif + #ifdef CONFIG_KERNEL_BZIP2 #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 433adaebf9b6..1724e8de317c 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -50,7 +50,8 @@ struct setup_header { __u32 ramdisk_size; __u32 bootsect_kludge; __u16 heap_end_ptr; - __u16 _pad1; + __u8 ext_loader_ver; + __u8 ext_loader_type; __u32 cmd_line_ptr; __u32 initrd_addr_max; __u32 kernel_alignment; diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h index 222802029fa6..d96c1ee3a95c 100644 --- a/arch/x86/include/asm/cpu_debug.h +++ b/arch/x86/include/asm/cpu_debug.h @@ -86,105 +86,7 @@ enum cpu_file_bit { CPU_VALUE_BIT, /* value */ }; -#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) - -/* - * DisplayFamily_DisplayModel Processor Families/Processor Number Series - * -------------------------- ------------------------------------------ - * 05_01, 05_02, 05_04 Pentium, Pentium with MMX - * - * 06_01 Pentium Pro - * 06_03, 06_05 Pentium II Xeon, Pentium II - * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III - * - * 06_09, 060D Pentium M - * - * 06_0E Core Duo, Core Solo - * - * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series, - * Core 2 Quad, Core 2 Extreme, Core 2 Duo, - * Pentium dual-core - * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650 - * - * 06_1C Atom - * - * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4 - * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D - * - * 0F_06 Xeon 7100, 5000 Series, Xeon MP, - * Pentium 4, Pentium D - */ - -/* Register processors bits */ -enum cpu_processor_bit { - CPU_NONE, -/* Intel */ - CPU_INTEL_PENTIUM_BIT, - CPU_INTEL_P6_BIT, - CPU_INTEL_PENTIUM_M_BIT, - CPU_INTEL_CORE_BIT, - CPU_INTEL_CORE2_BIT, - CPU_INTEL_ATOM_BIT, - CPU_INTEL_XEON_P4_BIT, - CPU_INTEL_XEON_MP_BIT, -/* AMD */ - CPU_AMD_K6_BIT, - CPU_AMD_K7_BIT, - CPU_AMD_K8_BIT, - CPU_AMD_0F_BIT, - CPU_AMD_10_BIT, - CPU_AMD_11_BIT, -}; - -#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT) -#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT) -#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT) -#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT) -#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT) -#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT) -#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT) -#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT) - -#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M) -#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2) -#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP) -#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM) -#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM) -#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM) -#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON) -#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON) -#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT) -#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON) -#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON) -#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT) -#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX) -#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE) -#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE) -#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT) -#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE) -#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT) -#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE) - -/* Select all supported Intel CPUs */ -#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE) - -#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT) -#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT) -#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT) -#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT) -#define CPU_AMD_10 (1 << CPU_AMD_10_BIT) -#define CPU_AMD_11 (1 << CPU_AMD_11_BIT) - -#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11) -#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS) -#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS) -#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS) - -/* Select all supported AMD CPUs */ -#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS) - -/* Select all supported CPUs */ -#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL) +#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT) #define MAX_CPU_FILES 512 @@ -220,7 +122,6 @@ struct cpu_debug_range { unsigned min; /* Register range min */ unsigned max; /* Register range max */ unsigned flag; /* Supported flags */ - unsigned model; /* Supported models */ }; #endif /* _ASM_X86_CPU_DEBUG_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bb83b1c397aa..19af42138f78 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,7 +22,7 @@ #define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ #define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ #define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ +#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ #define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ #define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ #define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ @@ -94,6 +94,7 @@ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ @@ -192,11 +193,11 @@ extern const char * const x86_power_flags[32]; #define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) #define setup_clear_cpu_cap(bit) do { \ clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cleared_cpu_caps); \ + set_bit(bit, (unsigned long *)cpu_caps_cleared); \ } while (0) #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ - clear_bit(bit, (unsigned long *)cleared_cpu_caps); \ + set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b762ea49bd70..3bd1777a4c8b 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -63,7 +63,26 @@ extern unsigned long io_apic_irqs; extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); extern void disable_IO_APIC(void); -extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); + +struct io_apic_irq_attr { + int ioapic; + int ioapic_pin; + int trigger; + int polarity; +}; + +static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, + int ioapic, int ioapic_pin, + int trigger, int polarity) +{ + irq_attr->ioapic = ioapic; + irq_attr->ioapic_pin = ioapic_pin; + irq_attr->trigger = trigger; + irq_attr->polarity = polarity; +} + +extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, + struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); extern void enable_IO_APIC(void); @@ -78,7 +97,11 @@ extern void eisa_set_level_irq(unsigned int irq); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); +extern void smp_generic_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); +#ifdef CONFIG_X86_IO_APIC +extern asmlinkage void smp_irq_move_cleanup_interrupt(void); +#endif #ifdef CONFIG_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 71c9e5183982..175adf58dd4f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) -#if 0 /* See comment in __save_init_fpu() below. */ +#if 0 /* See comment in fxsave() below. */ : [fx] "r" (fx), "m" (*fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); @@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return err; } -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - if (task_thread_info(tsk)->status & TS_XSAVE) - return xrstor_checking(&tsk->thread.xstate->xsave); - else - return fxrstor_checking(&tsk->thread.xstate->fxsave); -} - /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. The kernel data segment can be sometimes 0 and sometimes @@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) -#if 0 /* See comment in __fxsave_clear() below. */ +#if 0 /* See comment in fxsave() below. */ : [fx] "r" (fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "0" (0)); @@ -185,12 +177,9 @@ static inline void tolerant_fwait(void) asm volatile("fnclex ; fwait"); } -static inline void restore_fpu(struct task_struct *tsk) +/* perform fxrstor iff the processor has extended states, otherwise frstor */ +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { - if (task_thread_info(tsk)->status & TS_XSAVE) { - xrstor_checking(&tsk->thread.xstate->xsave); - return; - } /* * The "nop" is needed to make the instructions the same * length. @@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk) "nop ; frstor %1", "fxrstor %1", X86_FEATURE_FXSR, - "m" (tsk->thread.xstate->fxsave)); + "m" (*fx)); + + return 0; } /* We need a safe address that is cheap to find and that is already @@ -262,6 +253,14 @@ end: #endif /* CONFIG_X86_64 */ +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) + return xrstor_checking(&tsk->thread.xstate->xsave); + else + return fxrstor_checking(&tsk->thread.xstate->fxsave); +} + /* * Signal frame handlers... */ @@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void) /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context aswell. To prevent these kernel instructions - * in interrupt context interact wrongly with other user/kernel fpu usage, we + * get used from interrupt context as well. To prevent these kernel instructions + * in interrupt context interacting wrongly with other user/kernel fpu usage, we * should use them only in the context of irq_ts_save/restore() */ static inline int irq_ts_save(void) { /* - * If we are in process context, we are ok to take a spurious DNA fault. - * Otherwise, doing clts() in process context require pre-emption to - * be disabled or some heavy lifting like kernel_fpu_begin() + * If in process context and not atomic, we can take a spurious DNA fault. + * Otherwise, doing clts() in process context requires disabling preemption + * or some heavy lifting like kernel_fpu_begin() */ - if (!in_interrupt()) + if (!in_atomic()) return 0; if (read_cr0() & X86_CR0_TS) { diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 1a99e6c092af..58d7091eeb1f 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip; extern void mask_8259A(void); extern void unmask_8259A(void); -#ifdef CONFIG_X86_32 -extern void init_ISA_irqs(void); -#endif - #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 9d826e436010..daf866ed0612 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -154,22 +154,19 @@ extern int timer_through_8259; extern int io_apic_get_unique_id(int ioapic, int apic_id); extern int io_apic_get_version(int ioapic); extern int io_apic_get_redir_entries(int ioapic); -extern int io_apic_set_pci_routing(int ioapic, int pin, int irq, - int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ +struct io_apic_irq_attr; +extern int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); -#ifdef CONFIG_X86_64 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries); -#endif extern void probe_nr_irqs_gsi(void); diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 86af26091d6c..0e9fe1d9d971 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_IOMAP_H +#define _ASM_X86_IOMAP_H + /* * Copyright © 2008 Ingo Molnar * @@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); + +#endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 0396760fccb8..f275e2244505 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -1,6 +1,6 @@ #ifndef _ASM_X86_IRQ_REMAPPING_H #define _ASM_X86_IRQ_REMAPPING_H -#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8) +#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) #endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47c..910b5a3d6751 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -34,6 +34,7 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 +# define IA32_SYSCALL_VECTOR 0x80 #else # define IA32_SYSCALL_VECTOR 0x80 #endif diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index 54c8cc53b24d..c2d1f3b58e5f 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); extern int k8_scan_nodes(unsigned long start, unsigned long end); +#ifdef CONFIG_K8_NB +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; +} +#else +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return NULL; +} +#endif + + #endif /* _ASM_X86_K8_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index c882664716c1..ef51b501e22a 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -9,20 +9,31 @@ struct cpu_signature { struct device; +enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; + struct microcode_ops { - int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); - int (*request_microcode_fw) (int cpu, struct device *device); + enum ucode_state (*request_microcode_user) (int cpu, + const void __user *buf, size_t size); - void (*apply_microcode) (int cpu); + enum ucode_state (*request_microcode_fw) (int cpu, + struct device *device); - int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); void (*microcode_fini_cpu) (int cpu); + + /* + * The generic 'microcode_core' part guarantees that + * the callbacks below run on a target cpu when they + * are being called. + * See also the "Synchronization" section in microcode_core.c. + */ + int (*apply_microcode) (int cpu); + int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); }; struct ucode_cpu_info { - struct cpu_signature cpu_sig; - int valid; - void *mc; + struct cpu_signature cpu_sig; + int valid; + void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 642fc7fc8cdc..e2a1bb6d71ea 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -61,9 +61,11 @@ extern void get_smp_config(void); #ifdef CONFIG_X86_MPPARSE extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); +extern int enable_update_mptable; #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } +#define enable_update_mptable 0 #endif void __cpuinit generic_processor_info(int apicid, int version); @@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs(void); -extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low); +struct device; +extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level, + int active_high_low); extern int acpi_probe_gsi(void); #ifdef CONFIG_X86_IO_APIC -extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity); extern int mp_find_ioapic(int gsi); extern int mp_find_ioapic_pin(int ioapic, int gsi); -#else -static inline int -mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ - return 0; -} #endif #else /* !CONFIG_ACPI: */ static inline int acpi_probe_gsi(void) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ec41fc16c167..4d58d04fca83 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -121,7 +121,6 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_HWCR 0xc0010015 #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c45a0a568dff..c97264409934 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void) * but since they are power of two we could use a * cheaper way --cvg */ - return nmi_watchdog & 0x3; + return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC); } #endif diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 064ed6df4cbe..c4ae822e415f 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, extern void numa_init_array(void); extern int numa_off; -extern void srat_reserve_add_area(int nodeid); -extern int hotadd_percent; - extern s16 apicid_to_node[MAX_LOCAL_APIC]; extern unsigned long numa_free_all_bootmem(void); @@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); #ifdef CONFIG_NUMA +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + extern void __init init_cpu_to_node(void); extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0f915ae649a7..6f1b7331313f 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE; extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); -extern void initmem_init(unsigned long, unsigned long); -extern void free_initmem(void); extern void setup_bootmem_allocator(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d38c91b70248..8d382d3abf38 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,22 +32,14 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START CONFIG_PHYSICAL_START -#define __KERNEL_ALIGN 0x200000 - -/* - * Make sure kernel is aligned to 2MB address. Catching it at compile - * time is better. Change your config file and compile the kernel - * for a 2MB aligned address (CONFIG_PHYSICAL_START) - */ -#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 -#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" -#endif +#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ + (CONFIG_PHYSICAL_ALIGN - 1)) & \ + ~(CONFIG_PHYSICAL_ALIGN - 1)) #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86_64/mm.txt for a description of the memory map. */ +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 48 @@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long); #define vmemmap ((struct page *)VMEMMAP_START) -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); - -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); -extern void free_initmem(void); - extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 826ad37006ab..6473f5ccff85 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); + +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +extern void free_initmem(void); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PAGE_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a53da004e08e..4fb37c8a0832 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -56,6 +56,7 @@ struct desc_ptr; struct tss_struct; struct mm_struct; struct desc_struct; +struct task_struct; /* * Wrapper type for pointers to code which uses the non-standard @@ -203,7 +204,8 @@ struct pv_cpu_ops { void (*swapgs)(void); - struct pv_lazy_ops lazy_mode; + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); }; struct pv_irq_ops { @@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode { }; enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_enter_lazy_cpu(void); -void paravirt_leave_lazy_cpu(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); -void paravirt_leave_lazy(enum paravirt_lazy_mode mode); -#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE -static inline void arch_enter_lazy_cpu_mode(void) +#define __HAVE_ARCH_START_CONTEXT_SWITCH +static inline void arch_start_context_switch(struct task_struct *prev) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); + PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev); } -static inline void arch_leave_lazy_cpu_mode(void) +static inline void arch_end_context_switch(struct task_struct *next) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); + PVOP_VCALL1(pv_cpu_ops.end_context_switch, next); } -void arch_flush_lazy_cpu_mode(void); - #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 29d96d168bc0..18ef7ebf2631 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base) #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) +#define arch_end_context_switch(prev) do {} while(0) + #endif /* CONFIG_PARAVIRT */ /* @@ -503,6 +505,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ +extern int direct_gbpages; + /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6b87bc6d5018..abde308fdb0f 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); -#endif /* !__ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ - #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", \ __FILE__, __LINE__, &(e), pte_val(e)) @@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define update_mmu_cache(vma, address, pte) do { } while (0) -extern int direct_gbpages; - /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index fbf42b8e0383..766ea16fbbbd 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc20000000000, UL) -#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) #define MODULES_VADDR _AC(0xffffffffa0000000, UL) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b8238dc8786d..4d258ad76a0f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -273,7 +273,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern int nx_enabled; -extern void set_nx(void); #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c2cceae709c8..87ede2f31bc7 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -135,7 +135,8 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct tss_struct doublefault_tss; -extern __u32 cleared_cpu_caps[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); @@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary); extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; struct thread_struct { /* Cached TLS descriptors: */ @@ -427,8 +425,12 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif +#ifdef CONFIG_X86_32 unsigned long ip; +#endif +#ifdef CONFIG_X86_64 unsigned long fs; +#endif unsigned long gs; /* Hardware debugging registers: */ unsigned long debugreg0; @@ -814,6 +816,7 @@ extern unsigned int BIOS_revision; /* Boot loader type from the setup header: */ extern int bootloader_type; +extern int bootloader_version; extern char ignore_fpu_irq; @@ -874,7 +877,6 @@ static inline void spin_lock_prefetch(const void *x) .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PERCPU, \ } /* diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index a4737dddfd58..64cf2d24fad1 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -48,9 +48,15 @@ #endif #ifdef CONFIG_X86_64 +#ifdef CONFIG_PARAVIRT +/* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_PGE 0 +#else +#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) +#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) +#endif +#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) #define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) #define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index bdc2ada05ae0..4093d1ed6db2 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -33,7 +33,6 @@ struct x86_quirks { int (*setup_ioapic_ids)(void); }; -extern void x86_quirk_pre_intr_init(void); extern void x86_quirk_intr_init(void); extern void x86_quirk_trap_init(void); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 19e0d88b966d..6a84ed166aec 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void); static inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR)); + return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); } #endif diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index e3cc3c063ec5..4517d6b93188 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ +# define MAX_PHYSMEM_BITS 46 #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 7043408f6904..372b76edd63f 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -1,7 +1,7 @@ /* * syscalls.h - Linux syscall interfaces (arch-specific) * - * Copyright (c) 2008 Jaswinder Singh + * Copyright (c) 2008 Jaswinder Singh Rajput * * This file is released under the GPLv2. * See the file COPYING for more details. @@ -12,50 +12,55 @@ #include <linux/compiler.h> #include <linux/linkage.h> -#include <linux/types.h> #include <linux/signal.h> +#include <linux/types.h> /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/process.c */ +int sys_fork(struct pt_regs *); +int sys_vfork(struct pt_regs *); + /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/signal.c */ +long sys_rt_sigreturn(struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 +/* kernel/ioport.c */ +long sys_iopl(struct pt_regs *); + /* kernel/process_32.c */ -int sys_fork(struct pt_regs *); int sys_clone(struct pt_regs *); -int sys_vfork(struct pt_regs *); int sys_execve(struct pt_regs *); -/* kernel/signal_32.c */ +/* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); - -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); /* kernel/sys_i386_32.c */ +struct mmap_arg_struct; +struct sel_arg_struct; +struct oldold_utsname; +struct old_utsname; + asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct mmap_arg_struct; asmlinkage int old_mmap(struct mmap_arg_struct __user *); -struct sel_arg_struct; asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); -struct old_utsname; asmlinkage int sys_uname(struct old_utsname __user *); -struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ @@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ +/* kernel/ioport.c */ +asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + /* kernel/process_64.c */ -asmlinkage long sys_fork(struct pt_regs *); asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); -asmlinkage long sys_vfork(struct pt_regs *); asmlinkage long sys_execve(char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - -/* kernel/signal_64.c */ +/* kernel/signal.c */ asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); /* kernel/sys_x86_64.c */ +struct new_utsname; + asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct new_utsname; asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8820a73ae090..602c769fc98c 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -94,7 +94,8 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ +#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ +#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -116,6 +117,7 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) /* work to do in syscall_trace_enter() */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f44b49abca49..066ef590d7e0 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -203,7 +203,8 @@ struct pci_bus; void x86_pci_root_bus_res_quirks(struct pci_bus *b); #ifdef CONFIG_SMP -#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) +#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ + (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)) #define smt_capable() (smp_num_siblings > 1) #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b86..bfd74c032fca 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -2,6 +2,7 @@ #define _ASM_X86_TRAPS_H #include <asm/debugreg.h> +#include <asm/siginfo.h> /* TRAP_TRACE, ... */ #ifdef CONFIG_X86_32 #define dotraplinkage @@ -13,6 +14,9 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); +asmlinkage void xen_debug(void); +asmlinkage void xen_int3(void); +asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition) } extern int panic_on_unrecovered_nmi; -extern int kstack_depth_to_print; void math_error(void __user *); void math_emulate(struct math_emu_info *); diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 9b0e61bf7a88..bddd44f2f0ab 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -37,7 +37,7 @@ #define UV_CPUS_PER_ACT_STATUS 32 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 -#define UV_ACTIVATION_DESCRIPTOR_SIZE 32 +#define UV_ADP_SIZE 32 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 #define UV_NET_ENDPOINT_INTD 0x38 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d3a98ea1062e..341070f7ad5c 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -133,6 +133,7 @@ struct uv_scir_s { struct uv_hub_info_s { unsigned long global_mmr_base; unsigned long gpa_mask; + unsigned int gnode_extra; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) -#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper) +#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) +#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) #define UV_LOCAL_MMR_BASE 0xf4000000UL #define UV_GLOBAL_MMR32_BASE 0xf8000000UL @@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ - ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT) + ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) #define UV_APIC_PNODE_SHIFT 6 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 88d1bfc847d3..235f5927bb97 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp) obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit_$(BITS).o +obj-y += setup.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 723989d7f802..631086159c53 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -33,6 +33,7 @@ #include <linux/irq.h> #include <linux/bootmem.h> #include <linux/ioport.h> +#include <linux/pci.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) * success: return IRQ number (>=0) * failure: return < 0 */ -int acpi_register_gsi(u32 gsi, int triggering, int polarity) +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { unsigned int irq; unsigned int plat_gsi = gsi; @@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity) * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (triggering == ACPI_LEVEL_SENSITIVE) + if (trigger == ACPI_LEVEL_SENSITIVE) eisa_set_level_irq(gsi); } #endif #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, triggering, polarity); + plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif acpi_gsi_to_irq(plat_gsi, &irq); @@ -903,10 +904,8 @@ extern int es7000_plat; #endif static struct { - int apic_id; int gsi_base; int gsi_end; - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; int mp_find_ioapic(int gsi) @@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].apicid = uniq_ioapic_id(id); -#ifdef CONFIG_X86_32 mp_ioapics[idx].apicver = io_apic_get_version(idx); -#else - mp_ioapics[idx].apicver = 0; -#endif + /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); @@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void) } } -int mp_register_gsi(u32 gsi, int triggering, int polarity) +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) { +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; int ioapic; - int ioapic_pin; -#ifdef CONFIG_X86_32 -#define MAX_GSI_NUM 4096 -#define IRQ_COMPRESSION_START 64 + u8 pin; - static int pci_irq = IRQ_COMPRESSION_START; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, and IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; -#else + if (!acpi_ioapic) + return 0; + if (!dev) + return 0; + if (dev->bus != &pci_bus_type) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mp_ioapics[ioapic].apicid; + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + save_mp_irq(&mp_irq); +#endif + return 0; +} + +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + int ioapic; + int ioapic_pin; + struct io_apic_irq_attr irq_attr; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; -#endif /* Don't set up the ACPI SCI because it's already set up */ if (acpi_gbl_FADT.sci_interrupt == gsi) @@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) gsi = ioapic_renumber_irq(ioapic, gsi); #endif - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + "%d-%d\n", mp_ioapics[ioapic].apicid, ioapic_pin); return gsi; } - if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); -#ifdef CONFIG_X86_32 - return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); -#else - return gsi; -#endif - } - - set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); -#ifdef CONFIG_X86_32 - /* - * For GSI >= 64, use IRQ compression - */ - if ((gsi >= IRQ_COMPRESSION_START) - && (triggering == ACPI_LEVEL_SENSITIVE)) { - /* - * For PCI devices assign IRQs in order, avoiding gaps - * due to unused I/O APIC pins. - */ - int irq = gsi; - if (gsi < MAX_GSI_NUM) { - /* - * Retain the VIA chipset work-around (gsi > 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_gbl_FADT.sci_interrupt) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } -#endif - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, - polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} -int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin, - u32 gsi, int triggering, int polarity) -{ -#ifdef CONFIG_X86_MPPARSE - struct mpc_intsrc mp_irq; - int ioapic; + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); - if (!acpi_ioapic) - return 0; + set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, + trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + io_apic_set_pci_routing(dev, gsi, &irq_attr); - /* print the entry should happen on mptable identically */ - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | - (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); - mp_irq.srcbus = number; - mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); - ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id; - mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - - save_mp_irq(&mp_irq); -#endif - return 0; + return gsi; } /* diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9def..167bc16ce0e5 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@ always := wakeup.bin targets := wakeup.elf wakeup.lds -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o +wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 000000000000..f51eb0bb56ce --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 000000000000..6206033ba202 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a97db99dad52..1c60554537c3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,7 +55,16 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 + **pte_page, gfp_t gfp); +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages); +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif #ifdef CONFIG_AMD_IOMMU_STATS @@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) + for_each_iommu(iommu) iommu_poll_events(iommu); return IRQ_HANDLED; @@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid) __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, domid, 1, 1); - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { spin_lock_irqsave(&iommu->lock, flags); __iommu_queue_command(iommu, &cmd); __iommu_completion_wait(iommu); @@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid) } } +void amd_iommu_flush_all_domains(void) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + iommu_flush_domain(i); + } +} + +void amd_iommu_flush_all_devices(void) +{ + struct amd_iommu *iommu; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] == NULL) + continue; + + iommu = amd_iommu_rlookup_table[i]; + if (!iommu) + continue; + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom, unsigned long phys_addr, int prot) { - u64 __pte, *pte, *page; + u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); @@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom, if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, * as allocated in the aperture */ if (addr < dma_dom->aperture_size) - __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + __set_bit(addr >> PAGE_SHIFT, + dma_dom->aperture[0]->bitmap); } return 0; @@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, ****************************************************************************/ /* - * The address allocator core function. + * The address allocator core functions. * * called with domain->lock held */ + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64* fetch_pte(struct protection_domain *domain, + unsigned long address) +{ + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function is used to add a new aperture range to an existing + * aperture in case of dma_ops domain allocation or address allocation + * failure. + */ +static int alloc_new_range(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, + bool populate, gfp_t gfp) +{ + int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i; + +#ifdef CONFIG_IOMMU_STRESS + populate = false; +#endif + + if (index >= APERTURE_MAX_RANGES) + return -ENOMEM; + + dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp); + if (!dma_dom->aperture[index]) + return -ENOMEM; + + dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp); + if (!dma_dom->aperture[index]->bitmap) + goto out_free; + + dma_dom->aperture[index]->offset = dma_dom->aperture_size; + + if (populate) { + unsigned long address = dma_dom->aperture_size; + int i, num_ptes = APERTURE_RANGE_PAGES / 512; + u64 *pte, *pte_page; + + for (i = 0; i < num_ptes; ++i) { + pte = alloc_pte(&dma_dom->domain, address, + &pte_page, gfp); + if (!pte) + goto out_free; + + dma_dom->aperture[index]->pte_pages[i] = pte_page; + + address += APERTURE_RANGE_SIZE / 64; + } + } + + dma_dom->aperture_size += APERTURE_RANGE_SIZE; + + /* Intialize the exclusion range if necessary */ + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + /* + * Check for areas already mapped as present in the new aperture + * range and mark those pages as reserved in the allocator. Such + * mappings may already exist as a result of requested unity + * mappings for devices. + */ + for (i = dma_dom->aperture[index]->offset; + i < dma_dom->aperture_size; + i += PAGE_SIZE) { + u64 *pte = fetch_pte(&dma_dom->domain, i); + if (!pte || !IOMMU_PTE_PRESENT(*pte)) + continue; + + dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); + } + + return 0; + +out_free: + free_page((unsigned long)dma_dom->aperture[index]->bitmap); + + kfree(dma_dom->aperture[index]); + dma_dom->aperture[index] = NULL; + + return -ENOMEM; +} + +static unsigned long dma_ops_area_alloc(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages, + unsigned long align_mask, + u64 dma_mask, + unsigned long start) +{ + unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE; + int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT; + int i = start >> APERTURE_RANGE_SHIFT; + unsigned long boundary_size; + unsigned long address = -1; + unsigned long limit; + + next_bit >>= PAGE_SHIFT; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + for (;i < max_index; ++i) { + unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT; + + if (dom->aperture[i]->offset >= dma_mask) + break; + + limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset, + dma_mask >> PAGE_SHIFT); + + address = iommu_area_alloc(dom->aperture[i]->bitmap, + limit, next_bit, pages, 0, + boundary_size, align_mask); + if (address != -1) { + address = dom->aperture[i]->offset + + (address << PAGE_SHIFT); + dom->next_address = address + (pages << PAGE_SHIFT); + break; + } + + next_bit = 0; + } + + return address; +} + static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, unsigned long align_mask, u64 dma_mask) { - unsigned long limit; unsigned long address; - unsigned long boundary_size; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, - dma_mask >> PAGE_SHIFT); +#ifdef CONFIG_IOMMU_STRESS + dom->next_address = 0; + dom->need_flush = true; +#endif - if (dom->next_bit >= limit) { - dom->next_bit = 0; - dom->need_flush = true; - } + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, dom->next_address); - address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, align_mask); if (address == -1) { - address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, align_mask); + dom->next_address = 0; + address = dma_ops_area_alloc(dev, dom, pages, align_mask, + dma_mask, 0); dom->need_flush = true; } - if (likely(address != -1)) { - dom->next_bit = address + pages; - address <<= PAGE_SHIFT; - } else + if (unlikely(address == -1)) address = bad_dma_address; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) { - address >>= PAGE_SHIFT; - iommu_area_free(dom->bitmap, address, pages); + unsigned i = address >> APERTURE_RANGE_SHIFT; + struct aperture_range *range = dom->aperture[i]; - if (address >= dom->next_bit) + BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL); + +#ifdef CONFIG_IOMMU_STRESS + if (i < 4) + return; +#endif + + if (address >= dom->next_address) dom->need_flush = true; + + address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; + + iommu_area_free(range->bitmap, address, pages); + } /**************************************************************************** @@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) { - unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; if (start_page + pages > last_page) pages = last_page - start_page; - iommu_area_reserve(dom->bitmap, start_page, pages); + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } } static void free_pagetable(struct protection_domain *domain) @@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain) */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { + int i; + if (!dom) return; free_pagetable(&dom->domain); - kfree(dom->pte_pages); - - kfree(dom->bitmap); + for (i = 0; i < APERTURE_MAX_RANGES; ++i) { + if (!dom->aperture[i]) + continue; + free_page((unsigned long)dom->aperture[i]->bitmap); + kfree(dom->aperture[i]); + } kfree(dom); } @@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, - unsigned order) +static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) { struct dma_ops_domain *dma_dom; - unsigned i, num_pte_pages; - u64 *l2_pde; - u64 address; - - /* - * Currently the DMA aperture must be between 32 MB and 1GB in size - */ - if ((order < 25) || (order > 30)) - return NULL; dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); if (!dma_dom) @@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; - dma_dom->aperture_size = (1ULL << order); - dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), - GFP_KERNEL); - if (!dma_dom->bitmap) - goto free_dma_dom; - /* - * mark the first page as allocated so we never return 0 as - * a valid dma-address. So we can use 0 as error value - */ - dma_dom->bitmap[0] = 1; - dma_dom->next_bit = 0; dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; - /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); - } + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + goto free_dma_dom; /* - * At the last step, build the page tables so we don't need to - * allocate page table pages in the dma_ops mapping/unmapping - * path. + * mark the first page as allocated so we never return 0 as + * a valid dma-address. So we can use 0 as error value */ - num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); - dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), - GFP_KERNEL); - if (!dma_dom->pte_pages) - goto free_dma_dom; - - l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); - if (l2_pde == NULL) - goto free_dma_dom; + dma_dom->aperture[0]->bitmap[0] = 1; + dma_dom->next_address = 0; - dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); - - for (i = 0; i < num_pte_pages; ++i) { - dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); - if (!dma_dom->pte_pages[i]) - goto free_dma_dom; - address = virt_to_phys(dma_dom->pte_pages[i]); - l2_pde[i] = IOMMU_L1_PDE(address); - } return dma_dom; @@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb, struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; unsigned long flags; if (devid > amd_iommu_last_bdf) @@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb, "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { - case BUS_NOTIFY_BOUND_DRIVER: - if (domain) - goto out; - dma_domain = find_protection_domain(devid); - if (!dma_domain) - dma_domain = iommu->default_dom; - attach_device(iommu, &dma_domain->domain, devid); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", dma_domain->domain.id, dev_name(dev)); - break; - case BUS_NOTIFY_UNBIND_DRIVER: + case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; detach_device(domain, devid); @@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu, order); + dma_domain = dma_ops_domain_alloc(iommu); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev, dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; attach_device(*iommu, *domain, *bdf); - printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device %s\n", (*domain)->id, dev_name(dev)); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) @@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev, } /* + * If the pte_page is not yet allocated this function is called + */ +static u64* alloc_pte(struct protection_domain *dom, + unsigned long address, u64 **pte_page, gfp_t gfp) +{ + u64 *pte, *page; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page) + *pte_page = pte; + + pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + + return pte; +} + +/* + * This function fetches the PTE for a given address in the aperture + */ +static u64* dma_ops_get_pte(struct dma_ops_domain *dom, + unsigned long address) +{ + struct aperture_range *aperture; + u64 *pte, *pte_page; + + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return NULL; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) { + pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; + } else + pte += IOMMU_PTE_L0_INDEX(address); + + return pte; +} + +/* * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ @@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, paddr &= PAGE_MASK; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; - pte += IOMMU_PTE_L0_INDEX(address); + pte = dma_ops_get_pte(dom, address); + if (!pte) + return bad_dma_address; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) { + struct aperture_range *aperture; u64 *pte; if (address >= dom->aperture_size) return; - WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); + aperture = dom->aperture[APERTURE_RANGE_INDEX(address)]; + if (!aperture) + return; + + pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; + if (!pte) + return; - pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); WARN_ON(!*pte); @@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev, u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; - dma_addr_t address, start; + dma_addr_t address, start, ret; unsigned int pages; unsigned long align_mask = 0; int i; @@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev, if (align) align_mask = (1UL << get_order(size)) - 1; +retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) - goto out; + if (unlikely(address == bad_dma_address)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the + * first run. This is a small optimization. + */ + dma_dom->next_address = dma_dom->aperture_size; + + if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + goto out; + + /* + * aperture was sucessfully enlarged by 128 MB, try + * allocation again + */ + goto retry; + } start = address; for (i = 0; i < pages; ++i) { - dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + if (ret == bad_dma_address) + goto out_unmap; + paddr += PAGE_SIZE; start += PAGE_SIZE; } @@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev, out: return address; + +out_unmap: + + for (--i; i >= 0; --i) { + start -= PAGE_SIZE; + dma_ops_domain_unmap(iommu, dma_dom, start); + } + + dma_ops_free_addresses(dma_dom, address, pages); + + return bad_dma_address; } /* @@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) + if (*dma_addr == bad_dma_address) { + spin_unlock_irqrestore(&domain->lock, flags); goto out_free; + } iommu_completion_wait(iommu); @@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void) iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - dma_dom = dma_ops_domain_alloc(iommu, order); + dma_dom = dma_ops_domain_alloc(iommu); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = { int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; - int order = amd_iommu_aperture_order; int ret; /* @@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void) * found in the system. Devices not assigned to any other * protection domain will be assigned to the default one. */ - list_for_each_entry(iommu, &amd_iommu_list, list) { - iommu->default_dom = dma_ops_domain_alloc(iommu, order); + for_each_iommu(iommu) { + iommu->default_dom = dma_ops_domain_alloc(iommu); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; @@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void) free_domains: - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { if (iommu->default_dom) dma_ops_domain_free(iommu->default_dom); } @@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, old_domain = domain_for_device(devid); if (old_domain) - return -EBUSY; + detach_device(old_domain, devid); attach_device(iommu, domain, devid); diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c0be0902dac..238989ec077d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -115,15 +115,21 @@ struct ivmd_header { u64 range_length; } __attribute__((packed)); +bool amd_iommu_dump; + static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +#ifdef CONFIG_IOMMU_STRESS +bool amd_iommu_isolate = false; +#else bool amd_iommu_isolate = true; /* if true, device isolation is enabled */ +#endif + bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid) static inline unsigned long tbl_size(int entry_size) { unsigned shift = PAGE_SHIFT + - get_order(amd_iommu_last_bdf * entry_size); + get_order(((int)amd_iommu_last_bdf + 1) * entry_size); return 1UL << shift; } @@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size) * This function set the exclusion range in the IOMMU. DMA accesses to the * exclusion range are passed through untranslated */ -static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) +static void iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; @@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) } /* Generic functions to enable/disable certain features of the IOMMU. */ -static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -static void __init iommu_enable(struct amd_iommu *iommu) +static void iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); @@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } -/* Function to enable IOMMU event logging and event interrupts */ -static void __init iommu_enable_event_logging(struct amd_iommu *iommu) +static void iommu_disable(struct amd_iommu *iommu) { - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + iommu_feature_disable(iommu, CONTROL_IOMMU_EN); } /* @@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - entry = (u64)virt_to_phys(cmd_buf); + return cmd_buf; +} + +/* + * This function writes the command buffer address to the hardware and + * enables it. + */ +static void iommu_enable_command_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->cmd_buf == NULL); + + entry = (u64)virt_to_phys(iommu->cmd_buf); entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + &entry, sizeof(entry)); /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); - - return cmd_buf; } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu) /* allocates the memory where the IOMMU will log its events to */ static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) { - u64 entry; iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(EVT_BUFFER_SIZE)); if (iommu->evt_buf == NULL) return NULL; + return iommu->evt_buf; +} + +static void iommu_enable_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + + BUG_ON(iommu->evt_buf == NULL); + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, &entry, sizeof(entry)); - iommu->evt_buf_size = EVT_BUFFER_SIZE; - - return iommu->evt_buf; + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); } static void __init free_event_buffer(struct amd_iommu *iommu) @@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, p += sizeof(struct ivhd_header); end += h->length; + while (p < end) { e = (struct ivhd_entry *)p; switch (e->type) { case IVHD_DEV_ALL: + + DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x" + " last device %02x:%02x.%x flags: %02x\n", + PCI_BUS(iommu->first_device), + PCI_SLOT(iommu->first_device), + PCI_FUNC(iommu->first_device), + PCI_BUS(iommu->last_device), + PCI_SLOT(iommu->last_device), + PCI_FUNC(iommu->last_device), + e->flags); + for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) set_dev_entry_from_acpi(iommu, dev_i, e->flags, 0); break; case IVHD_DEV_SELECT: + + DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x " + "flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: + + DUMP_printk(" DEV_SELECT_RANGE_START\t " + "devid: %02x:%02x.%x flags: %02x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags); + devid_start = e->devid; flags = e->flags; ext_flags = 0; alias = false; break; case IVHD_DEV_ALIAS: + + DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x " + "flags: %02x devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: + + DUMP_printk(" DEV_ALIAS_RANGE\t\t " + "devid: %02x:%02x.%x flags: %02x " + "devid_to: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, + PCI_BUS(e->ext >> 8), + PCI_SLOT(e->ext >> 8), + PCI_FUNC(e->ext >> 8)); + devid_start = e->devid; flags = e->flags; devid_to = e->ext >> 8; @@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, alias = true; break; case IVHD_DEV_EXT_SELECT: + + DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x " + "flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid = e->devid; set_dev_entry_from_acpi(iommu, devid, e->flags, e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: + + DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: " + "%02x:%02x.%x flags: %02x ext: %08x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid), + e->flags, e->ext); + devid_start = e->devid; flags = e->flags; ext_flags = e->ext; alias = false; break; case IVHD_DEV_RANGE_END: + + DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n", + PCI_BUS(e->devid), + PCI_SLOT(e->devid), + PCI_FUNC(e->devid)); + devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) @@ -679,7 +774,7 @@ static void __init free_iommu_all(void) { struct amd_iommu *iommu, *next; - list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + for_each_iommu_safe(iommu, next) { list_del(&iommu->list); free_iommu_one(iommu); kfree(iommu); @@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->mmio_base) return -ENOMEM; - iommu_set_device_table(iommu); iommu->cmd_buf = alloc_command_buffer(iommu); if (!iommu->cmd_buf) return -ENOMEM; @@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table) h = (struct ivhd_header *)p; switch (*p) { case ACPI_IVHD_TYPE: + + DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + "seg: %d flags: %01x info %04x\n", + PCI_BUS(h->devid), PCI_SLOT(h->devid), + PCI_FUNC(h->devid), h->cap_ptr, + h->pci_seg, h->flags, h->info); + DUMP_printk(" mmio-addr: %016llx\n", + h->mmio_phys); + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); if (iommu == NULL) return -ENOMEM; @@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msix(struct amd_iommu *iommu) -{ - struct amd_iommu *curr; - struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ - int nvec = 0, i; - - list_for_each_entry(curr, &amd_iommu_list, list) { - if (curr->dev == iommu->dev) { - entries[nvec].entry = curr->evt_msi_num; - entries[nvec].vector = 0; - curr->int_enabled = true; - nvec++; - } - } - - if (pci_enable_msix(iommu->dev, entries, nvec)) { - pci_disable_msix(iommu->dev); - return 1; - } - - for (i = 0; i < nvec; ++i) { - int r = request_irq(entries->vector, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD IOMMU", - NULL); - if (r) - goto out_free; - } - - return 0; - -out_free: - for (i -= 1; i >= 0; --i) - free_irq(entries->vector, NULL); - - pci_disable_msix(iommu->dev); - - return 1; -} - static int __init iommu_setup_msi(struct amd_iommu *iommu) { int r; - struct amd_iommu *curr; - - list_for_each_entry(curr, &amd_iommu_list, list) { - if (curr->dev == iommu->dev) - curr->int_enabled = true; - } - if (pci_enable_msi(iommu->dev)) return 1; @@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) return 1; } + iommu->int_enabled = true; + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + return 0; } -static int __init iommu_init_msi(struct amd_iommu *iommu) +static int iommu_init_msi(struct amd_iommu *iommu) { if (iommu->int_enabled) return 0; - if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) - return iommu_setup_msix(iommu); - else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) return iommu_setup_msi(iommu); return 1; @@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; + char *s; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) @@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m) switch (m->type) { default: + kfree(e); + return 0; case ACPI_IVMD_TYPE: + s = "IVMD_TYPEi\t\t\t"; e->devid_start = e->devid_end = m->devid; break; case ACPI_IVMD_TYPE_ALL: + s = "IVMD_TYPE_ALL\t\t"; e->devid_start = 0; e->devid_end = amd_iommu_last_bdf; break; case ACPI_IVMD_TYPE_RANGE: + s = "IVMD_TYPE_RANGE\t\t"; e->devid_start = m->devid; e->devid_end = m->aux; break; @@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m) e->address_end = e->address_start + PAGE_ALIGN(m->range_length); e->prot = m->flags >> 1; + DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x" + " range_start: %016llx range_end: %016llx flags: %x\n", s, + PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start), + PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end), + PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end), + e->address_start, e->address_end, m->flags); + list_add_tail(&e->list, &amd_iommu_unity_map); return 0; @@ -967,18 +1037,28 @@ static void init_device_table(void) * This function finally enables all IOMMUs found in the system after * they have been initialized */ -static void __init enable_iommus(void) +static void enable_iommus(void) { struct amd_iommu *iommu; - list_for_each_entry(iommu, &amd_iommu_list, list) { + for_each_iommu(iommu) { + iommu_set_device_table(iommu); + iommu_enable_command_buffer(iommu); + iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); - iommu_enable_event_logging(iommu); iommu_enable(iommu); } } +static void disable_iommus(void) +{ + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_disable(iommu); +} + /* * Suspend/Resume support * disable suspend until real resume implemented @@ -986,12 +1066,31 @@ static void __init enable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + /* + * Disable IOMMUs before reprogramming the hardware registers. + * IOMMU is still enabled from the resume kernel. + */ + disable_iommus(); + + /* re-load the hardware */ + enable_iommus(); + + /* + * we have to flush after the IOMMUs are enabled because a + * disabled IOMMU will never execute the commands we send + */ + amd_iommu_flush_all_domains(); + amd_iommu_flush_all_devices(); + return 0; } static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) { - return -EINVAL; + /* disable IOMMUs to go out of the way for BIOS */ + disable_iommus(); + + return 0; } static struct sysdev_class amd_iommu_sysdev_class = { @@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void) enable_iommus(); - printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", - (1 << (amd_iommu_aperture_order-20))); - printk(KERN_INFO "AMD IOMMU: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); @@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void) * ****************************************************************************/ +static int __init parse_amd_iommu_dump(char *str) +{ + amd_iommu_dump = true; + + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str) return 1; } -static int __init parse_amd_iommu_size_options(char *str) -{ - unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); - - if ((order > 24) && (order < 31)) - amd_iommu_aperture_order = order; - - return 1; -} - +__setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); -__setup("amd_iommu_size=", parse_amd_iommu_size_options); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f2870920f246..a4c9cf0bf70b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -98,6 +98,29 @@ early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline void imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline void imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} #endif #ifdef CONFIG_X86_64 @@ -111,13 +134,19 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +int x2apic_mode; #ifdef CONFIG_X86_X2APIC -int x2apic; /* x2apic enabled before OS handover */ static int x2apic_preenabled; static int disable_x2apic; static __init int setup_nox2apic(char *str) { + if (x2apic_enabled()) { + pr_warning("Bios already enabled x2apic, " + "can't enforce nox2apic"); + return 0; + } + disable_x2apic = 1; setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; @@ -209,6 +238,31 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * bare function to substitute write operation + * and it's _that_ fast :) + */ +static void native_apic_write_dummy(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + +static u32 native_apic_read_dummy(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +/* + * right after this call apic->write/read doesn't do anything + * note that there is no restore operation it works one way + */ +void apic_disable(void) +{ + apic->read = native_apic_read_dummy; + apic->write = native_apic_write_dummy; +} + void native_apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -348,7 +402,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) { - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); @@ -815,7 +869,7 @@ void clear_local_APIC(void) u32 v; /* APIC hasn't been mapped yet */ - if (!x2apic && !apic_phys) + if (!x2apic_mode && !apic_phys) return; maxlvt = lapic_get_maxlvt(); @@ -1287,7 +1341,7 @@ void check_x2apic(void) { if (x2apic_enabled()) { pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); - x2apic_preenabled = x2apic = 1; + x2apic_preenabled = x2apic_mode = 1; } } @@ -1295,7 +1349,7 @@ void enable_x2apic(void) { int msr, msr2; - if (!x2apic) + if (!x2apic_mode) return; rdmsr(MSR_IA32_APICBASE, msr, msr2); @@ -1304,6 +1358,7 @@ void enable_x2apic(void) wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } +#endif /* CONFIG_X86_X2APIC */ void __init enable_IR_x2apic(void) { @@ -1312,32 +1367,21 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - if (!cpu_has_x2apic) - return; - - if (!x2apic_preenabled && disable_x2apic) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); - return; + ret = dmar_table_init(); + if (ret) { + pr_debug("dmar_table_init() failed with %d:\n", ret); + goto ir_failed; } - if (x2apic_preenabled && disable_x2apic) - panic("Bios already enabled x2apic, can't enforce nox2apic"); - - if (!x2apic_preenabled && skip_ioapic_setup) { - pr_info("Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); - return; + if (!intr_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + goto ir_failed; } - ret = dmar_table_init(); - if (ret) { - pr_info("dmar_table_init() failed with %d:\n", ret); - if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); - else - pr_info("Not enabling x2apic,Intr-remapping\n"); + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); return; } @@ -1357,19 +1401,16 @@ void __init enable_IR_x2apic(void) mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - ret = enable_intr_remapping(EIM_32BIT_APIC_ID); - - if (ret && x2apic_preenabled) { - local_irq_restore(flags); - panic("x2apic enabled by bios. But IR enabling failed"); - } - + ret = enable_intr_remapping(x2apic_supported()); if (ret) goto end_restore; - if (!x2apic) { - x2apic = 1; + pr_info("Enabled Interrupt-remapping\n"); + + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; enable_x2apic(); + pr_info("Enabled x2apic\n"); } end_restore: @@ -1378,37 +1419,34 @@ end_restore: * IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - else - reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries); unmask_8259A(); local_irq_restore(flags); end: - if (!ret) { - if (!x2apic_preenabled) - pr_info("Enabled x2apic and interrupt-remapping\n"); - else - pr_info("Enabled Interrupt-remapping\n"); - } else - pr_err("Failed to enable Interrupt-remapping and x2apic\n"); if (ioapic_entries) free_ioapic_entries(ioapic_entries); + + if (!ret) + return; + +ir_failed: + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else if (cpu_has_x2apic) + pr_info("Not enabling x2apic,Intr-remapping\n"); #else if (!cpu_has_x2apic) return; if (x2apic_preenabled) panic("x2apic enabled prior OS handover," - " enable CONFIG_INTR_REMAP"); - - pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); #endif return; } -#endif /* CONFIG_X86_X2APIC */ + #ifdef CONFIG_X86_64 /* @@ -1425,7 +1463,6 @@ static int __init detect_init_APIC(void) } mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_physical_apicid = 0; return 0; } #else @@ -1539,32 +1576,49 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { - if (x2apic) { + unsigned int new_apicid; + + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; } - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ + /* If no local APIC can be found return early */ if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else + /* lets NOP'ify apic operations */ + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } else { apic_phys = mp_lapic_addr; - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic) + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", + APIC_BASE, apic_phys); + } /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ + apic_version[new_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } } /* @@ -1733,8 +1787,7 @@ void __init connect_bsp_APIC(void) */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); + imcr_pic_to_apic(); } #endif if (apic->enable_apic_mode) @@ -1762,8 +1815,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); + imcr_apic_to_pic(); return; } #endif @@ -1969,10 +2021,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) local_irq_save(flags); disable_local_APIC(); -#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) disable_intr_remapping(); -#endif + local_irq_restore(flags); return 0; } @@ -1982,42 +2034,34 @@ static int lapic_resume(struct sys_device *dev) unsigned int l, h; unsigned long flags; int maxlvt; - -#ifdef CONFIG_INTR_REMAP - int ret; + int ret = 0; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) return 0; local_irq_save(flags); - if (x2apic) { + if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - return -ENOMEM; + ret = -ENOMEM; + goto restore; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { WARN(1, "Saving IO-APIC state failed: %d\n", ret); free_ioapic_entries(ioapic_entries); - return ret; + goto restore; } mask_IO_APIC_setup(ioapic_entries); mask_8259A(); - enable_x2apic(); } -#else - if (!apic_pm_state.active) - return 0; - local_irq_save(flags); - if (x2apic) + if (x2apic_mode) enable_x2apic(); -#endif - else { /* * Make sure the APICBASE points to the right address @@ -2055,21 +2099,16 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - reenable_intr_remapping(EIM_32BIT_APIC_ID); - - if (x2apic) { + if (intr_remapping_enabled) { + reenable_intr_remapping(x2apic_mode); unmask_8259A(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } -#endif - +restore: local_irq_restore(flags); - - return 0; + return ret; } /* @@ -2117,31 +2156,14 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ #ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) + +static int __cpuinit apic_cluster_num(void) { int i, clusters, zeros; unsigned id; u16 *bios_cpu_apicid; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); @@ -2177,18 +2199,67 @@ __cpuinit int apic_is_clustered_box(void) ++zeros; } - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) + return clusters; +} + +static int __cpuinitdata multi_checked; +static int __cpuinitdata multi; + +static int __cpuinit set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void __cpuinit dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +__cpuinit int apic_is_clustered_box(void) +{ + dmi_check_multi(); + if (multi) return 1; + if (!is_vsmp_box()) + return 0; + /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. + * ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards */ - return (clusters > 2); + if (apic_cluster_num() > 1) + return 1; + + return 0; } #endif diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 306e5e88fb6f..d0c99abc26c3 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -161,7 +161,7 @@ static int flat_apic_id_registered(void) static int flat_phys_pkg_id(int initial_apic_id, int index_msb) { - return hard_smp_processor_id() >> index_msb; + return initial_apic_id >> index_msb; } struct apic apic_flat = { @@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * regardless of how many processors are present (x86_64 ES7000 * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 302947775575..69328ac8de9c 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30da617d18e4..1946fac42ab3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include <asm/setup.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> +#include <asm/hw_irq.h> #include <asm/uv/uv_hub.h> #include <asm/uv/uv_irq.h> @@ -129,12 +130,9 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +static struct irq_pin_list *get_one_free_irq_2_pin(int node) { struct irq_pin_list *pin; - int node; - - node = cpu_to_node(cpu); pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); @@ -148,9 +146,6 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - u8 move_desc_pending : 1; -#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ @@ -212,12 +207,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) return cfg; } -static struct irq_cfg *get_one_free_irq_cfg(int cpu) +static struct irq_cfg *get_one_free_irq_cfg(int node) { struct irq_cfg *cfg; - int node; - - node = cpu_to_node(cpu); cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { @@ -238,13 +230,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -int arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int node) { struct irq_cfg *cfg; cfg = desc->chip_data; if (!cfg) { - desc->chip_data = get_one_free_irq_cfg(cpu); + desc->chip_data = get_one_free_irq_cfg(node); if (!desc->chip_data) { printk(KERN_ERR "can not alloc irq_cfg\n"); BUG_ON(1); @@ -254,10 +246,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu) return 0; } -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - +/* for move_irq_desc */ static void -init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node) { struct irq_pin_list *old_entry, *head, *tail, *entry; @@ -266,7 +257,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) if (!old_entry) return; - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) return; @@ -276,7 +267,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) tail = entry; old_entry = old_entry->next; while (old_entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { entry = head; while (entry) { @@ -316,12 +307,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) } void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { struct irq_cfg *cfg; struct irq_cfg *old_cfg; - cfg = get_one_free_irq_cfg(cpu); + cfg = get_one_free_irq_cfg(node); if (!cfg) return; @@ -332,7 +323,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); - init_copy_irq_2_pin(old_cfg, cfg, cpu); + init_copy_irq_2_pin(old_cfg, cfg, node); } static void free_irq_cfg(struct irq_cfg *old_cfg) @@ -356,19 +347,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->chip_data = NULL; } } - -static void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg = desc->chip_data; - - if (!cfg->move_in_progress) { - /* it means that domain is not changed */ - if (!cpumask_intersects(desc->affinity, mask)) - cfg->move_desc_pending = 1; - } -} -#endif +/* end for move_irq_desc */ #else static struct irq_cfg *irq_cfg(unsigned int irq) @@ -378,13 +357,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif -#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC -static inline void -set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) -{ -} -#endif - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -518,132 +490,18 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } -#ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - -/* - * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and - * leaves desc->affinity untouched. - */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned int irq; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; - - irq = desc->irq; - cfg = desc->chip_data; - if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; - - /* check that before desc->addinity get updated */ - set_extra_move_desc(desc, mask); - - cpumask_copy(desc->affinity, mask); - - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); -} - -static void -set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, cfg); - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void -set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - set_ioapic_affinity_irq_desc(desc, mask); -} -#endif /* CONFIG_SMP */ - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list *entry; entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(cpu); + entry = get_one_free_irq_2_pin(node); if (!entry) { printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", apic, pin); @@ -663,7 +521,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(cpu); + entry->next = get_one_free_irq_2_pin(node); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -672,7 +530,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { @@ -692,7 +550,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); + add_pin_to_irq_node(cfg, node, newapic, newpin); } static inline void io_apic_modify_irq(struct irq_cfg *cfg, @@ -850,7 +708,6 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_INTR_REMAP struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; @@ -948,20 +805,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) return 0; } -void reinit_intr_remapped_IO_APIC(int intr_remapping, - struct IO_APIC_route_entry **ioapic_entries) - -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(ioapic_entries); -} - void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; @@ -971,7 +814,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) kfree(ioapic_entries); } -#endif /* * Find the IRQ entry number of a certain pin. @@ -1032,54 +874,6 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || - mp_irqs[i].dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1298,6 +1092,64 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + struct io_apic_irq_attr *irq_attr) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) { + set_io_apic_irq_attr(irq_attr, apic, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + void lock_vector_lock(void) { /* Used to the online set of cpus does not change @@ -1628,58 +1480,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq ioapic_write_entry(apic_id, pin, entry); } +static struct { + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} mp_ioapic_routing[MAX_IO_APICS]; + static void __init setup_IO_APIC_irqs(void) { - int apic_id, pin, idx, irq; + int apic_id = 0, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { - - idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); - continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + apic_id = mp_find_ioapic(0); + if (apic_id < 0) + apic_id = 0; + } +#endif - irq = pin_2_irq(idx, apic_id, pin); + for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) { + if (!notcon) { + notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic_id].apicid, pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic_id].apicid, pin); + continue; + } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } - /* - * Skip the timer IRQ if there's a quirk handler - * installed and if it returns 1: - */ - if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) - continue; + irq = pin_2_irq(idx, apic_id, pin); - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc for %d\n", irq); - continue; - } - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, apic_id, pin); + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(apic_id, irq)) + continue; - setup_IO_APIC_irq(apic_id, pin, irq, desc, - irq_trigger(idx), irq_polarity(idx)); + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; } + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + /* + * don't mark it in pin_programmed, so later acpi could + * set it correctly when irq < 16 + */ + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); } if (notcon) @@ -1869,7 +1733,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { - unsigned int v, ver, maxlvt; + unsigned int i, v, ver, maxlvt; u64 icr; if (apic_verbosity == APIC_QUIET) @@ -1957,6 +1821,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); v = apic_read(APIC_TDCR); printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } printk("\n"); } @@ -2005,6 +1881,11 @@ __apicdebuginit(void) print_PIC(void) __apicdebuginit(int) print_all_ICs(void) { print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic || disable_apic) + return 0; + print_all_local_APICs(); print_IO_APIC(); @@ -2360,6 +2241,118 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); + +/* + * Either sets desc->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_copy(desc->affinity, mask); + + return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); +} + +static int +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + unsigned int irq; + int ret = -1; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + ret = 0; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return ret; +} + +static int +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + return set_ioapic_affinity_irq_desc(desc, mask); +} #ifdef CONFIG_INTR_REMAP @@ -2374,26 +2367,25 @@ static int ioapic_retrigger_irq(unsigned int irq) * Real vector that is used for interrupting cpu will be coming from * the interrupt-remapping table entry. */ -static void +static int migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; struct irte irte; unsigned int dest; unsigned int irq; + int ret = -1; if (!cpumask_intersects(mask, cpu_online_mask)) - return; + return ret; irq = desc->irq; if (get_irte(irq, &irte)) - return; + return ret; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return; - - set_extra_move_desc(desc, mask); + return ret; dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); @@ -2409,27 +2401,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) send_cleanup_vector(cfg); cpumask_copy(desc->affinity, mask); + + return 0; } /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { - migrate_ioapic_irq_desc(desc, mask); + return migrate_ioapic_irq_desc(desc, mask); } -static void set_ir_ioapic_affinity_irq(unsigned int irq, +static int set_ir_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); - set_ir_ioapic_affinity_irq_desc(desc, mask); + return set_ir_ioapic_affinity_irq_desc(desc, mask); } #else -static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { + return 0; } #endif @@ -2491,86 +2486,19 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - if (likely(!cfg->move_desc_pending)) - return; - - /* domain has not changed, but affinity did */ - me = smp_processor_id(); - if (cpumask_test_cpu(me, desc->affinity)) { - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; - cfg->move_desc_pending = 0; - } -#endif + if (likely(!cfg->move_in_progress)) return; - } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) { -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - *descp = desc = move_irq_desc(desc, me); - /* get the new one */ - cfg = desc->chip_data; -#endif + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); - } } #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_X86_X2APIC -static void ack_x2apic_level(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - ack_x2APIC_irq(); - eoi_ioapic_irq(desc); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - static void ack_apic_edge(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2634,9 +2562,6 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - if (irq_remapped(irq)) - eoi_ioapic_irq(desc); - /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2683,22 +2608,50 @@ static void ack_apic_level(unsigned int irq) } #ifdef CONFIG_INTR_REMAP +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + + entry = cfg->irq_2_pin; + for (;;) { + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_eoi(apic, pin); + entry = entry->next; + } +} + +static void +eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ir_ack_apic_edge(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_edge(irq); -#endif - return ack_apic_edge(irq); + ack_APIC_irq(); } static void ir_ack_apic_level(unsigned int irq) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_enabled()) - return ack_x2apic_level(irq); -#endif - return ack_apic_level(irq); + struct irq_desc *desc = irq_to_desc(irq); + + ack_APIC_irq(); + eoi_ioapic_irq(desc); } #endif /* CONFIG_INTR_REMAP */ @@ -2903,7 +2856,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int cpu = boot_cpu_id; + int node = cpu_to_node(boot_cpu_id); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2969,7 +2922,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); + add_pin_to_irq_node(cfg, node, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } else { /* for edge trigger, setup_IO_APIC_irq already @@ -3006,7 +2959,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); enable_8259A_irq(0); if (timer_irq_works()) { @@ -3218,14 +3171,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ -unsigned int create_irq_nr(unsigned int irq_want) +unsigned int create_irq_nr(unsigned int irq_want, int node) { /* Allocate an unused irq */ unsigned int irq; unsigned int new; unsigned long flags; struct irq_cfg *cfg_new = NULL; - int cpu = boot_cpu_id; struct irq_desc *desc_new = NULL; irq = 0; @@ -3234,7 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want) spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { - desc_new = irq_to_desc_alloc_cpu(new, cpu); + desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; @@ -3243,6 +3195,9 @@ unsigned int create_irq_nr(unsigned int irq_want) if (cfg_new->vector != 0) continue; + + desc_new = move_irq_desc(desc_new, node); + if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; break; @@ -3260,11 +3215,12 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { + int node = cpu_to_node(boot_cpu_id); unsigned int irq_want; int irq; irq_want = nr_irqs_gsi; - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) irq = -1; @@ -3366,7 +3322,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3375,7 +3331,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3387,13 +3343,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg_desc(desc, &msg); + + return 0; } #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void +static int ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); @@ -3402,11 +3360,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irte irte; if (get_irte(irq, &irte)) - return; + return -1; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3423,6 +3381,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) */ if (cfg->move_in_progress) send_cleanup_vector(cfg); + + return 0; } #endif @@ -3518,15 +3478,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) unsigned int irq_want; struct intel_iommu *iommu = NULL; int index = 0; + int node; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; + node = dev_to_node(&dev->dev); irq_want = nr_irqs_gsi; sub_handle = 0; list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want); + irq = create_irq_nr(irq_want, node); if (irq == 0) return -1; irq_want = irq + 1; @@ -3576,7 +3538,7 @@ void arch_teardown_msi_irq(unsigned int irq) #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3585,7 +3547,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3597,6 +3559,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3630,7 +3594,7 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) +static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3639,7 +3603,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; @@ -3651,6 +3615,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); + + return 0; } #endif /* CONFIG_SMP */ @@ -3707,7 +3673,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) +static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; @@ -3715,11 +3681,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID) - return; + return -1; cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); + + return 0; } #endif @@ -3794,6 +3762,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, eligible_cpu); @@ -3807,15 +3777,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3833,10 +3801,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) struct uv_IO_APIC_route_entry *entry; int mmr_pnode; + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - entry->mask = 1; mmr_pnode = uv_blade_to_pnode(mmr_blade); @@ -3900,6 +3868,71 @@ int __init arch_probe_nr_irqs(void) } #endif +static int __io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + struct irq_desc *desc; + struct irq_cfg *cfg; + int node; + int ioapic, pin; + int trigger, polarity; + + ioapic = irq_attr->ioapic; + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + if (dev) + node = dev_to_node(dev); + else + node = cpu_to_node(boot_cpu_id); + + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + + pin = irq_attr->ioapic_pin; + trigger = irq_attr->trigger; + polarity = irq_attr->polarity; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, ioapic, pin); + } + + setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); + + return 0; +} + +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + int ioapic, pin; + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + ioapic = irq_attr->ioapic; + pin = irq_attr->ioapic_pin; + if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[ioapic].apicid, pin); + return 0; + } + set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); + + return __io_apic_set_pci_routing(dev, irq, irq_attr); +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3980,6 +4013,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } +#endif int __init io_apic_get_version(int ioapic) { @@ -3992,39 +4026,6 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -#endif - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - struct irq_desc *desc; - struct irq_cfg *cfg; - int cpu = boot_cpu_id; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - desc = irq_to_desc_alloc_cpu(irq, cpu); - if (!desc) { - printk(KERN_INFO "can not get irq_desc %d\n", irq); - return 0; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= NR_IRQS_LEGACY) { - cfg = desc->chip_data; - add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); - } - - setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); - - return 0; -} - int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { @@ -4055,51 +4056,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic, irq, irq_entry; + int pin, ioapic = 0, irq, irq_entry; struct irq_desc *desc; - struct irq_cfg *cfg; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, desc, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); - continue; +#ifdef CONFIG_ACPI + if (!acpi_disabled && acpi_ioapic) { + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + ioapic = 0; + } +#endif - } + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); - /* - * Honour affinities which have been set in early boot - */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; - else - mask = apic->target_cpus(); + desc = irq_to_desc(irq); - if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq_desc(desc, mask); - else - set_ioapic_affinity_irq_desc(desc, mask); - } + /* + * Honour affinities which have been set in early boot + */ + if (desc->status & + (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) + mask = desc->affinity; + else + mask = apic->target_cpus(); + if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq_desc(desc, mask); + else + set_ioapic_affinity_irq_desc(desc, mask); } + } #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index ce4fbfa315a1..a691302dc3ff 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e4..440a8bccd91a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -160,7 +160,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1783652bb0e5..bc3e880f9b82 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic && (apic != &apic_x2apic_phys && + if (x2apic_mode && (apic != &apic_x2apic_phys && #ifdef CONFIG_X86_UV apic != &apic_x2apic_uv_x && #endif diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9cfe1f415d81..344eee4ac0a4 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); } - -/* In clustered mode, the high nibble of APIC ID is a cluster number. - * The low nibble is a 4-bit bitmap. */ -#define XAPIC_DEST_CPUS_SHIFT 4 -#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) -#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) - #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) static const struct cpumask *summit_target_cpus(void) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d17..8e4cbb255c38 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@ #include <asm/apic.h> #include <asm/ipi.h> -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2bda69352976..ef0ae207a7c8 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP unsigned long val; @@ -562,7 +562,7 @@ void __init uv_system_init(void) union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int max_pnode = 0; + int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; @@ -574,6 +574,13 @@ void __init uv_system_init(void) mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", + n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) @@ -583,15 +590,18 @@ void __init uv_system_init(void) bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); uv_blade_info = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info); get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_node_to_blade); memset(uv_node_to_blade, 255, bytes); bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_cpu_to_blade); memset(uv_cpu_to_blade, 255, bytes); blade = 0; @@ -607,11 +617,6 @@ void __init uv_system_init(void) } } - pnode_mask = (1 << n_val) - 1; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_upper = (((unsigned long)node_id.s.node_id) & - ~((1 << n_val) - 1)) << m_val; - uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size); @@ -634,6 +639,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162f..1a830cbd7015 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -146,4 +146,5 @@ void foo(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b5..898ecc47e129 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7e4a459daa64..e5b27d8f1b47 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -6,6 +6,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/pci-direct.h> #ifdef CONFIG_X86_64 # include <asm/numa_64.h> @@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = hard_smp_processor_id(); + unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) @@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* check CPU config space for extended APIC ID */ + if (c->x86 >= 0xf) { + unsigned int val; + val = read_pci_config(0, 24, 0, 0x68); + if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 77848d9fca68..b0517aa2bd3b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -299,7 +299,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -768,6 +769,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); #endif @@ -813,6 +820,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. + */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -825,10 +842,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 46e29ab96c6a..6b2a52dd0403 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -32,9 +32,7 @@ static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); -static DEFINE_PER_CPU(unsigned, cpu_modelflag); static DEFINE_PER_CPU(int, cpu_priv_count); -static DEFINE_PER_CPU(unsigned, cpu_model); static DEFINE_MUTEX(cpu_debug_lock); @@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = { { "value", CPU_REG_ALL, 1 }, }; -/* Intel Registers Range */ -static struct cpu_debug_range cpu_intel_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL }, - { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL }, - { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM }, - { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE }, - - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE }, - { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON }, - { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE }, - - { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE }, - { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT }, - { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT }, - { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM }, - - { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON }, - - { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT }, - { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT }, - { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 }, - { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX }, - { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 }, - { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE }, - { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE }, - { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE }, - { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT }, - { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE }, - { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE }, - { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE }, - { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE }, - - { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE }, - { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE }, - { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 }, - { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE }, - { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE }, - - { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE }, - { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON }, - { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT }, - { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON }, - { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT }, - { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON }, - { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE }, - { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON }, - - { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON }, - { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON }, - { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON }, - { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE }, - { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON }, - { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE }, - { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE }, - { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE }, - - { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON }, - - { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON }, - { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON }, +/* CPU Registers Range */ +static struct cpu_debug_range cpu_reg_range[] = { + { 0x00000000, 0x00000001, CPU_MC, }, + { 0x00000006, 0x00000007, CPU_MONITOR, }, + { 0x00000010, 0x00000010, CPU_TIME, }, + { 0x00000011, 0x00000013, CPU_PMC, }, + { 0x00000017, 0x00000017, CPU_PLATFORM, }, + { 0x0000001B, 0x0000001B, CPU_APIC, }, + { 0x0000002A, 0x0000002B, CPU_POWERON, }, + { 0x0000002C, 0x0000002C, CPU_FREQ, }, + { 0x0000003A, 0x0000003A, CPU_CONTROL, }, + { 0x00000040, 0x00000047, CPU_LBRANCH, }, + { 0x00000060, 0x00000067, CPU_LBRANCH, }, + { 0x00000079, 0x00000079, CPU_BIOS, }, + { 0x00000088, 0x0000008A, CPU_CACHE, }, + { 0x0000008B, 0x0000008B, CPU_BIOS, }, + { 0x0000009B, 0x0000009B, CPU_MONITOR, }, + { 0x000000C1, 0x000000C4, CPU_PMC, }, + { 0x000000CD, 0x000000CD, CPU_FREQ, }, + { 0x000000E7, 0x000000E8, CPU_PERF, }, + { 0x000000FE, 0x000000FE, CPU_MTRR, }, + + { 0x00000116, 0x0000011E, CPU_CACHE, }, + { 0x00000174, 0x00000176, CPU_SYSENTER, }, + { 0x00000179, 0x0000017B, CPU_MC, }, + { 0x00000186, 0x00000189, CPU_PMC, }, + { 0x00000198, 0x00000199, CPU_PERF, }, + { 0x0000019A, 0x0000019A, CPU_TIME, }, + { 0x0000019B, 0x0000019D, CPU_THERM, }, + { 0x000001A0, 0x000001A0, CPU_MISC, }, + { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, + { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, + { 0x000001D9, 0x000001D9, CPU_DEBUG, }, + { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, + + { 0x00000200, 0x0000020F, CPU_MTRR, }, + { 0x00000250, 0x00000250, CPU_MTRR, }, + { 0x00000258, 0x00000259, CPU_MTRR, }, + { 0x00000268, 0x0000026F, CPU_MTRR, }, + { 0x00000277, 0x00000277, CPU_PAT, }, + { 0x000002FF, 0x000002FF, CPU_MTRR, }, + + { 0x00000300, 0x00000311, CPU_PMC, }, + { 0x00000345, 0x00000345, CPU_PMC, }, + { 0x00000360, 0x00000371, CPU_PMC, }, + { 0x0000038D, 0x00000390, CPU_PMC, }, + { 0x000003A0, 0x000003BE, CPU_PMC, }, + { 0x000003C0, 0x000003CD, CPU_PMC, }, + { 0x000003E0, 0x000003E1, CPU_PMC, }, + { 0x000003F0, 0x000003F2, CPU_PMC, }, + + { 0x00000400, 0x00000417, CPU_MC, }, + { 0x00000480, 0x0000048B, CPU_VMX, }, + + { 0x00000600, 0x00000600, CPU_DEBUG, }, + { 0x00000680, 0x0000068F, CPU_LBRANCH, }, + { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, + + { 0x000107CC, 0x000107D3, CPU_PMC, }, + + { 0xC0000080, 0xC0000080, CPU_FEATURES, }, + { 0xC0000081, 0xC0000084, CPU_CALL, }, + { 0xC0000100, 0xC0000102, CPU_BASE, }, + { 0xC0000103, 0xC0000103, CPU_TIME, }, + + { 0xC0010000, 0xC0010007, CPU_PMC, }, + { 0xC0010010, 0xC0010010, CPU_CONF, }, + { 0xC0010015, 0xC0010015, CPU_CONF, }, + { 0xC0010016, 0xC001001A, CPU_MTRR, }, + { 0xC001001D, 0xC001001D, CPU_MTRR, }, + { 0xC001001F, 0xC001001F, CPU_CONF, }, + { 0xC0010030, 0xC0010035, CPU_BIOS, }, + { 0xC0010044, 0xC0010048, CPU_MC, }, + { 0xC0010050, 0xC0010056, CPU_SMM, }, + { 0xC0010058, 0xC0010058, CPU_CONF, }, + { 0xC0010060, 0xC0010060, CPU_CACHE, }, + { 0xC0010061, 0xC0010068, CPU_SMM, }, + { 0xC0010069, 0xC001006B, CPU_SMM, }, + { 0xC0010070, 0xC0010071, CPU_SMM, }, + { 0xC0010111, 0xC0010113, CPU_SMM, }, + { 0xC0010114, 0xC0010118, CPU_SVM, }, + { 0xC0010140, 0xC0010141, CPU_OSVM, }, + { 0xC0011022, 0xC0011023, CPU_CONF, }, }; -/* AMD Registers Range */ -static struct cpu_debug_range cpu_amd_range[] = { - { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, }, - { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, }, - { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, }, - { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS }, - { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS }, - { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, }, - { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, }, - { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, }, - { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, }, - - { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, }, - { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, }, - { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, }, - { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, }, - { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, }, - { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, }, - { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, }, - { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, }, - { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, }, - { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, }, - { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, }, - { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, }, - { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, }, - { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, }, -}; - - -/* Intel */ -static int get_intel_modelflag(unsigned model) -{ - int flag; - - switch (model) { - case 0x0501: - case 0x0502: - case 0x0504: - flag = CPU_INTEL_PENTIUM; - break; - case 0x0601: - case 0x0603: - case 0x0605: - case 0x0607: - case 0x0608: - case 0x060A: - case 0x060B: - flag = CPU_INTEL_P6; - break; - case 0x0609: - case 0x060D: - flag = CPU_INTEL_PENTIUM_M; - break; - case 0x060E: - flag = CPU_INTEL_CORE; - break; - case 0x060F: - case 0x0617: - flag = CPU_INTEL_CORE2; - break; - case 0x061C: - flag = CPU_INTEL_ATOM; - break; - case 0x0F00: - case 0x0F01: - case 0x0F02: - case 0x0F03: - case 0x0F04: - flag = CPU_INTEL_XEON_P4; - break; - case 0x0F06: - flag = CPU_INTEL_XEON_MP; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -/* AMD */ -static int get_amd_modelflag(unsigned model) -{ - int flag; - - switch (model >> 8) { - case 0x6: - flag = CPU_AMD_K6; - break; - case 0x7: - flag = CPU_AMD_K7; - break; - case 0x8: - flag = CPU_AMD_K8; - break; - case 0xf: - flag = CPU_AMD_0F; - break; - case 0x10: - flag = CPU_AMD_10; - break; - case 0x11: - flag = CPU_AMD_11; - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_modelflag(unsigned cpu) -{ - int flag; - - flag = per_cpu(cpu_model, cpu); - - switch (flag >> 16) { - case X86_VENDOR_INTEL: - flag = get_intel_modelflag(flag); - break; - case X86_VENDOR_AMD: - flag = get_amd_modelflag(flag & 0xffff); - break; - default: - flag = CPU_NONE; - break; - } - - return flag; -} - -static int get_cpu_range_count(unsigned cpu) -{ - int index; - - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - index = ARRAY_SIZE(cpu_intel_range); - break; - case X86_VENDOR_AMD: - index = ARRAY_SIZE(cpu_amd_range); - break; - default: - index = 0; - break; - } - - return index; -} - static int is_typeflag_valid(unsigned cpu, unsigned flag) { - unsigned vendor, modelflag; - int i, index; + int i; /* Standard Registers should be always valid */ if (flag >= CPU_TSS) return 1; - modelflag = per_cpu(cpu_modelflag, cpu); - vendor = per_cpu(cpu_model, cpu) >> 16; - index = get_cpu_range_count(cpu); - - for (i = 0; i < index; i++) { - switch (vendor) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[i].model & modelflag) && - (cpu_intel_range[i].flag & flag)) - return 1; - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[i].model & modelflag) && - (cpu_amd_range[i].flag & flag)) - return 1; - break; - } + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { + if (cpu_reg_range[i].flag == flag) + return 1; } /* Invalid */ @@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag) static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, int index, unsigned flag) { - unsigned modelflag; - - modelflag = per_cpu(cpu_modelflag, cpu); - *max = 0; - switch (per_cpu(cpu_model, cpu) >> 16) { - case X86_VENDOR_INTEL: - if ((cpu_intel_range[index].model & modelflag) && - (cpu_intel_range[index].flag & flag)) { - *min = cpu_intel_range[index].min; - *max = cpu_intel_range[index].max; - } - break; - case X86_VENDOR_AMD: - if ((cpu_amd_range[index].model & modelflag) && - (cpu_amd_range[index].flag & flag)) { - *min = cpu_amd_range[index].min; - *max = cpu_amd_range[index].max; - } - break; - } + if (cpu_reg_range[index].flag == flag) { + *min = cpu_reg_range[index].min; + *max = cpu_reg_range[index].max; + } else + *max = 0; return *max; } @@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) unsigned msr, msr_min, msr_max; struct cpu_private *priv; u32 low, high; - int i, range; + int i; if (seq) { priv = seq->private; @@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) } } - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) continue; @@ -588,8 +369,20 @@ static void print_apic(void *arg) seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); -#endif /* CONFIG_X86_LOCAL_APIC */ + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + unsigned int i, v, maxeilvt; + + v = apic_read(APIC_EFEAT); + maxeilvt = (v >> 16) & 0xff; + seq_printf(seq, " EFEAT\t\t: %08x\n", v); + seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); + for (i = 0; i < maxeilvt; i++) { + v = apic_read(APIC_EILVTn(i)); + seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); + } + } +#endif /* CONFIG_X86_LOCAL_APIC */ seq_printf(seq, "\n MSR\t:\n"); } @@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) { struct dentry *cpu_dentry = NULL; unsigned reg, reg_min, reg_max; - int i, range, err = 0; + int i, err = 0; char reg_dir[12]; u32 low, high; - range = get_cpu_range_count(cpu); - - for (i = 0; i < range; i++) { + for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { if (!get_cpu_range(cpu, ®_min, ®_max, i, cpu_base[type].flag)) continue; @@ -850,10 +641,6 @@ static int cpu_init_cpu(void) cpui = &cpu_data(cpu); if (!cpu_has(cpui, X86_FEATURE_MSR)) continue; - per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) | - (cpui->x86 << 8) | - (cpui->x86_model)); - per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu); sprintf(cpu_dir, "cpu%d", cpu); cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 52c839875478..f138c6c389b9 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -220,11 +220,14 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver" + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" select CPU_FREQ_TABLE - depends on X86_32 + depends on X86_32 && EXPERIMENTAL help - This adds the CPUFreq driver for VIA C7 processors. + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. If in doubt, say N. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 54b6de2cd947..ae9b503220ca 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid) { struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return 0; - - return 1; + return cpu_has(cpu, X86_FEATURE_EST); } static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) @@ -550,7 +546,7 @@ static int __init acpi_cpufreq_early_init(void) return -ENOMEM; } for_each_possible_cpu(i) { - if (!alloc_cpumask_var_node( + if (!zalloc_cpumask_var_node( &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, GFP_KERNEL, cpu_to_node(i))) { diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index a8363e5be4ef..d47c775eb0ab 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -322,7 +322,7 @@ static int powernow_acpi_init(void) goto err0; } - if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, + if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, GFP_KERNEL)) { retval = -ENOMEM; goto err05; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 35dc8fbe92bd..cf52215d9eb1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -887,7 +887,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); - if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { printk(KERN_ERR PFX "unable to alloc powernow_k8_data cpumask\n"); ret_val = -ENOMEM; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index c9f1fdc02830..55c831ed71ce 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -471,7 +471,7 @@ static int centrino_target (struct cpufreq_policy *policy, if (unlikely(!alloc_cpumask_var(&saved_mask, GFP_KERNEL))) return -ENOMEM; - if (unlikely(!alloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { + if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) { free_cpumask_var(saved_mask); return -ENOMEM; } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7437fa133c02..daed39ba2614 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif -static void __cpuinit srat_detect_node(void) +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); + int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ @@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } /* Work around errata */ - srat_detect_node(); + srat_detect_node(c); if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e102..789efe217e1a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@ #include <asm/processor.h> #include <asm/smp.h> +#include <asm/k8.h> #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -#endif - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -207,10 +200,17 @@ union l3_cache { }; static const unsigned short __cpuinitconst assocs[] = { - [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, [0xa] = 32, [0xb] = 48, + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, [0xc] = 64, - [0xf] = 0xffff // ?? + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ }; static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; if (leaf == 3) - eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + eax->split.num_threads_sharing = + current_cpu_data.x86_max_cores - 1; else eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; @@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { if (index < 3) return; + + if (boot_cpu_data.x86 == 0x11) + return; + + /* see erratum #382 */ + if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + return; + this_leaf->can_disable = 1; } @@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ - struct pci_dev *dev = NULL; - int i; - - for (i = 0; i <= node; i++) { - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_id[0], dev)); - if (!dev) - break; - } - return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ - return NULL; -} -#endif - -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - ssize_t ret = 0; - int i; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; if (!this_leaf->can_disable) - return sprintf(buf, "Feature not enabled\n"); - - dev = get_k8_northbridge(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; - } - for (i = 0; i < 2; i++) { - unsigned int reg; + if (!dev) + return -EINVAL; - pci_read_config_dword(dev, 0x1BC + i * 4, ®); + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "%x\n", reg); +} - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", - buf, (reg & 0x30000) >> 16, reg & 0xfff); - } - return ret; +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ } +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, - size_t count) +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - unsigned int ret, index, val; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + unsigned int scrubber = 0; if (!this_leaf->can_disable) - return 0; - - if (strlen(buf) > 15) return -EINVAL; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) return -EINVAL; - if (index > 1) + + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; val |= 0xc0000000; - dev = get_k8_northbridge(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); - return -EINVAL; - } + + pci_read_config_dword(dev, 0x58, &scrubber); + scrubber &= ~0x1f000000; + pci_write_config_dword(dev, 0x58, scrubber); pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); + return count; +} - return 1; +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ } +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) struct _cache_attr { struct attribute attr; @@ -808,7 +795,10 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); static struct attribute * default_attrs[] = { &type.attr, @@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable.attr, + &cache_disable_0.attr, + &cache_disable_1.attr, NULL }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 6fb0b359d2a5..09dd1d414fc3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -1163,7 +1163,7 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); err = mce_init_banks(); if (err) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index cef3ee30744b..65a0fceedcd7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -15,7 +15,6 @@ #include <asm/hw_irq.h> #include <asm/idle.h> #include <asm/therm_throt.h> -#include <asm/apic.h> asmlinkage void smp_thermal_interrupt(void) { diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04f..1d584a18a50d 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d21d4fb161f7..0543f69f0b27 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,9 +20,9 @@ struct fixed_range_block { }; static struct fixed_range_block fixed_range_blocks[] = { - { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ - { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ - { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs) k8_check_syscfg_dram_mod_en(); - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) @@ -310,7 +310,7 @@ void __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; - rdmsr(MTRRcap_MSR, lo, dummy); + rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = (lo >> 8) & 1; for (i = 0; i < num_var_ranges; i++) @@ -318,7 +318,7 @@ void __init get_mtrr_state(void) if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); - rdmsr(MTRRdefType_MSR, lo, dummy); + rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) __flush_tlb(); /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock) __flush_tlb(); /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); @@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i static int generic_have_wrcomb(void) { unsigned long config, dummy; - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c7..8fc248b5aeaf 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) unsigned long config = 0, dummy; if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347a..7538b767f206 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,21 +5,6 @@ #include <linux/types.h> #include <linux/stddef.h> -#define MTRRcap_MSR 0x0fe -#define MTRRdefType_MSR 0x2ff - -#define MTRRfix64K_00000_MSR 0x250 -#define MTRRfix16K_80000_MSR 0x258 -#define MTRRfix16K_A0000_MSR 0x259 -#define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685b..1f5fb1588d1f 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) if (use_intel()) /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else were excluded at the top */ ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { if (use_intel()) /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); else if (is_cpu(CYRIX)) /* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ if (use_intel()) /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ctxt->ccr3); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b8698..81086c227ab7 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; -extern int kstack_depth_to_print; /* The form of the top of the frame on the stack */ struct stack_frame { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 006281302925..7271fa33d791 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -635,14 +635,9 @@ __init void e820_setup_gap(void) #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", @@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? */ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + void __init e820_reserve_resources_late(void) { int i; @@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void) insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + resource_size_t start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)); + if (start == end) + continue; + reserve_region_with_split(&iomem_resource, start, + end - 1, "RAM buffer"); + } } char *__init default_machine_specific_memory_setup(void) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953dee..ebdb85cf2686 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func) } #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } +#endif static void __init ati_bugs(int num, int slot, int func) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e8433..bb01ce080b80 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1379,6 +1379,11 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment do_stack_segment +#endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0cd..dc5ed4bdd88d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -608,13 +608,6 @@ ignore_int: ENTRY(initial_code) .long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - /* * BSS section */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c3fe010d74c8..9a391bbb8ba8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include <asm/io_apic.h> #include <asm/irq.h> #include <asm/idle.h> +#include <asm/hw_irq.h> atomic_t irq_err_count; @@ -24,9 +25,9 @@ void (*generic_interrupt_extension)(void) = NULL; */ void ack_bad_irq(unsigned int irq) { - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N @@ -36,9 +37,7 @@ void ack_bad_irq(unsigned int irq) * completely. * But only ack when the APIC is enabled -AK */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); } #define irq_stats(x) (&per_cpu(irq_stat, x)) @@ -178,7 +177,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_thermal_count; # ifdef CONFIG_X86_64 sum += irq_stats(cpu)->irq_threshold_count; -#endif +# endif #endif return sum; } @@ -213,14 +212,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) irq = __get_cpu_var(vector_irq)[vector]; if (!handle_irq(irq, regs)) { -#ifdef CONFIG_X86_64 - if (!disable_apic) - ack_APIC_irq(); -#endif + ack_APIC_irq(); if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n", - __func__, smp_processor_id(), vector, irq); + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); } irq_exit(); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c index 368b0a8836f9..2e08b10ad51a 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit.c @@ -1,20 +1,25 @@ +#include <linux/linkage.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/timex.h> #include <linux/slab.h> #include <linux/random.h> +#include <linux/kprobes.h> #include <linux/init.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/bitops.h> +#include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> #include <asm/atomic.h> #include <asm/system.h> #include <asm/timer.h> +#include <asm/hw_irq.h> #include <asm/pgtable.h> #include <asm/desc.h> #include <asm/apic.h> @@ -22,7 +27,23 @@ #include <asm/i8259.h> #include <asm/traps.h> +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ +#ifdef CONFIG_X86_32 /* * Note that on a 486, we don't want to do a SIGFPE on an irq13 * as the irq is unreliable, and exception 16 works correctly @@ -52,30 +73,7 @@ static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", }; - -void __init init_ISA_irqs(void) -{ - int i; - -#ifdef CONFIG_X86_LOCAL_APIC - init_bsp_APIC(); #endif - init_8259A(0); - - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} /* * IRQ2 is cascade interrupt to second interrupt controller @@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init init_ISA_irqs(void) { int i; - /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif + init_8259A(0); /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) + * 16 old-style INTA-cycle interrupts: */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + for (i = 0; i < NR_IRQS_LEGACY; i++) { + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); } +} +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) +static void __init smp_intr_init(void) +{ +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -160,16 +166,27 @@ void __init native_init_IRQ(void) /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - /* IPI for single call function */ + /* IPI for generic single function call */ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +#endif /* CONFIG_SMP */ +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); + +#ifdef CONFIG_X86_64 + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -179,16 +196,67 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupts: */ +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); +# endif + #endif +#ifdef CONFIG_X86_32 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#endif +} + +/** + * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +static void __init x86_quirk_pre_intr_init(void) +{ +#ifdef CONFIG_X86_32 + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) + return; + } +#endif + init_ISA_irqs(); +} + +void __init native_init_IRQ(void) +{ + int i; + + /* Execute any quirks before the call gates are initialised: */ + x86_quirk_pre_intr_init(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ + if (!test_bit(i, used_vectors)) + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + } if (!acpi_ioapic) setup_irq(2, &irq2); +#ifdef CONFIG_X86_32 /* * Call quirks after call gates are initialised (usually add in * the architecture specific gates): @@ -203,4 +271,5 @@ void __init native_init_IRQ(void) setup_irq(FPU_IRQ, &fpu_irq); irq_ctx_init(smp_processor_id()); +#endif } diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c deleted file mode 100644 index 8cd10537fd46..000000000000 --- a/arch/x86/kernel/irqinit_64.c +++ /dev/null @@ -1,177 +0,0 @@ -#include <linux/linkage.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> -#include <linux/bitops.h> -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/atomic.h> -#include <asm/system.h> -#include <asm/hw_irq.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/apic.h> -#include <asm/i8259.h> - -/* - * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: - * (these are usually mapped to vectors 0x30-0x3f) - */ - -/* - * The IO-APIC gives us many more interrupt sources. Most of these - * are unused but an SMP system is supposed to have enough memory ... - * sometimes (mostly wrt. hw bugs) we get corrupted vectors all - * across the spectrum, so we really want to be prepared to get all - * of these. Plus, more powerful systems might have more than 64 - * IO-APIC registers. - * - * (these are usually mapped into the 0x30-0xff vector range) - */ - -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ - -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", -}; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 -}; - -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] != -1) - return 1; - } - - return 0; -} - -static void __init init_ISA_irqs(void) -{ - int i; - - init_bsp_APIC(); - init_8259A(0); - - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } -} - -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); -#endif -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); - - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -} - -void __init native_init_IRQ(void) -{ - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - - apic_intr_init(); - - if (!acpi_ioapic) - setup_irq(2, &irq2); -} diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919e..8d82a77a3f3b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 33019ddb56b4..6551dedee20c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -195,7 +195,7 @@ static void kvm_leave_lazy_mmu(void) struct kvm_para_state *state = kvm_para_state(); mmu_queue_flush(state); - paravirt_leave_lazy(paravirt_get_lazy_mode()); + paravirt_leave_lazy_mmu(); state->mode = paravirt_get_lazy_mode(); } diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c6..366baa179913 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h> #include <linux/firmware.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> #include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h> #include <linux/pci.h> -#include <linux/fs.h> -#include <linux/mm.h> #include <asm/microcode.h> #include <asm/processor.h> @@ -79,9 +67,6 @@ struct microcode_amd { #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); - static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { - unsigned long flags; u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) BUG_ON(cpu_num != cpu); if (mc_amd == NULL) - return; + return 0; - spin_lock_irqsave(µcode_update_lock, flags); wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); - return; + return -1; } printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; + + return 0; } static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf) static void free_equiv_cpu_table(void) { - if (equiv_cpu_table) { - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; - } + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: failed to create " "equivalent cpu table\n"); - return -EINVAL; + return UCODE_ERROR; } ucode_ptr += offset; @@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else @@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) if (new_mc) { if (!leftover) { - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - } else + } else { vfree(new_mc); - } + state = UCODE_ERROR; + } + } else + state = UCODE_NFOUND; free_equiv_cpu_table(); - return (int)leftover; + return state; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; const struct firmware *firmware; - int ret; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); + enum ucode_state ret; - ret = request_firmware(&firmware, fw_name, device); - if (ret) { + if (request_firmware(&firmware, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { printk(KERN_INFO "microcode: AMD microcode update via " "/dev/cpu/microcode not supported\n"); - return -1; + return UCODE_ERROR; } static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d1..9c4461501fcb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@ * Thanks to Stuart Swales for pointing out this bug. */ #include <linux/platform_device.h> -#include <linux/capability.h> #include <linux/miscdevice.h> -#include <linux/firmware.h> +#include <linux/capability.h> #include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> #include <linux/cpu.h> #include <linux/fs.h> #include <linux/mm.h> #include <asm/microcode.h> #include <asm/processor.h> -#include <asm/msr.h> MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + #ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - old = current->cpus_allowed; - for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); } -out: - set_cpus_allowed_ptr(current, &old); + return error; } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } get_online_cpus(); mutex_lock(µcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(µcode_mutex); @@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, }; static int __init microcode_dev_init(void) @@ -182,9 +245,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; mutex_lock(µcode_mutex); if (uci->valid) { - err = microcode_ops->request_microcode_fw(smp_processor_id(), - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); + enum ucode_state ustate; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; } mutex_unlock(µcode_mutex); + return err; } static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t sz) + const char *buf, size_t size) { - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; + unsigned long val; int cpu = dev->id; + int ret = 0; + char *end; + val = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (val == 1) { get_online_cpus(); if (cpu_online(cpu)) - err = work_on_cpu(cpu, reload_for_cpu, NULL); + ret = reload_for_cpu(cpu); put_online_cpus(); } - if (err) - return err; - return sz; + + if (!ret) + ret = size; + + return ret; } static ssize_t version_show(struct sys_device *dev, @@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu) uci->valid = 0; } -static void microcode_fini_cpu(int cpu) -{ - mutex_lock(µcode_mutex); - __microcode_fini_cpu(cpu); - mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; + if (!uci->mc) + return UCODE_NFOUND; + + pr_debug("microcode: CPU%d updated upon resume\n", cpu); + apply_microcode_on_target(cpu); + + return UCODE_OK; } -static int microcode_resume_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; + enum ucode_state ustate; - pr_debug("microcode: CPU%d resumed\n", cpu); + if (collect_cpu_info(cpu)) + return UCODE_ERROR; - if (!uci->mc) - return 1; + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", - cpu); - return -1; - } + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); - if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", - cpu); - /* Should we look for a new ucode here? */ - return 1; + if (ustate == UCODE_OK) { + pr_debug("microcode: CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); } - return 0; + return ustate; } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_update_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); - int err = 0; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(smp_processor_id()); - } else { - collect_cpu_info(smp_processor_id()); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw( - smp_processor_id(), - µcode_pdev->dev); - } - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); - return err; -} + if (uci->valid) + ustate = microcode_resume_cpu(cpu); + else + ustate = microcode_init_cpu(cpu); -static int microcode_init_cpu(int cpu) -{ - int err; - mutex_lock(µcode_mutex); - err = work_on_cpu(cpu, microcode_update_cpu, NULL); - mutex_unlock(µcode_mutex); - - return err; + return ustate; } static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) return err; - err = microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu) == UCODE_ERROR) + err = -EINVAL; return err; } @@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) static int mc_sysdev_resume(struct sys_device *dev) { int cpu = dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(NULL); + /* + * All non-bootup cpus are still disabled, + * so only CPU 0 will apply ucode here. + * + * Moreover, there can be no concurrent + * updates from any other places at this point. + */ + WARN_ON(cpu != 0); + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + return 0; } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (microcode_init_cpu(cpu)) - printk(KERN_ERR "microcode: failed to init CPU%d\n", - cpu); + microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); + pr_err("microcode: Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +508,10 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { @@ -480,23 +520,31 @@ static int __init microcode_init(void) } get_online_cpus(); + mutex_lock(µcode_mutex); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); + if (error) { - microcode_dev_exit(); platform_device_unregister(microcode_pdev); return error; } + error = microcode_dev_init(); + if (error) + return error; + register_hotcpu_notifier(&mc_cpu_notifier); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>," " Peter Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { @@ -505,16 +553,17 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); get_online_cpus(); + mutex_lock(µcode_mutex); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); platform_device_unregister(microcode_pdev); microcode_ops = NULL; - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1ab..0d334ddd0a96 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ -#include <linux/platform_device.h> -#include <linux/capability.h> -#include <linux/miscdevice.h> #include <linux/firmware.h> -#include <linux/smp_lock.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/fs.h> -#include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/microcode.h> #include <asm/processor.h> @@ -150,13 +137,9 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - spin_unlock_irqrestore(µcode_update_lock, flags); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - csig->sig, csig->pf, csig->rev); + printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) return 0; } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; - unsigned long flags; unsigned int val[2]; int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu) BUG_ON(cpu_num != cpu); if (mc_intel == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); + return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", - cpu_num, uci->cpu_sig.rev, val[1]); - return; + printk(KERN_ERR "microcode: CPU%d update " + "to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->cpu_sig.rev, val[1], + printk(KERN_INFO "microcode: CPU%d updated to revision " + "0x%x, date = %04x-%02x-%02x \n", + cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + + return 0; } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u8 *ucode_ptr = data, *new_mc = NULL, *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; + enum ucode_state state = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (!new_mc) + if (leftover) { + if (new_mc) + vfree(new_mc); + state = UCODE_ERROR; goto out; + } - if (leftover) { - vfree(new_mc); + if (!new_mc) { + state = UCODE_NFOUND; goto out; } @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - - out: - return (int)leftover; +out: + return state; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - ret = request_firmware(&firmware, name, device); - if (ret) { + + if (request_firmware(&firmware, name, device)) { pr_debug("microcode: data file %s load failed\n", name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) return copy_from_user(to, from, n); } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70fd7e414c15..651c93b28862 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -17,6 +17,7 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/smp.h> +#include <linux/pci.h> #include <asm/mtrr.h> #include <asm/mpspec.h> @@ -870,24 +871,17 @@ static inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} #endif /* CONFIG_X86_IO_APIC */ -static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, - int count) +static int +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - if (!mpc_new_phys) { - pr_info("No spare slots, try to append...take your risk, " - "new mpc_length %x\n", count); - } else { - if (count <= mpc_new_length) - pr_info("No spare slots, try to append..., " - "new mpc_length %x\n", count); - else { - pr_err("mpc_new_length %lx is too small\n", - mpc_new_length); - return -1; - } + int ret = 0; + + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; } - return 0; + return ret; } static int __init replace_intsrc_all(struct mpc_table *mpc, @@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); - if (!check_slot(mpc_new_phys, mpc_new_length, count)) + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; assign_to_mpc_intsrc(&mp_irqs[i], m); mpc->length = count; @@ -963,11 +957,14 @@ out: return 0; } -static int __initdata enable_update_mptable; +int enable_update_mptable; static int __init update_mptable_setup(char *str) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif return 0; } early_param("update_mptable", update_mptable_setup); @@ -980,6 +977,9 @@ static int __initdata alloc_mptable; static int __init parse_alloc_mptable_opt(char *p) { enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif alloc_mptable = 1; if (!p) return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9faf43bea336..70ec9b951d76 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - BUG_ON(preemptible()); + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - __get_cpu_var(paravirt_lazy_mode) = mode; + percpu_write(paravirt_lazy_mode, mode); } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); - BUG_ON(preemptible()); + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); - __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void) void paravirt_leave_lazy_mmu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_MMU); + leave_lazy(PARAVIRT_LAZY_MMU); } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev) { + BUG_ON(preemptible()); + + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } enter_lazy(PARAVIRT_LAZY_CPU); } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next) { - paravirt_leave_lazy(PARAVIRT_LAZY_CPU); + BUG_ON(preemptible()); + + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { - return __get_cpu_var(paravirt_lazy_mode); + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + + return percpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) @@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } @@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_enable(); } -void arch_flush_lazy_cpu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { - WARN_ON(preempt_count() == 1); - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = { .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - }, + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, }; struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f3..971a3bec47a8 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; - unsigned long badbit; unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", - badbit, tbl, start_addr, npages); - } - iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long badbit; unsigned long badend; unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void) iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035c..cfd9f9063896 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void) } #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0);\ - } while (0) - -#define CLEAR_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; \ - } while (0) - /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; static void dump_leak(void) { - int i; static int dump; - if (dump || !iommu_leak_tab) + if (dump) return; dump = 1; - show_stack(NULL, NULL); - /* Very crude. dump some from the end of the table too */ - printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", - iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i += 2) { - printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], - 0); - printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk(KERN_DEBUG "\n"); + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); phys_mem += PAGE_SIZE; } return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } @@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - enable_gart_translations(); - error = sysdev_class_register(&gart_sysdev_class); if (!error) error = sysdev_register(&device_gart); @@ -801,11 +769,12 @@ void __init gart_iommu_init(void) #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(iommu_pages*sizeof(void *))); - if (!iommu_leak_tab) + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) printk(KERN_DEBUG - "PCI-DMA: Cannot allocate leak trace area\n"); + "PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -845,6 +814,14 @@ void __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); /* * Try to workaround a bug (thanks to BenH): diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 221a3853e268..a1712f2b50f1 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e847..e22d63bdc8ff 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,9 +8,11 @@ #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <linux/random.h> #include <trace/power.h> #include <asm/system.h> #include <asm/apic.h> +#include <asm/syscalls.h> #include <asm/idle.h> #include <asm/uaccess.h> #include <asm/i387.h> @@ -613,3 +615,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a2..c60924b5d123 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <stdarg.h> - #include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> @@ -33,7 +31,6 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/ptrace.h> -#include <linux/random.h> #include <linux/personality.h> #include <linux/tick.h> #include <linux/percpu.h> @@ -407,7 +404,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the @@ -497,15 +494,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b1..45f010fb2e20 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include <stdarg.h> - #include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> @@ -32,7 +30,6 @@ #include <linux/delay.h> #include <linux/module.h> #include <linux/ptrace.h> -#include <linux/random.h> #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> @@ -428,7 +425,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(next_p); /* * Switch FS and GS. @@ -660,15 +657,3 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 7563b31b4f03..af71d06624bf 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,5 +491,42 @@ void force_hpet_resume(void) break; } } +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(dev); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 667188e0b5a0..d2d1ce8170f0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf63..d1c636bf31a7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@ #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; @@ -214,8 +222,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p) #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; @@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); + reserve_brk(); /* max_pfn_mapped is updated here */ @@ -997,24 +1015,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 /** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. - **/ -void __init x86_quirk_pre_intr_init(void) -{ - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } - init_ISA_irqs(); -} - -/** * x86_quirk_intr_init - post gate setup interrupt initialisation * * Description: diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8f0e13be36b3..9c3f0823e6aa 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) + /* + * make sure boot cpu node_number is right, when boot cpu is on the + * node that doesn't have mem installed + */ + per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif + /* Setup node to cpumask map */ setup_node_to_cpumask_map(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 13f33ea8ccaa..f6db48c405b8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) } struct smp_ops smp_ops = { - .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, - .smp_prepare_cpus = native_smp_prepare_cpus, - .smp_cpus_done = native_smp_cpus_done, + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, - .smp_send_reschedule = native_smp_send_reschedule, + .smp_send_stop = native_smp_send_stop, + .smp_send_reschedule = native_smp_send_reschedule, - .cpu_up = native_cpu_up, - .cpu_die = native_cpu_die, - .cpu_disable = native_cpu_disable, - .play_dead = native_play_dead, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, - .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d8..7c80007ea5f7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -int __devinit +int __cpuinit wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -int __devinit +static int __cpuinit wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; @@ -822,10 +822,12 @@ do_rest: /* mark "stuck" area as not stuck */ *((volatile unsigned long *)trampoline_base) = 0; - /* - * Cleanup possible dangling ends... - */ - smpboot_restore_warm_reset_vector(); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } return boot_error; } @@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk(KERN_ERR "... forcing use of dummy APIC emulation." + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation." "(tell your hw vendor)\n"); + } smpboot_clear_io_apic(); arch_disable_smp_support(); return -1; diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ed0c33761e6d..124d40c575df 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode) struct bau_desc *adp; struct bau_desc *ad2; - adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node); + /* + * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + */ + adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ @@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode) (n << UV_DESC_BASE_PNODE_SHIFT | m)); } - for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) { + /* + * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 nodes. + */ + for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, ad2++) { memset(ad2, 0, sizeof(struct bau_desc)); ad2->header.sw_ack_flag = 1; /* @@ -832,7 +843,7 @@ static int __init uv_bau_init(void) return 0; for_each_possible_cpu(cur_cpu) - alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), + zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..ede024531f8f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } @@ -969,11 +966,8 @@ void __init trap_init(void) for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); -#ifdef CONFIG_X86_64 set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else - set_bit(SYSCALL_VECTOR, used_vectors); -#endif + /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d57de05dc430..3e1c057e98fe 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); @@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs) #ifdef CONFIG_X86_64 static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)vget_cycles(); + cycle_t ret; + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + rdtsc_barrier(); return ret >= __vsyscall_gtod_data.clock.cycle_last ? ret : __vsyscall_gtod_data.clock.cycle_last; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef9..027b5b498993 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; * of a critical section, to be able to prove TSC time-warps: */ static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk(KERN_INFO - "Skipping synchronization checks as TSC is reliable.\n"); + pr_info("Skipping synchronization checks as TSC is reliable.\n"); return; } - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); + pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); /* * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) if (nr_warps) { printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } -#undef NR_LOOPS - diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1c..9c4e62539058 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs.pt.ds = 0; info->regs.pt.es = 0; info->regs.pt.fs = 0; - -/* we are clearing gs later just before "jmp resume_userspace", - * because it is not saved/restored. - */ +#ifndef CONFIG_X86_32_LAZY_GS + info->regs.pt.gs = 0; +#endif /* * The flags register is also special: we cannot trust that the user @@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = 0; + info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); @@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS "mov %2, %%gs\n\t" +#endif "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95deb9f2211e..b263423fbe2a 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev) { - paravirt_enter_lazy_cpu(); + paravirt_start_context_switch(prev); vmi_ops.set_lazy_mode(2); } +static void vmi_end_context_switch(struct task_struct *next) +{ + vmi_ops.set_lazy_mode(0); + paravirt_end_context_switch(next); +} + static void vmi_enter_lazy_mmu(void) { paravirt_enter_lazy_mmu(); vmi_ops.set_lazy_mode(1); } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_mmu(); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -711,14 +717,14 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); - para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f013..4c85b2e2bb65 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,431 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org> + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + #ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" +#define LOAD_OFFSET __PAGE_OFFSET #else -# include "vmlinux_64.lds.S" +#define LOAD_OFFSET __START_KERNEL_map #endif + +#include <asm-generic/vmlinux.lds.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/page_types.h> +#include <asm/cache.h> +#include <asm/boot.h> + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(7); /* RWE */ +#endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} + +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + + /* Text and read-only data */ + + /* bootstrapping code */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* The rest of the text */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) +#endif + . = ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + + /* Data */ + . = ALIGN(PAGE_SIZE); + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + +#ifdef CONFIG_X86_64 + /* End of data section */ + _edata = .; +#endif + } :data + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } +#endif + + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +#endif + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +#endif + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + +#ifdef CONFIG_X86_32 + /* End of data section */ + _edata = .; +#endif + } + +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); + + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } + + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); + + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } + + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ + + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } +#ifdef CONFIG_X86_64 + :data.init +#endif + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#ifdef CONFIG_BLK_DEV_INITRD + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) +#else + PERCPU(PAGE_SIZE) +#endif + + . = ALIGN(PAGE_SIZE); + + /* freed after init ends here */ + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ +#endif + + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } + + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = .; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG +} + + +#ifdef CONFIG_X86_32 +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index 62ad500d55f3..000000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,229 +0,0 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include <asm-generic/vmlinux.lds.h> -#include <asm/thread_info.h> -#include <asm/page_types.h> -#include <asm/cache.h> -#include <asm/boot.h> - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} -SECTIONS -{ - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - /* writeable */ - . = ALIGN(PAGE_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - DATA_DATA - CONSTRUCTORS - } :data - - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - _edata = .; /* End of data section */ - } - - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } - - STABS_DEBUG - - DWARF_DEBUG -} - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index c8742507b030..000000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,298 +0,0 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include <asm-generic/vmlinux.lds.h> -#include <asm/asm-offsets.h> -#include <asm/page_types.h> - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ -#endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 - - NOTES :text :note - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - _edata = .; /* End of data section */ - } :data - - - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } - - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); /* init_task */ - *(.data.init_task) - }:data.init - - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } - - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - -#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else - PERCPU(PAGE_SIZE) -#endif - - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - - STABS_DEBUG - - DWARF_DEBUG -} - - /* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include <asm/kexec.h> - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 44153afc9067..25ee06a80aad 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) return; } - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); now = vread(); - rdtsc_barrier(); - base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 33a93b417396..4e0c26559395 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -167,10 +167,16 @@ static void lazy_hcall3(unsigned long call, /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. */ -static void lguest_leave_lazy_mode(void) +static void lguest_leave_lazy_mmu_mode(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); kvm_hypercall0(LHCALL_FLUSH_ASYNC); + paravirt_leave_lazy_mmu(); +} + +static void lguest_end_context_switch(struct task_struct *next) +{ + kvm_hypercall0(LHCALL_FLUSH_ASYNC); + paravirt_end_context_switch(next); } /*G:033 @@ -637,7 +643,7 @@ static void __init lguest_init_IRQ(void) void lguest_setup_irq(unsigned int irq) { - irq_to_desc_alloc_cpu(irq, 0); + irq_to_desc_alloc_node(irq, 0); set_irq_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); } @@ -1054,8 +1060,8 @@ __init void lguest_init(void) pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; - pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_cpu_ops.start_context_switch = paravirt_start_context_switch; + pv_cpu_ops.end_context_switch = lguest_end_context_switch; /* pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; @@ -1068,7 +1074,7 @@ __init void lguest_init(void) pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; pv_mmu_ops.pte_update = lguest_pte_update; pv_mmu_ops.pte_update_defer = lguest_pte_update; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e7277cbcfb40..a725b7f760ae 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; + int width = sizeof(unsigned long) * 2; /* * Now print the actual finished series */ - seq_printf(m, "0x%p-0x%p ", - (void *)st->start_address, - (void *)st->current_address); + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); delta = (st->current_address - st->start_address) >> 10; while (!(delta & 1023) && unit[1]) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa0..5ec7ae366615 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,40 +3,16 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include <linux/interrupt.h> -#include <linux/mmiotrace.h> -#include <linux/bootmem.h> -#include <linux/compiler.h> -#include <linux/highmem.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/vt_kern.h> -#include <linux/signal.h> -#include <linux/kernel.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/module.h> -#include <linux/kdebug.h> -#include <linux/errno.h> -#include <linux/magic.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/mman.h> -#include <linux/tty.h> -#include <linux/smp.h> -#include <linux/mm.h> - -#include <asm-generic/sections.h> - -#include <asm/tlbflush.h> -#include <asm/pgalloc.h> -#include <asm/segment.h> -#include <asm/system.h> -#include <asm/proto.h> -#include <asm/traps.h> -#include <asm/desc.h> +#include <linux/magic.h> /* STACK_END_MAGIC */ +#include <linux/sched.h> /* test_thread_flag(), ... */ +#include <linux/kdebug.h> /* oops_begin/end, ... */ +#include <linux/module.h> /* search_exception_table */ +#include <linux/bootmem.h> /* max_low_pfn */ +#include <linux/kprobes.h> /* __kprobes, ... */ +#include <linux/mmiotrace.h> /* kmmio_handler, ... */ + +#include <asm/traps.h> /* dotraplinkage, ... */ +#include <asm/pgalloc.h> /* pgd_*(), ... */ /* * Page fault error code bits: @@ -225,12 +201,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pmd_present(*pmd_k)) return NULL; - if (!pmd_present(*pmd)) { + if (!pmd_present(*pmd)) set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { + else BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } return pmd_k; } @@ -538,8 +512,6 @@ bad: static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int once; - if (address != regs->ip) return 0; @@ -549,10 +521,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!once) { - printk(errata93_warning); - once = 1; - } + printk_once(errata93_warning); regs->ip = address; return 1; } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 8126e8d1a2a4..58f621e81919 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); return (void *)vaddr; } @@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif } - arch_flush_lazy_mmu_mode(); pagefault_enable(); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ae4f7b5d7104..34c1bfb64f1c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include <linux/initrd.h> #include <linux/ioport.h> #include <linux/swap.h> @@ -10,6 +11,9 @@ #include <asm/setup.h> #include <asm/system.h> #include <asm/tlbflush.h> +#include <asm/tlb.h> + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; @@ -23,6 +27,69 @@ int direct_gbpages #endif ; +int nx_enabled; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", noexec_setup); +#endif + +#ifdef CONFIG_X86_PAE +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} +#else +static inline void set_nx(void) +{ +} +#endif + +#ifdef CONFIG_X86_64 +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; +} +#endif + static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { @@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse, */ #ifdef CONFIG_X86_32 start = 0x7000; - e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, - tables, PAGE_SIZE); -#else /* CONFIG_X86_64 */ +#else start = 0x8000; - e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE); #endif + e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, + tables, PAGE_SIZE); if (e820_table_start == -1UL) panic("Cannot find space for the kernel page tables"); @@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif /* Enable PSE if available */ if (cpu_has_pse) @@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } -#endif if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 749559ed80f5..949708d7a481 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,12 +49,9 @@ #include <asm/paravirt.h> #include <asm/setup.h> #include <asm/cacheflush.h> +#include <asm/page_types.h> #include <asm/init.h> -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); @@ -587,61 +584,9 @@ void zap_low_mappings(void) flush_tlb_all(); } -int nx_enabled; - pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); -#ifdef CONFIG_X86_PAE - -static int disable_nx __initdata; - -/* - * noexec = on|off - * - * Control non executable mappings. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str || !strcmp(str, "on")) { - if (cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } - } else { - if (!strcmp(str, "off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else { - return -EINVAL; - } - } - - return 0; -} -early_param("noexec", noexec_setup); - -void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#endif - /* user-defined highmem size */ static unsigned int highmem_pages = -1; @@ -761,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn, highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - memory_present(0, 0, highend_pfn); e820_register_active_regions(0, 0, highend_pfn); + sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - memory_present(0, 0, max_low_pfn); e820_register_active_regions(0, 0, max_low_pfn); + sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1753e8020df6..52bb9519bb86 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -50,18 +50,8 @@ #include <asm/cacheflush.h> #include <asm/init.h> -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on); pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int disable_nx __cpuinitdata; - -/* - * noexec=on|off - * Control non-executable mappings for 64-bit processes. - * - * on Enable (default) - * off Disable - */ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - int force_personality32; /* @@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); } +#endif void __init paging_init(void) { @@ -638,11 +596,10 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; - memory_present(0, 0, max_pfn); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); free_area_init_nodes(max_zone_pfns); } -#endif /* * Memory hotplug specific functions diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 8056545e2d39..fe6f84ca121e 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type) if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) kpte_clear_flush(kmap_pte-idx, vaddr); - arch_flush_lazy_mmu_mode(); pagefault_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 605c8be06217..c0bedcd10f97 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) static void __init memtest(u64 pattern, u64 start_phys, u64 size) { - u64 i, count; - u64 *start; + u64 *p; + void *start, *end; u64 start_bad, last_bad; u64 start_phys_aligned; size_t incr; incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); - count = (size - (start_phys_aligned - start_phys))/incr; start = __va(start_phys_aligned); + end = start + size - (start_phys_aligned - start_phys); start_bad = 0; last_bad = 0; - for (i = 0; i < count; i++) - start[i] = pattern; - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start == pattern) + for (p = start; p < end; p++) + *p = pattern; + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) continue; if (start_phys_aligned == last_bad + incr) { last_bad += incr; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2d05a12029dc..459913beac71 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start, } /* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end) +void __init +setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; + const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); unsigned long bootmap_start, nodedata_phys; void *bootmap; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; if (!end) return; + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, @@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); -#ifdef CONFIG_ACPI_NUMA - srat_reserve_add_area(nodeid); -#endif node_set_online(nodeid); } @@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; - max_zone_pfns[ZONE_NORMAL] = max_pfn; - - sparse_memory_present_with_active_regions(MAX_NUMNODES); - sparse_init(); - - free_area_init_nodes(max_zone_pfns); -} - static __init int numa_setup(char *opt) { if (!opt) @@ -606,8 +595,6 @@ static __init int numa_setup(char *opt) #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt, "noacpi", 6)) acpi_numa = -1; - if (!strncmp(opt, "hotadd=", 7)) - hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 0; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e17efed088c5..6ce9518fe2ac 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, vm_unmap_aliases(); - /* - * If we're called with lazy mmu updates enabled, the - * in-memory pte state may be stale. Flush pending updates to - * bring them up to date. - */ - arch_flush_lazy_mmu_mode(); - cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; @@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, } else cpa_flush_all(cache); - /* - * If we've been called with lazy mmu updates enabled, then - * make sure that everything gets flushed out before we - * return. - */ - arch_flush_lazy_mmu_mode(); - out: return ret; } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 01765955baaf..2dfcbf9df2ae 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata; static nodemask_t cpu_nodes_parsed __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes_add[MAX_NUMNODES]; -static int found_add_area __initdata; -int hotadd_percent __initdata = 0; static int num_node_memblks __initdata; static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; -/* Too small nodes confuse the VM badly. Usually they result - from BIOS bugs. */ -#define NODE_MIN_SIZE (4*1024*1024) - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end) { struct bootnode *nd = &nodes[i]; - if (found_add_area) - return; - if (nd->start < start) { nd->start = start; if (nd->end < nd->start) @@ -86,7 +77,6 @@ static __init void bad_srat(void) int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - found_add_area = 0; for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_to_node[i] = NUMA_NO_NODE; for (i = 0; i < MAX_NUMNODES; i++) @@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm, apic_id, node); } -static int update_end_of_memory(unsigned long end) {return -1;} -static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif /* - * Update nodes_add and decide if to include add are in the zone. - * Both SPARSE and RESERVE need nodes_add information. - * This code supports one contiguous hot add area per node. + * Update nodes_add[] + * This code supports one contiguous hot add area per node */ -static int __init -reserve_hotadd(int node, unsigned long start, unsigned long end) +static void __init +update_nodes_add(int node, unsigned long start, unsigned long end) { unsigned long s_pfn = start >> PAGE_SHIFT; unsigned long e_pfn = end >> PAGE_SHIFT; - int ret = 0, changed = 0; + int changed = 0; struct bootnode *nd = &nodes_add[node]; /* I had some trouble with strange memory hotadd regions breaking @@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) mistakes */ if ((signed long)(end - start) < NODE_MIN_SIZE) { printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return -1; + return; } /* This check might be a bit too strict, but I'm keeping it for now. */ @@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug area %lu -> %lu has existing memory\n", s_pfn, e_pfn); - return -1; - } - - if (!hotadd_enough_memory(&nodes_add[node])) { - printk(KERN_ERR "SRAT: Hotplug area too large\n"); - return -1; + return; } /* Looks good */ @@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - ret = update_end_of_memory(nd->end); - if (changed) - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); - return ret; + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", + nd->start, nd->end); } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start, end); e820_register_active_regions(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - push_node_boundaries(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && - (reserve_hotadd(node, start, end) < 0)) { - /* Ignore hotadd region. Undo damage */ - printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + update_nodes_add(node, start, end); + /* restore nodes[node] */ *nd = oldnode; if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); @@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) pxmram = 0; } - e820ram = max_pfn - absent_pages_in_range(0, max_pfn); - /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ - if ((long)(e820ram - pxmram) >= 1*1024*1024) { + e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { printk(KERN_ERR "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", (pxmram << PAGE_SHIFT) >> 20, @@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) return 1; } -static void __init unparse_node(int node) -{ - int i; - node_clear(node, nodes_parsed); - node_clear(node, cpu_nodes_parsed); - for (i = 0; i < MAX_LOCAL_APIC; i++) { - if (apicid_to_node[i] == node) - apicid_to_node[i] = NUMA_NO_NODE; - } -} - void __init acpi_numa_arch_fixup(void) {} /* Use the information discovered above to actually set up the nodes. */ @@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); - /* - * don't confuse VM with a node that doesn't have the - * minimum memory. - */ - if (nodes[i].end && - (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { - unparse_node(i); - node_set_offline(i); - } - } if (!nodes_cover_memory(nodes)) { bad_srat(); @@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; - if (!node_isset(node, node_possible_map)) + if (!node_online(node)) numa_clear_node(i); } numa_init_array(); @@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b) } #endif /* CONFIG_NUMA_EMU */ -void __init srat_reserve_add_area(int nodeid) -{ - if (found_add_area && nodes_add[nodeid].end) { - u64 total_mb; - - printk(KERN_INFO "SRAT: Reserving hot-add memory space " - "for node %d at %Lx-%Lx\n", - nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); - total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) - >> PAGE_SHIFT; - total_mb *= sizeof(struct page); - total_mb >>= 20; - printk(KERN_INFO "SRAT: This will cost you %Lu MB of " - "pre-allocated memory.\n", (unsigned long long)total_mb); - reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start, - BOOTMEM_DEFAULT); - } -} - int __node_distance(int a, int b) { int index; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index fecbce6e7d7c..0696d506c4ad 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 0; } + if (io_apic_assign_pci_irqs) + return 0; + /* Find IRQ routing entry */ if (!pirq_table) @@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void) pirq_penalty[dev->irq]++; } + if (io_apic_assign_pci_irqs) + return; + dev = NULL; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; -#ifdef CONFIG_X86_IO_APIC - /* - * Recalculate IRQ numbers if we use the I/O APIC. - */ - if (io_apic_assign_pci_irqs) { - int irq; - - /* - * interrupt pins are numbered starting from 1 - */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin - 1); - /* - * Busses behind bridges are typically not listed in the - * MP-table. In this case we have to look up the IRQ - * based on the parent bus, parent slot, and pin number. - * The SMP code detects such bridged busses itself so we - * should get into this branch reliably. - */ - if (irq < 0 && dev->bus->parent) { - /* go back to the bridge */ - struct pci_dev *bridge = dev->bus->self; - int bus; - - pin = pci_swizzle_interrupt_pin(dev, pin); - bus = bridge->bus->number; - irq = IO_APIC_get_PCI_irq_vector(bus, - PCI_SLOT(bridge->devfn), pin - 1); - if (irq >= 0) - dev_warn(&dev->dev, - "using bridge %s INT %c to " - "get IRQ %d\n", - pci_name(bridge), - 'A' + pin - 1, irq); - } - if (irq >= 0) { - dev_info(&dev->dev, - "PCI->APIC IRQ transform: INT %c " - "-> IRQ %d\n", - 'A' + pin - 1, irq); - dev->irq = irq; - } - } -#endif /* * Still no IRQ? Try to lookup one... */ @@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void) pcibios_enable_irq = pirq_enable_irq; pcibios_fixup_irqs(); + + if (io_apic_assign_pci_irqs && pci_routeirq) { + struct pci_dev *dev = NULL; + /* + * PCI IRQ routing is set up by pci_enable_device(), but we + * also do it here in case there are still broken drivers that + * don't use pci_enable_device(). + */ + printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); + for_each_pci_dev(dev) + pirq_enable_irq(dev); + } + return 0; } @@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active) static int pirq_enable_irq(struct pci_dev *dev) { u8 pin; - struct pci_dev *temp_dev; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { + if (pin && !pcibios_lookup_irq(dev, 1)) { char *msg = ""; + if (!io_apic_assign_pci_irqs && dev->irq) + return 0; + if (io_apic_assign_pci_irqs) { +#ifdef CONFIG_X86_IO_APIC + struct pci_dev *temp_dev; int irq; + struct io_apic_irq_attr irq_attr; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1); + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), + pin - 1, &irq_attr); /* * Busses behind bridges are typically not listed in the MP-table. * In this case we have to look up the IRQ based on the parent bus, @@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev) pin = pci_swizzle_interrupt_pin(dev, pin); irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin - 1); + PCI_SLOT(bridge->devfn), + pin - 1, &irq_attr); if (irq >= 0) dev_warn(&dev->dev, "using bridge %s " "INT %c to get IRQ %d\n", @@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { + io_apic_set_pci_routing(&dev->dev, irq, + &irq_attr); + dev->irq = irq; dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); - dev->irq = irq; return 0; } else msg = "; probably buggy MP table"; +#endif } else if (pci_probe & PCI_BIOS_IRQ_SCAN) msg = ""; else diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7133cdf9098b..cac083386e03 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/random.h> +#include <linux/elf.h> #include <asm/vsyscall.h> #include <asm/vgtod.h> #include <asm/proto.h> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f09e8c36ee80..0a1700a2be9c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/start_kernel.h> #include <linux/sched.h> +#include <linux/kprobes.h> #include <linux/bootmem.h> #include <linux/module.h> #include <linux/mm.h> @@ -44,6 +45,7 @@ #include <asm/processor.h> #include <asm/proto.h> #include <asm/msr-index.h> +#include <asm/traps.h> #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgtable.h> @@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -void xen_leave_lazy(void) +static void xen_end_context_switch(struct task_struct *next) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); + paravirt_end_context_switch(next); } static unsigned long xen_store_tr(void) @@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { + unsigned long addr; + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) return 0; info->vector = vector; - info->address = gate_offset(*val); + + addr = gate_offset(*val); +#ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them + * appropriately. The debugger ones are the only ones we care + * about. Xen will handle faults like double_fault and + * machine_check, so we should never see them. Warn if + * there's an unexpected IST-using fault handler. + */ + if (addr == (unsigned long)debug) + addr = (unsigned long)xen_debug; + else if (addr == (unsigned long)int3) + addr = (unsigned long)xen_int3; + else if (addr == (unsigned long)stack_segment) + addr = (unsigned long)xen_stack_segment; + else if (addr == (unsigned long)double_fault || + addr == (unsigned long)nmi) { + /* Don't need to handle these */ + return 0; +#ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { + return 0; +#endif + } else { + /* Some other trap using IST? */ + if (WARN_ON(val->ist != 0)) + return 0; + } +#endif /* CONFIG_X86_64 */ + info->address = addr; + info->cs = gate_segment(*val); info->flags = val->dpl; /* interrupt gates clear IF */ @@ -623,10 +658,26 @@ static void xen_clts(void) xen_mc_issue(PARAVIRT_LAZY_CPU); } +static DEFINE_PER_CPU(unsigned long, xen_cr0_value); + +static unsigned long xen_read_cr0(void) +{ + unsigned long cr0 = percpu_read(xen_cr0_value); + + if (unlikely(cr0 == 0)) { + cr0 = native_read_cr0(); + percpu_write(xen_cr0_value, cr0); + } + + return cr0; +} + static void xen_write_cr0(unsigned long cr0) { struct multicall_space mcs; + percpu_write(xen_cr0_value, cr0); + /* Only pay attention to cr0.TS; everything else is ignored. */ mcs = xen_mc_entry(0); @@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .clts = xen_clts, - .read_cr0 = native_read_cr0, + .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, @@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { /* Xen takes care of %gs when switching to usermode for us */ .swapgs = paravirt_nop, - .lazy_mode = { - .enter = paravirt_enter_lazy_cpu, - .leave = xen_leave_lazy, - }, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, }; static const struct pv_apic_ops xen_apic_ops __initdata = { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fba55b1a4021..4ceb28581652 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - /* updates to init_mm may be done without lock */ - if (mm == &init_mm) - preempt_disable(); - ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, } xen_set_pte(ptep, pteval); -out: - if (mm == &init_mm) - preempt_enable(); +out: return; } pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, @@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info) /* If this cpu still has a stale cr3 reference, then make sure it has been flushed. */ - if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { + if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) load_cr3(swapper_pg_dir); - arch_flush_lazy_cpu_mode(); - } } static void xen_drop_mm_ref(struct mm_struct *mm) @@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); - arch_flush_lazy_cpu_mode(); } /* Get the "official" set of cpus referring to our pagetable. */ @@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void) xen_mark_init_mm_pinned(); } +static void xen_leave_lazy_mmu(void) +{ + preempt_disable(); + xen_mc_flush(); + paravirt_leave_lazy_mmu(); + preempt_enable(); +} + const struct pv_mmu_ops xen_mmu_ops __initdata = { .pagetable_setup_start = xen_pagetable_setup_start, .pagetable_setup_done = xen_pagetable_setup_done, @@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .lazy_mode = { .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, + .leave = xen_leave_lazy_mmu, }, .set_fixmap = xen_set_fixmap, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 15c6c68db6a2..ad0047f47cd4 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -61,9 +61,9 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ - e820_add_region(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list, - E820_RESERVED); + reserve_early(__pa(xen_start_info->mfn_list), + __pa(xen_start_info->pt_base), + "XEN START INFO"); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index ca6596b05d53..22494fd4c9b5 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_leave_lazy(void); void xen_post_allocator_init(void); char * __init xen_memory_setup(void); diff --git a/block/bsg.c b/block/bsg.c index 206060e795da..dd81be455e00 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -315,6 +315,7 @@ out: blk_put_request(rq); if (next_rq) { blk_rq_unmap_user(next_rq->bio); + next_rq->bio = NULL; blk_put_request(next_rq); } return ERR_PTR(ret); @@ -448,6 +449,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, hdr->dout_resid = rq->data_len; hdr->din_resid = rq->next_rq->data_len; blk_rq_unmap_user(bidi_bio); + rq->next_rq->bio = NULL; blk_put_request(rq->next_rq); } else if (rq_data_dir(rq) == READ) hdr->din_resid = rq->data_len; @@ -466,6 +468,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, blk_rq_unmap_user(bio); if (rq->cmd != rq->__cmd) kfree(rq->cmd); + rq->bio = NULL; blk_put_request(rq); return ret; diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 51b9f8280f88..2faa9e2ac893 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev) /* Interrupt Line values above 0xF are forbidden */ if (dev->irq > 0 && (dev->irq <= 0xF)) { printk(" - using IRQ %d\n", dev->irq); - acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE, + acpi_register_gsi(&dev->dev, dev->irq, + ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); return 0; } else { @@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev) } } - rc = acpi_register_gsi(gsi, triggering, polarity); + rc = acpi_register_gsi(&dev->dev, gsi, triggering, polarity); if (rc < 0) { dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n", pin_name(pin)); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 45ad3288c5ff..23f0fb84f1c1 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -844,7 +844,7 @@ static int acpi_processor_add(struct acpi_device *device) if (!pr) return -ENOMEM; - if (!alloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) { kfree(pr); return -ENOMEM; } diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 340ba4f9dc54..4a9f3492b921 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp) break; } - gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE, + gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); if (gsi > 0) break; @@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data) irqp = &res->data.extended_irq; for (i = 0; i < irqp->interrupt_count; i++) { - irq = acpi_register_gsi(irqp->interrupts[i], + irq = acpi_register_gsi(NULL, irqp->interrupts[i], irqp->triggering, irqp->polarity); if (irq < 0) return AE_ERROR; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 65e12bca657c..f96d0bef855e 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -694,9 +694,8 @@ static ssize_t read_zero(struct file * file, char __user * buf, written += chunk - unwritten; if (unwritten) break; - /* Consider changing this to just 'signal_pending()' with lots of testing */ - if (fatal_signal_pending(current)) - return written ? written : -EINTR; + if (signal_pending(current)) + return written ? written : -ERESTARTSYS; buf += chunk; count -= chunk; cond_resched(); diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index a420e8d437dd..13f8871e5b21 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -2711,7 +2711,7 @@ static int __init mxser_module_init(void) continue; brd = &mxser_boards[m]; - retval = mxser_get_ISA_conf(!ioaddr[b], brd); + retval = mxser_get_ISA_conf(ioaddr[b], brd); if (retval <= 0) { brd->info = NULL; continue; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 47d2ad0ae079..6e2ec0b18948 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -808,7 +808,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = -ENOMEM; goto nomem_out; } - if (!alloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&policy->related_cpus, GFP_KERNEL)) { free_cpumask_var(policy->cpus); kfree(policy); ret = -ENOMEM; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d400aef8d9b..bb37fb1b2d82 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -362,7 +362,7 @@ static void raid5_unplug_device(struct request_queue *q); static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, - int previous, int noblock) + int previous, int noblock, int noquiesce) { struct stripe_head *sh; @@ -372,7 +372,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, do { wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0, + conf->quiesce == 0 || noquiesce, conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { @@ -2671,7 +2671,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1); + sh2 = get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -2944,7 +2944,7 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3189,7 +3189,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3288,7 +3288,7 @@ static void unplug_slaves(mddev_t *mddev) int i; rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { + for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -3675,7 +3675,7 @@ static int make_request(struct request_queue *q, struct bio * bi) (unsigned long long)logical_sector); sh = get_active_stripe(conf, new_sector, previous, - (bi->bi_rw&RWA_MASK)); + (bi->bi_rw&RWA_MASK), 0); if (sh) { if (unlikely(previous)) { /* expansion might have moved on while waiting for a @@ -3873,7 +3873,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3916,13 +3916,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped raid5_compute_sector(conf, stripe_addr*(new_data_disks), 1, &dd_idx, NULL); last_sector = - raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) + raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) *(new_data_disks) - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0); + sh = get_active_stripe(conf, first_sector, 1, 0, 1); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -4022,9 +4022,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); - sh = get_active_stripe(conf, sector_nr, 0, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -4034,7 +4034,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski * We don't need to check the 'failed' flag as when that gets set, * recovery aborts. */ - for (i=0; i<mddev->raid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].rdev == NULL) still_degraded = 1; @@ -4086,7 +4086,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1); + sh = get_active_stripe(conf, sector, 0, 1, 0); if (!sh) { /* failed to get a stripe - must wait */ diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c index 8247a945a1d9..3b19e0ce290f 100644 --- a/drivers/net/r8169.c +++ b/drivers/net/r8169.c @@ -66,7 +66,6 @@ static const int multicast_filter_limit = 32; #define RX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */ #define TX_DMA_BURST 6 /* Maximum PCI burst, '6' is 1024 */ #define EarlyTxThld 0x3F /* 0x3F means NO early transmit */ -#define RxPacketMaxSize 0x3FE8 /* 16K - 1 - ETH_HLEN - VLAN - CRC... */ #define SafeMtu 0x1c20 /* ... actually life sucks beyond ~7k */ #define InterFrameGap 0x03 /* 3 means InterFrameGap = the shortest one */ @@ -2357,10 +2356,10 @@ static u16 rtl_rw_cpluscmd(void __iomem *ioaddr) return cmd; } -static void rtl_set_rx_max_size(void __iomem *ioaddr) +static void rtl_set_rx_max_size(void __iomem *ioaddr, unsigned int rx_buf_sz) { /* Low hurts. Let's disable the filtering. */ - RTL_W16(RxMaxSize, 16383); + RTL_W16(RxMaxSize, rx_buf_sz); } static void rtl8169_set_magic_reg(void __iomem *ioaddr, unsigned mac_version) @@ -2407,7 +2406,7 @@ static void rtl_hw_start_8169(struct net_device *dev) RTL_W8(EarlyTxThres, EarlyTxThld); - rtl_set_rx_max_size(ioaddr); + rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz); if ((tp->mac_version == RTL_GIGA_MAC_VER_01) || (tp->mac_version == RTL_GIGA_MAC_VER_02) || @@ -2668,7 +2667,7 @@ static void rtl_hw_start_8168(struct net_device *dev) RTL_W8(EarlyTxThres, EarlyTxThld); - rtl_set_rx_max_size(ioaddr); + rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz); tp->cp_cmd |= RTL_R16(CPlusCmd) | PktCntrDisable | INTT_1; @@ -2846,7 +2845,7 @@ static void rtl_hw_start_8101(struct net_device *dev) RTL_W8(EarlyTxThres, EarlyTxThld); - rtl_set_rx_max_size(ioaddr); + rtl_set_rx_max_size(ioaddr, tp->rx_buf_sz); tp->cp_cmd |= rtl_rw_cpluscmd(ioaddr) | PCIMulRW; diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 73348c4047e9..4a9cc92d4d18 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq) } #ifdef CONFIG_SMP -static void iosapic_set_affinity_irq(unsigned int irq, +static int iosapic_set_affinity_irq(unsigned int irq, const struct cpumask *dest) { struct vector_info *vi = iosapic_get_vector(irq); @@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq, dest_cpu = cpu_check_affinity(irq, dest); if (dest_cpu < 0) - return; + return -1; cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu)); vi->txn_addr = txn_affinity_addr(irq, dest_cpu); @@ -724,6 +724,8 @@ static void iosapic_set_affinity_irq(unsigned int irq, iosapic_set_irt_data(vi, &dummy_d0, &d1); iosapic_wr_irt_entry(vi, d0, d1); spin_unlock_irqrestore(&iosapic_lock, flags); + + return 0; } #endif diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c index dd18f857dfb0..42e4260c3b12 100644 --- a/drivers/pci/hotplug/ibmphp_core.c +++ b/drivers/pci/hotplug/ibmphp_core.c @@ -153,45 +153,47 @@ int ibmphp_init_devno(struct slot **cur_slot) return -1; } for (loop = 0; loop < len; loop++) { - if ((*cur_slot)->number == rtable->slots[loop].slot) { - if ((*cur_slot)->bus == rtable->slots[loop].bus) { + if ((*cur_slot)->number == rtable->slots[loop].slot && + (*cur_slot)->bus == rtable->slots[loop].bus) { + struct io_apic_irq_attr irq_attr; + (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn); for (i = 0; i < 4; i++) (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus, - (int) (*cur_slot)->device, i); - - debug("(*cur_slot)->irq[0] = %x\n", - (*cur_slot)->irq[0]); - debug("(*cur_slot)->irq[1] = %x\n", - (*cur_slot)->irq[1]); - debug("(*cur_slot)->irq[2] = %x\n", - (*cur_slot)->irq[2]); - debug("(*cur_slot)->irq[3] = %x\n", - (*cur_slot)->irq[3]); - - debug("rtable->exlusive_irqs = %x\n", + (int) (*cur_slot)->device, i, + &irq_attr); + + debug("(*cur_slot)->irq[0] = %x\n", + (*cur_slot)->irq[0]); + debug("(*cur_slot)->irq[1] = %x\n", + (*cur_slot)->irq[1]); + debug("(*cur_slot)->irq[2] = %x\n", + (*cur_slot)->irq[2]); + debug("(*cur_slot)->irq[3] = %x\n", + (*cur_slot)->irq[3]); + + debug("rtable->exlusive_irqs = %x\n", rtable->exclusive_irqs); - debug("rtable->slots[loop].irq[0].bitmap = %x\n", + debug("rtable->slots[loop].irq[0].bitmap = %x\n", rtable->slots[loop].irq[0].bitmap); - debug("rtable->slots[loop].irq[1].bitmap = %x\n", + debug("rtable->slots[loop].irq[1].bitmap = %x\n", rtable->slots[loop].irq[1].bitmap); - debug("rtable->slots[loop].irq[2].bitmap = %x\n", + debug("rtable->slots[loop].irq[2].bitmap = %x\n", rtable->slots[loop].irq[2].bitmap); - debug("rtable->slots[loop].irq[3].bitmap = %x\n", + debug("rtable->slots[loop].irq[3].bitmap = %x\n", rtable->slots[loop].irq[3].bitmap); - debug("rtable->slots[loop].irq[0].link = %x\n", + debug("rtable->slots[loop].irq[0].link = %x\n", rtable->slots[loop].irq[0].link); - debug("rtable->slots[loop].irq[1].link = %x\n", + debug("rtable->slots[loop].irq[1].link = %x\n", rtable->slots[loop].irq[1].link); - debug("rtable->slots[loop].irq[2].link = %x\n", + debug("rtable->slots[loop].irq[2].link = %x\n", rtable->slots[loop].irq[2].link); - debug("rtable->slots[loop].irq[3].link = %x\n", + debug("rtable->slots[loop].irq[3].link = %x\n", rtable->slots[loop].irq[3].link); - debug("end of init_devno\n"); - kfree(rtable); - return 0; - } + debug("end of init_devno\n"); + kfree(rtable); + return 0; } } diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 6808d8333ecc..737a1c44b07a 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -98,6 +98,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) int max_irq; int pos; int irq; + int node; pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); if (!pos) @@ -125,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; - irq = create_irq(); + node = dev_to_node(&dev->dev); + irq = create_irq_nr(0, node); if (irq <= 0) { kfree(cfg); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index a563fbe559d0..cd389162735f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1972,15 +1972,6 @@ static int __init init_dmars(void) } } -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) { - ret = enable_intr_remapping(0); - if (ret) - printk(KERN_ERR - "IOMMU: enable interrupt remapping failed\n"); - } -#endif - /* * For each rmrr * for each dev attached to rmrr diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index f5e0ea724a6f..3a0cb0bb0593 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -15,6 +15,14 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; static int ir_ioapic_num; int intr_remapping_enabled; +static int disable_intremap; +static __init int setup_nointremap(char *str) +{ + disable_intremap = 1; + return 0; +} +early_param("nointremap", setup_nointremap); + struct irq_2_iommu { struct intel_iommu *iommu; u16 irte_index; @@ -23,15 +31,12 @@ struct irq_2_iommu { }; #ifdef CONFIG_GENERIC_HARDIRQS -static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) +static struct irq_2_iommu *get_one_free_irq_2_iommu(int node) { struct irq_2_iommu *iommu; - int node; - - node = cpu_to_node(cpu); iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node); - printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node); + printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node); return iommu; } @@ -48,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq) return desc->irq_2_iommu; } -static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) +static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; struct irq_2_iommu *irq_iommu; @@ -56,7 +61,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) /* * alloc irq desc if not allocated already. */ - desc = irq_to_desc_alloc_cpu(irq, cpu); + desc = irq_to_desc_alloc_node(irq, node); if (!desc) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); return NULL; @@ -65,14 +70,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) irq_iommu = desc->irq_2_iommu; if (!irq_iommu) - desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu); + desc->irq_2_iommu = get_one_free_irq_2_iommu(node); return desc->irq_2_iommu; } static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { - return irq_2_iommu_alloc_cpu(irq, boot_cpu_id); + return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id)); } #else /* !CONFIG_SPARSE_IRQ */ @@ -423,20 +428,6 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) readl, (sts & DMA_GSTS_IRTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); - if (mode == 0) { - spin_lock_irqsave(&iommu->register_lock, flags); - - /* enable comaptiblity format interrupt pass through */ - cmd = iommu->gcmd | DMA_GCMD_CFI; - iommu->gcmd |= DMA_GCMD_CFI; - writel(cmd, iommu->reg + DMAR_GCMD_REG); - - IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_CFIS), sts); - - spin_unlock_irqrestore(&iommu->register_lock, flags); - } - /* * global invalidation of interrupt entry cache before enabling * interrupt-remapping. @@ -516,6 +507,23 @@ end: spin_unlock_irqrestore(&iommu->register_lock, flags); } +int __init intr_remapping_supported(void) +{ + struct dmar_drhd_unit *drhd; + + if (disable_intremap) + return 0; + + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + if (!ecap_ir_support(iommu->ecap)) + return 0; + } + + return 1; +} + int __init enable_intr_remapping(int eim) { struct dmar_drhd_unit *drhd; diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index adf17856bacc..7f207f335bec 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev, } flags = irq_flags(triggering, polarity, shareable); - irq = acpi_register_gsi(gsi, triggering, polarity); + irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity); if (irq >= 0) pcibios_penalize_isa_irq(irq, 1); else diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 8ac9cddac575..cab100acf983 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES secure, but slightly less efficient. If in doubt, say yes. +config XEN_DEV_EVTCHN + tristate "Xen /dev/xen/evtchn device" + depends on XEN + default y + help + The evtchn driver allows a userspace process to triger event + channels and to receive notification of an event channel + firing. + If in doubt, say yes. + config XENFS tristate "Xen filesystem" depends on XEN @@ -41,3 +51,13 @@ config XEN_COMPAT_XENFS a xen platform. If in doubt, say yes. +config XEN_SYS_HYPERVISOR + bool "Create xen entries under /sys/hypervisor" + depends on XEN && SYSFS + select SYS_HYPERVISOR + default y + help + Create entries under /sys/hypervisor describing the Xen + hypervisor environment. When running native or in another + virtual environment, /sys/hypervisor will still be present, + but will have no xen contents.
\ No newline at end of file diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index ff8accc9e103..ec2a39b1e26f 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -4,4 +4,6 @@ obj-y += xenbus/ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o -obj-$(CONFIG_XENFS) += xenfs/
\ No newline at end of file +obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o +obj-$(CONFIG_XENFS) += xenfs/ +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 30963af5dba0..891d2e90753a 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq) return info_for_irq(irq)->evtchn; } +unsigned irq_from_evtchn(unsigned int evtchn) +{ + return evtchn_to_irq[evtchn]; +} +EXPORT_SYMBOL_GPL(irq_from_evtchn); + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -335,7 +341,7 @@ static int find_unbound_irq(void) if (irq == nr_irqs) panic("No available IRQ to bind to: increase nr_irqs!\n"); - desc = irq_to_desc_alloc_cpu(irq, 0); + desc = irq_to_desc_alloc_node(irq, 0); if (WARN_ON(desc == NULL)) return -1; @@ -688,13 +694,13 @@ void rebind_evtchn_irq(int evtchn, int irq) } /* Rebind an evtchn so that it gets delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) - return; + return -1; /* Send future instances of this interrupt to other vcpu. */ bind_vcpu.port = evtchn; @@ -707,13 +713,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) */ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); -} + return 0; +} -static void set_affinity_irq(unsigned irq, const struct cpumask *dest) +static int set_affinity_irq(unsigned irq, const struct cpumask *dest) { unsigned tcpu = cpumask_first(dest); - rebind_irq_to_cpu(irq, tcpu); + + return rebind_irq_to_cpu(irq, tcpu); } int resend_irq_on_evtchn(unsigned int irq) diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c new file mode 100644 index 000000000000..af031950f9b1 --- /dev/null +++ b/drivers/xen/evtchn.c @@ -0,0 +1,507 @@ +/****************************************************************************** + * evtchn.c + * + * Driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004-2005, K A Fraser + * Multi-process extensions Copyright (c) 2004, Steven Smith + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/errno.h> +#include <linux/miscdevice.h> +#include <linux/major.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/poll.h> +#include <linux/irq.h> +#include <linux/init.h> +#include <linux/gfp.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <xen/events.h> +#include <xen/evtchn.h> +#include <asm/xen/hypervisor.h> + +struct per_user_data { + struct mutex bind_mutex; /* serialize bind/unbind operations */ + + /* Notification ring, accessed via /dev/xen/evtchn. */ +#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + evtchn_port_t *ring; + unsigned int ring_cons, ring_prod, ring_overflow; + struct mutex ring_cons_mutex; /* protect against concurrent readers */ + + /* Processes wait on this queue when ring is empty. */ + wait_queue_head_t evtchn_wait; + struct fasync_struct *evtchn_async_queue; + const char *name; +}; + +/* Who's bound to each port? */ +static struct per_user_data *port_user[NR_EVENT_CHANNELS]; +static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ + +irqreturn_t evtchn_interrupt(int irq, void *data) +{ + unsigned int port = (unsigned long)data; + struct per_user_data *u; + + spin_lock(&port_user_lock); + + u = port_user[port]; + + disable_irq_nosync(irq); + + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + wmb(); /* Ensure ring contents visible */ + if (u->ring_cons == u->ring_prod++) { + wake_up_interruptible(&u->evtchn_wait); + kill_fasync(&u->evtchn_async_queue, + SIGIO, POLL_IN); + } + } else { + u->ring_overflow = 1; + } + + spin_unlock(&port_user_lock); + + return IRQ_HANDLED; +} + +static ssize_t evtchn_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + struct per_user_data *u = file->private_data; + + /* Whole number of ports. */ + count &= ~(sizeof(evtchn_port_t)-1); + + if (count == 0) + return 0; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + for (;;) { + mutex_lock(&u->ring_cons_mutex); + + rc = -EFBIG; + if (u->ring_overflow) + goto unlock_out; + + c = u->ring_cons; + p = u->ring_prod; + if (c != p) + break; + + mutex_unlock(&u->ring_cons_mutex); + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + rc = wait_event_interruptible(u->evtchn_wait, + u->ring_cons != u->ring_prod); + if (rc) + return rc; + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + sizeof(evtchn_port_t); + bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); + } else { + bytes1 = (p - c) * sizeof(evtchn_port_t); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if (bytes1 > count) { + bytes1 = count; + bytes2 = 0; + } else if ((bytes1 + bytes2) > count) { + bytes2 = count - bytes1; + } + + rc = -EFAULT; + rmb(); /* Ensure that we see the port before we copy it. */ + if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + ((bytes2 != 0) && + copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) + goto unlock_out; + + u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); + rc = bytes1 + bytes2; + + unlock_out: + mutex_unlock(&u->ring_cons_mutex); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + struct per_user_data *u = file->private_data; + + if (kbuf == NULL) + return -ENOMEM; + + /* Whole number of ports. */ + count &= ~(sizeof(evtchn_port_t)-1); + + rc = 0; + if (count == 0) + goto out; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + rc = -EFAULT; + if (copy_from_user(kbuf, buf, count) != 0) + goto out; + + spin_lock_irq(&port_user_lock); + for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) + if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) + enable_irq(irq_from_evtchn(kbuf[i])); + spin_unlock_irq(&port_user_lock); + + rc = count; + + out: + free_page((unsigned long)kbuf); + return rc; +} + +static int evtchn_bind_to_user(struct per_user_data *u, int port) +{ + int rc = 0; + + /* + * Ports are never reused, so every caller should pass in a + * unique port. + * + * (Locking not necessary because we haven't registered the + * interrupt handler yet, and our caller has already + * serialized bind operations.) + */ + BUG_ON(port_user[port] != NULL); + port_user[port] = u; + + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + u->name, (void *)(unsigned long)port); + if (rc >= 0) + rc = 0; + + return rc; +} + +static void evtchn_unbind_from_user(struct per_user_data *u, int port) +{ + int irq = irq_from_evtchn(port); + + unbind_from_irqhandler(irq, (void *)(unsigned long)port); + + /* make sure we unbind the irq handler before clearing the port */ + barrier(); + + port_user[port] = NULL; +} + +static long evtchn_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc; + struct per_user_data *u = file->private_data; + void __user *uarg = (void __user *) arg; + + /* Prevent bind from racing with unbind */ + mutex_lock(&u->bind_mutex); + + switch (cmd) { + case IOCTL_EVTCHN_BIND_VIRQ: { + struct ioctl_evtchn_bind_virq bind; + struct evtchn_bind_virq bind_virq; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + bind_virq.virq = bind.virq; + bind_virq.vcpu = 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_virq.port); + if (rc == 0) + rc = bind_virq.port; + break; + } + + case IOCTL_EVTCHN_BIND_INTERDOMAIN: { + struct ioctl_evtchn_bind_interdomain bind; + struct evtchn_bind_interdomain bind_interdomain; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + bind_interdomain.remote_dom = bind.remote_domain; + bind_interdomain.remote_port = bind.remote_port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_interdomain.local_port); + if (rc == 0) + rc = bind_interdomain.local_port; + break; + } + + case IOCTL_EVTCHN_BIND_UNBOUND_PORT: { + struct ioctl_evtchn_bind_unbound_port bind; + struct evtchn_alloc_unbound alloc_unbound; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = bind.remote_domain; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, alloc_unbound.port); + if (rc == 0) + rc = alloc_unbound.port; + break; + } + + case IOCTL_EVTCHN_UNBIND: { + struct ioctl_evtchn_unbind unbind; + + rc = -EFAULT; + if (copy_from_user(&unbind, uarg, sizeof(unbind))) + break; + + rc = -EINVAL; + if (unbind.port >= NR_EVENT_CHANNELS) + break; + + spin_lock_irq(&port_user_lock); + + rc = -ENOTCONN; + if (port_user[unbind.port] != u) { + spin_unlock_irq(&port_user_lock); + break; + } + + evtchn_unbind_from_user(u, unbind.port); + + spin_unlock_irq(&port_user_lock); + + rc = 0; + break; + } + + case IOCTL_EVTCHN_NOTIFY: { + struct ioctl_evtchn_notify notify; + + rc = -EFAULT; + if (copy_from_user(¬ify, uarg, sizeof(notify))) + break; + + if (notify.port >= NR_EVENT_CHANNELS) { + rc = -EINVAL; + } else if (port_user[notify.port] != u) { + rc = -ENOTCONN; + } else { + notify_remote_via_evtchn(notify.port); + rc = 0; + } + break; + } + + case IOCTL_EVTCHN_RESET: { + /* Initialise the ring to empty. Clear errors. */ + mutex_lock(&u->ring_cons_mutex); + spin_lock_irq(&port_user_lock); + u->ring_cons = u->ring_prod = u->ring_overflow = 0; + spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->ring_cons_mutex); + rc = 0; + break; + } + + default: + rc = -ENOSYS; + break; + } + mutex_unlock(&u->bind_mutex); + + return rc; +} + +static unsigned int evtchn_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = POLLOUT | POLLWRNORM; + struct per_user_data *u = file->private_data; + + poll_wait(file, &u->evtchn_wait, wait); + if (u->ring_cons != u->ring_prod) + mask |= POLLIN | POLLRDNORM; + if (u->ring_overflow) + mask = POLLERR; + return mask; +} + +static int evtchn_fasync(int fd, struct file *filp, int on) +{ + struct per_user_data *u = filp->private_data; + return fasync_helper(fd, filp, on, &u->evtchn_async_queue); +} + +static int evtchn_open(struct inode *inode, struct file *filp) +{ + struct per_user_data *u; + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (u == NULL) + return -ENOMEM; + + u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm); + if (u->name == NULL) { + kfree(u); + return -ENOMEM; + } + + init_waitqueue_head(&u->evtchn_wait); + + u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + if (u->ring == NULL) { + kfree(u->name); + kfree(u); + return -ENOMEM; + } + + mutex_init(&u->bind_mutex); + mutex_init(&u->ring_cons_mutex); + + filp->private_data = u; + + return 0; +} + +static int evtchn_release(struct inode *inode, struct file *filp) +{ + int i; + struct per_user_data *u = filp->private_data; + + spin_lock_irq(&port_user_lock); + + free_page((unsigned long)u->ring); + + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + if (port_user[i] != u) + continue; + + evtchn_unbind_from_user(port_user[i], i); + } + + spin_unlock_irq(&port_user_lock); + + kfree(u->name); + kfree(u); + + return 0; +} + +static const struct file_operations evtchn_fops = { + .owner = THIS_MODULE, + .read = evtchn_read, + .write = evtchn_write, + .unlocked_ioctl = evtchn_ioctl, + .poll = evtchn_poll, + .fasync = evtchn_fasync, + .open = evtchn_open, + .release = evtchn_release, +}; + +static struct miscdevice evtchn_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "evtchn", + .fops = &evtchn_fops, +}; +static int __init evtchn_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + spin_lock_init(&port_user_lock); + memset(port_user, 0, sizeof(port_user)); + + /* Create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); + if (err != 0) { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + printk(KERN_INFO "Event-channel device installed.\n"); + + return 0; +} + +static void __exit evtchn_cleanup(void) +{ + misc_deregister(&evtchn_miscdev); +} + +module_init(evtchn_init); +module_exit(evtchn_cleanup); + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 4b5b84837ee1..fddc2025dece 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -98,9 +98,8 @@ static void do_suspend(void) goto out; } - printk("suspending xenbus...\n"); - /* XXX use normal device tree? */ - xenbus_suspend(); + printk(KERN_DEBUG "suspending xenstore...\n"); + xs_suspend(); err = device_power_down(PMSG_SUSPEND); if (err) { @@ -116,9 +115,9 @@ static void do_suspend(void) if (!cancelled) { xen_arch_resume(); - xenbus_resume(); + xs_resume(); } else - xenbus_suspend_cancel(); + xs_suspend_cancel(); device_power_up(PMSG_RESUME); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c new file mode 100644 index 000000000000..88a60e03ccf0 --- /dev/null +++ b/drivers/xen/sys-hypervisor.c @@ -0,0 +1,445 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day <ncmike@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kobject.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/xenbus.h> +#include <xen/interface/xen.h> +#include <xen/interface/version.h> + +#define HYPERVISOR_ATTR_RO(_name) \ +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) + +#define HYPERVISOR_ATTR_RW(_name) \ +static struct hyp_sysfs_attr _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +struct hyp_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct hyp_sysfs_attr *, char *); + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); + void *hyp_attr_data; +}; + +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + return sprintf(buffer, "xen\n"); +} + +HYPERVISOR_ATTR_RO(type); + +static int __init xen_sysfs_type_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &type_attr.attr); +} + +static void xen_sysfs_type_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &type_attr.attr); +} + +/* xen version attributes */ +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version >> 16); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(major); + +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version & 0xff); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(minor); + +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *extra; + + extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL); + if (extra) { + ret = HYPERVISOR_xen_version(XENVER_extraversion, extra); + if (!ret) + ret = sprintf(buffer, "%s\n", extra); + kfree(extra); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(extra); + +static struct attribute *version_attrs[] = { + &major_attr.attr, + &minor_attr.attr, + &extra_attr.attr, + NULL +}; + +static struct attribute_group version_group = { + .name = "version", + .attrs = version_attrs, +}; + +static int __init xen_sysfs_version_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &version_group); +} + +static void xen_sysfs_version_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &version_group); +} + +/* UUID */ + +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + char *vm, *val; + int ret; + extern int xenstored_ready; + + if (!xenstored_ready) + return -EBUSY; + + vm = xenbus_read(XBT_NIL, "vm", "", NULL); + if (IS_ERR(vm)) + return PTR_ERR(vm); + val = xenbus_read(XBT_NIL, vm, "uuid", NULL); + kfree(vm); + if (IS_ERR(val)) + return PTR_ERR(val); + ret = sprintf(buffer, "%s\n", val); + kfree(val); + return ret; +} + +HYPERVISOR_ATTR_RO(uuid); + +static int __init xen_sysfs_uuid_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr); +} + +static void xen_sysfs_uuid_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); +} + +/* xen compilation attributes */ + +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compiler); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiler); + +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_by); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiled_by); + +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_date); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compile_date); + +static struct attribute *xen_compile_attrs[] = { + &compiler_attr.attr, + &compiled_by_attr.attr, + &compile_date_attr.attr, + NULL +}; + +static struct attribute_group xen_compilation_group = { + .name = "compilation", + .attrs = xen_compile_attrs, +}; + +int __init static xen_compilation_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); +} + +static void xen_compilation_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); +} + +/* xen properties info */ + +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *caps; + + caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL); + if (caps) { + ret = HYPERVISOR_xen_version(XENVER_capabilities, caps); + if (!ret) + ret = sprintf(buffer, "%s\n", caps); + kfree(caps); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(capabilities); + +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *cset; + + cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL); + if (cset) { + ret = HYPERVISOR_xen_version(XENVER_changeset, cset); + if (!ret) + ret = sprintf(buffer, "%s\n", cset); + kfree(cset); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(changeset); + +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_platform_parameters *parms; + + parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL); + if (parms) { + ret = HYPERVISOR_xen_version(XENVER_platform_parameters, + parms); + if (!ret) + ret = sprintf(buffer, "%lx\n", parms->virt_start); + kfree(parms); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(virtual_start); + +static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + + ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL); + if (ret > 0) + ret = sprintf(buffer, "%x\n", ret); + + return ret; +} + +HYPERVISOR_ATTR_RO(pagesize); + +static ssize_t xen_feature_show(int index, char *buffer) +{ + ssize_t ret; + struct xen_feature_info info; + + info.submap_idx = index; + ret = HYPERVISOR_xen_version(XENVER_get_features, &info); + if (!ret) + ret = sprintf(buffer, "%08x", info.submap); + + return ret; +} + +static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + ssize_t len; + int i; + + len = 0; + for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) { + int ret = xen_feature_show(i, buffer + len); + if (ret < 0) { + if (len == 0) + len = ret; + break; + } + len += ret; + } + if (len > 0) + buffer[len++] = '\n'; + + return len; +} + +HYPERVISOR_ATTR_RO(features); + +static struct attribute *xen_properties_attrs[] = { + &capabilities_attr.attr, + &changeset_attr.attr, + &virtual_start_attr.attr, + &pagesize_attr.attr, + &features_attr.attr, + NULL +}; + +static struct attribute_group xen_properties_group = { + .name = "properties", + .attrs = xen_properties_attrs, +}; + +static int __init xen_properties_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_properties_group); +} + +static void xen_properties_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); +} + +static int __init hyper_sysfs_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + ret = xen_sysfs_type_init(); + if (ret) + goto out; + ret = xen_sysfs_version_init(); + if (ret) + goto version_out; + ret = xen_compilation_init(); + if (ret) + goto comp_out; + ret = xen_sysfs_uuid_init(); + if (ret) + goto uuid_out; + ret = xen_properties_init(); + if (ret) + goto prop_out; + + goto out; + +prop_out: + xen_sysfs_uuid_destroy(); +uuid_out: + xen_compilation_destroy(); +comp_out: + xen_sysfs_version_destroy(); +version_out: + xen_sysfs_type_destroy(); +out: + return ret; +} + +static void __exit hyper_sysfs_exit(void) +{ + xen_properties_destroy(); + xen_compilation_destroy(); + xen_sysfs_uuid_destroy(); + xen_sysfs_version_destroy(); + xen_sysfs_type_destroy(); + +} +module_init(hyper_sysfs_init); +module_exit(hyper_sysfs_exit); + +static ssize_t hyp_sysfs_show(struct kobject *kobj, + struct attribute *attr, + char *buffer) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->show) + return hyp_attr->show(hyp_attr, buffer); + return 0; +} + +static ssize_t hyp_sysfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t len) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->store) + return hyp_attr->store(hyp_attr, buffer, len); + return 0; +} + +static struct sysfs_ops hyp_sysfs_ops = { + .show = hyp_sysfs_show, + .store = hyp_sysfs_store, +}; + +static struct kobj_type hyp_sysfs_kobj_type = { + .sysfs_ops = &hyp_sysfs_ops, +}; + +static int __init hypervisor_subsys_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type; + return 0; +} +device_initcall(hypervisor_subsys_init); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 773d1cf23283..d42e25d5968d 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name); static void xenbus_dev_shutdown(struct device *_dev); +static int xenbus_dev_suspend(struct device *dev, pm_message_t state); +static int xenbus_dev_resume(struct device *dev); + /* If something in array of ids matches this device, return it. */ static const struct xenbus_device_id * match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) @@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = { .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, .dev_attrs = xenbus_dev_attrs, + + .suspend = xenbus_dev_suspend, + .resume = xenbus_dev_resume, }, }; @@ -654,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) kfree(root); } +EXPORT_SYMBOL_GPL(xenbus_dev_changed); static void frontend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len) @@ -669,7 +676,7 @@ static struct xenbus_watch fe_watch = { .callback = frontend_changed, }; -static int suspend_dev(struct device *dev, void *data) +static int xenbus_dev_suspend(struct device *dev, pm_message_t state) { int err = 0; struct xenbus_driver *drv; @@ -682,35 +689,14 @@ static int suspend_dev(struct device *dev, void *data) drv = to_xenbus_driver(dev->driver); xdev = container_of(dev, struct xenbus_device, dev); if (drv->suspend) - err = drv->suspend(xdev); + err = drv->suspend(xdev, state); if (err) printk(KERN_WARNING "xenbus: suspend %s failed: %i\n", dev_name(dev), err); return 0; } -static int suspend_cancel_dev(struct device *dev, void *data) -{ - int err = 0; - struct xenbus_driver *drv; - struct xenbus_device *xdev; - - DPRINTK(""); - - if (dev->driver == NULL) - return 0; - drv = to_xenbus_driver(dev->driver); - xdev = container_of(dev, struct xenbus_device, dev); - if (drv->suspend_cancel) - err = drv->suspend_cancel(xdev); - if (err) - printk(KERN_WARNING - "xenbus: suspend_cancel %s failed: %i\n", - dev_name(dev), err); - return 0; -} - -static int resume_dev(struct device *dev, void *data) +static int xenbus_dev_resume(struct device *dev) { int err; struct xenbus_driver *drv; @@ -755,33 +741,6 @@ static int resume_dev(struct device *dev, void *data) return 0; } -void xenbus_suspend(void) -{ - DPRINTK(""); - - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); - xenbus_backend_suspend(suspend_dev); - xs_suspend(); -} -EXPORT_SYMBOL_GPL(xenbus_suspend); - -void xenbus_resume(void) -{ - xb_init_comms(); - xs_resume(); - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); - xenbus_backend_resume(resume_dev); -} -EXPORT_SYMBOL_GPL(xenbus_resume); - -void xenbus_suspend_cancel(void) -{ - xs_suspend_cancel(); - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev); - xenbus_backend_resume(suspend_cancel_dev); -} -EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); - /* A flag to determine if xenstored is 'ready' (i.e. has started) */ int xenstored_ready = 0; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index e325eab4724d..eab33f1dbdf7 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -673,6 +673,8 @@ void xs_resume(void) struct xenbus_watch *watch; char token[sizeof(watch) * 2 + 1]; + xb_init_comms(); + mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); up_write(&xs_state.transaction_mutex); diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 515741a8e6b8..6559e0c752ce 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -20,10 +20,27 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); +static ssize_t capabilities_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + char *tmp = ""; + + if (xen_initial_domain()) + tmp = "control_d\n"; + + return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp)); +} + +static const struct file_operations capabilities_file_ops = { + .read = capabilities_read, +}; + static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr xenfs_files[] = { - [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR}, + [1] = {}, + { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, {""}, }; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index eeb246845909..2341375386f8 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { /* - * If the dentry isn't hashed just go ahead and try the - * mount again with a new wait (not much else we can do). - */ - if (!d_unhashed(dentry)) { - /* - * But if the dentry is hashed, that means that we - * got here through the revalidate path. Thus, we - * need to check if the dentry has been mounted - * while we waited on the wq_mutex. If it has, - * simply return success. - */ - if (d_mountpoint(dentry)) - return 0; - } + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at it's base) we can + * continue on and create a new request. + */ + if (have_submounts(dentry)) + return 0; } return 1; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 06560c520f49..618e21c0b7a3 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -241,7 +241,7 @@ write_out_data: spin_lock(&journal->j_list_lock); } /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) + if (!buffer_jbd(bh) || bh2jh(bh) != jh || jh->b_transaction != commit_transaction || jh->b_jlist != BJ_SyncData) { jbd_unlock_bh_state(bh); @@ -478,7 +478,9 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_list_lock); continue; } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) { __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bca39cf99ee..1afa4dd4cae2 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -12,20 +12,14 @@ static int loadavg_proc_show(struct seq_file *m, void *v) { - int a, b, c; - unsigned long seq; + unsigned long avnrun[3]; - do { - seq = read_seqbegin(&xtime_lock); - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - } while (read_seqretry(&xtime_lock, seq)); + get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, task_active_pid_ns(current)->last_pid); return 0; diff --git a/include/Kbuild b/include/Kbuild index d8c3e3cbf416..fe36accd4328 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -8,3 +8,4 @@ header-y += mtd/ header-y += rdma/ header-y += video/ header-y += drm/ +header-y += xen/ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8e6d0ca70aba..e410f602cab1 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #endif /* - * A facility to provide batching of the reload of page tables with the - * actual context switch code for paravirtualized guests. By convention, - * only one of the lazy modes (CPU, MMU) should be active at any given - * time, entry should never be nested, and entry and exits should always - * be paired. This is for sanity of maintaining and reasoning about the - * kernel code. + * A facility to provide batching of the reload of page tables and + * other process state with the actual context switch code for + * paravirtualized guests. By convention, only one of the batched + * update (lazy) modes (CPU, MMU) should be active at any given time, + * entry should never be nested, and entry and exits should always be + * paired. This is for sanity of maintaining and reasoning about the + * kernel code. In this case, the exit (end of the context switch) is + * in architecture-specific code, and so doesn't need a generic + * definition. */ -#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE -#define arch_enter_lazy_cpu_mode() do {} while (0) -#define arch_leave_lazy_cpu_mode() do {} while (0) -#define arch_flush_lazy_cpu_mode() do {} while (0) +#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH +#define arch_start_context_switch(prev) do {} while (0) #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 88be890ee3c7..51b4b0a5ce8c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num; extern int sbf_port; extern unsigned long acpi_realmode_flags; -int acpi_register_gsi (u32 gsi, int triggering, int polarity); +int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); #ifdef CONFIG_X86_IO_APIC diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9f315382610b..c5ac87ca7bc6 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1022,6 +1022,8 @@ typedef struct cpumask *cpumask_var_t; bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); +bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); +bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); @@ -1040,6 +1042,19 @@ static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, return true; } +static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + cpumask_clear(*mask); + return true; +} + +static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, + int node) +{ + cpumask_clear(*mask); + return true; +} + static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 28d53cb7b5a2..171ad8aedc83 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus); extern void dma_debug_init(u32 num_entries); +extern int dma_debug_resize_entries(u32 num_entries); + extern void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, @@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries) { } +static inline int dma_debug_resize_entries(u32 num_entries) +{ + return 0; +} + static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, diff --git a/include/linux/dmar.h b/include/linux/dmar.h index e397dc342cda..10ff5c498824 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -108,6 +108,7 @@ struct irte { }; #ifdef CONFIG_INTR_REMAP extern int intr_remapping_enabled; +extern int intr_remapping_supported(void); extern int enable_intr_remapping(int); extern void disable_intr_remapping(void); extern int reenable_intr_remapping(int); @@ -157,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic) } #define irq_remapped(irq) (0) #define enable_intr_remapping(mode) (-1) +#define disable_intr_remapping() (0) +#define reenable_intr_remapping(mode) (0) #define intr_remapping_enabled (0) #endif diff --git a/include/linux/futex.h b/include/linux/futex.h index 3bf5bb5a34f9..34956c8fdebf 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -23,6 +23,8 @@ union ktime; #define FUTEX_TRYLOCK_PI 8 #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -38,6 +40,10 @@ union ktime; #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) /* * Support for robust futexes: the kernel cleans up held futexes at diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 91bb76f44f14..ff374ceface0 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -566,6 +566,6 @@ struct irq_desc; extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); -extern int arch_init_chip_data(struct irq_desc *desc, int cpu); +extern int arch_init_chip_data(struct irq_desc *desc, int node); #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index b7cbeed972e4..eedbb8e5e0cc 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -117,7 +117,7 @@ struct irq_chip { void (*eoi)(unsigned int irq); void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, + int (*set_affinity)(unsigned int irq, const struct cpumask *dest); int (*retrigger)(unsigned int irq); int (*set_type)(unsigned int irq, unsigned int flow_type); @@ -187,7 +187,7 @@ struct irq_desc { spinlock_t lock; #ifdef CONFIG_SMP cpumask_var_t affinity; - unsigned int cpu; + unsigned int node; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif @@ -201,26 +201,23 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; extern void arch_init_copy_chip_data(struct irq_desc *old_desc, - struct irq_desc *desc, int cpu); + struct irq_desc *desc, int node); extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); #ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; -#else /* CONFIG_SPARSE_IRQ */ -extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); -#endif /* CONFIG_SPARSE_IRQ */ - -extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +#endif -static inline struct irq_desc * -irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) -{ -#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC - return irq_to_desc(irq); +#ifdef CONFIG_NUMA_IRQ_DESC +extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); #else +static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) +{ return desc; -#endif } +#endif + +extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); /* * Migration helpers for obsolete names, they will go away: @@ -386,7 +383,7 @@ extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); /* Handle dynamic irq creation and destruction */ -extern unsigned int create_irq_nr(unsigned int irq_want); +extern unsigned int create_irq_nr(unsigned int irq_want, int node); extern int create_irq(void); extern void destroy_irq(unsigned int irq); @@ -424,47 +421,48 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #ifdef CONFIG_SMP /** - * init_alloc_desc_masks - allocate cpumasks for irq_desc + * alloc_desc_masks - allocate cpumasks for irq_desc * @desc: pointer to irq_desc struct * @cpu: cpu which will be handling the cpumasks * @boot: true if need bootmem * * Allocates affinity and pending_mask cpumask if required. * Returns true if successful (or not required). - * Side effect: affinity has all bits set, pending_mask has all bits clear. */ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, bool boot) { - int node; - +#ifdef CONFIG_CPUMASK_OFFSTACK if (boot) { alloc_bootmem_cpumask_var(&desc->affinity); - cpumask_setall(desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ alloc_bootmem_cpumask_var(&desc->pending_mask); - cpumask_clear(desc->pending_mask); #endif return true; } - node = cpu_to_node(cpu); - if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) return false; - cpumask_setall(desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { free_cpumask_var(desc->affinity); return false; } - cpumask_clear(desc->pending_mask); +#endif #endif return true; } +static inline void init_desc_masks(struct irq_desc *desc) +{ + cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + /** * init_copy_desc_masks - copy cpumasks for irq_desc * @old_desc: pointer to old irq_desc struct @@ -478,7 +476,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, static inline void init_copy_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { -#ifdef CONFIG_CPUMASKS_OFFSTACK +#ifdef CONFIG_CPUMASK_OFFSTACK cpumask_copy(new_desc->affinity, old_desc->affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -499,12 +497,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc, #else /* !CONFIG_SMP */ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, +static inline bool alloc_desc_masks(struct irq_desc *desc, int node, bool boot) { return true; } +static inline void init_desc_masks(struct irq_desc *desc) +{ +} + static inline void init_copy_desc_masks(struct irq_desc *old_desc, struct irq_desc *new_desc) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 0c21af6abffb..5528ff32512e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1029,8 +1029,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn); extern void remove_all_active_ranges(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab8..878cab4f5fcc 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -150,5 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 5932ace22400..ff687281f233 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -116,6 +116,7 @@ struct fs_struct; * 11 bit fractions. */ extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ @@ -135,8 +136,8 @@ DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); -extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern void calc_global_load(void); extern unsigned long get_parent_ip(unsigned long addr); @@ -838,7 +839,17 @@ struct sched_group { */ u32 reciprocal_cpu_power; - unsigned long cpumask[]; + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_group' in kernel/sched.c) + */ + unsigned long cpumask[0]; }; static inline struct cpumask *sched_group_cpus(struct sched_group *sg) @@ -924,8 +935,17 @@ struct sched_domain { char *name; #endif - /* span of all CPUs in this domain */ - unsigned long span[]; + /* + * Span of all CPUs in this domain. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_domain' in kernel/sched.c) + */ + unsigned long span[0]; }; static inline struct cpumask *sched_domain_span(struct sched_domain *sd) diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 938234c4a996..d4841ed8215b 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) #define __raw_spin_is_locked(lock) ((void)(lock), 0) /* for sched.c and kernel_lock.c: */ # define __raw_spin_lock(lock) do { (void)(lock); } while (0) +# define __raw_spin_lock_flags(lock, flags) do { (void)(lock); } while (0) # define __raw_spin_unlock(lock) do { (void)(lock); } while (0) # define __raw_spin_trylock(lock) ({ (void)(lock); 1; }) #endif /* DEBUG_SPINLOCK */ diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ac9ff54f7cb3..cb1a6631b8f4 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t address); -extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); +extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, + dma_addr_t address); extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e6b820f8b56b..a8cc4e13434c 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -21,13 +21,14 @@ struct restart_block { struct { unsigned long arg0, arg1, arg2, arg3; }; - /* For futex_wait */ + /* For futex_wait and futex_wait_requeue_pi */ struct { u32 *uaddr; u32 val; u32 flags; u32 bitset; u64 time; + u32 *uaddr2; } futex; /* For nanosleep */ struct { diff --git a/include/linux/wait.h b/include/linux/wait.h index bc024632f365..6788e1a4d4ca 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, list_del(&old->task_list); } -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, diff --git a/include/xen/Kbuild b/include/xen/Kbuild new file mode 100644 index 000000000000..4e65c16a445b --- /dev/null +++ b/include/xen/Kbuild @@ -0,0 +1 @@ +header-y += evtchn.h diff --git a/include/xen/events.h b/include/xen/events.h index 0d5f1adc0363..e68d59a90ca8 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq); irq will be disabled so it won't deliver an interrupt. */ void xen_poll_irq(int irq); +/* Determine the IRQ which is bound to an event channel */ +unsigned irq_from_evtchn(unsigned int evtchn); + #endif /* _XEN_EVENTS_H */ diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h new file mode 100644 index 000000000000..14e833ee4e0b --- /dev/null +++ b/include/xen/evtchn.h @@ -0,0 +1,88 @@ +/****************************************************************************** + * evtchn.h + * + * Interface to /dev/xen/evtchn. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_EVTCHN_H__ +#define __LINUX_PUBLIC_EVTCHN_H__ + +/* + * Bind a fresh port to VIRQ @virq. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_VIRQ \ + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq)) +struct ioctl_evtchn_bind_virq { + unsigned int virq; +}; + +/* + * Bind a fresh port to remote <@remote_domain, @remote_port>. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \ + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain)) +struct ioctl_evtchn_bind_interdomain { + unsigned int remote_domain, remote_port; +}; + +/* + * Allocate a fresh port for binding to @remote_domain. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \ + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port)) +struct ioctl_evtchn_bind_unbound_port { + unsigned int remote_domain; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_UNBIND \ + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind)) +struct ioctl_evtchn_unbind { + unsigned int port; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_NOTIFY \ + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify)) +struct ioctl_evtchn_notify { + unsigned int port; +}; + +/* Clear and reinitialise the event buffer. Clear error condition. */ +#define IOCTL_EVTCHN_RESET \ + _IOC(_IOC_NONE, 'E', 5, 0) + +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */ diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h index 453235e923f0..e8b6519d47e9 100644 --- a/include/xen/interface/version.h +++ b/include/xen/interface/version.h @@ -57,4 +57,7 @@ struct xen_feature_info { /* Declares the features reported by XENVER_get_features. */ #include "features.h" +/* arg == NULL; returns host memory page size. */ +#define XENVER_pagesize 7 + #endif /* __XEN_PUBLIC_VERSION_H__ */ diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index f87f9614844d..b9763badbd77 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -91,8 +91,7 @@ struct xenbus_driver { void (*otherend_changed)(struct xenbus_device *dev, enum xenbus_state backend_state); int (*remove)(struct xenbus_device *dev); - int (*suspend)(struct xenbus_device *dev); - int (*suspend_cancel)(struct xenbus_device *dev); + int (*suspend)(struct xenbus_device *dev, pm_message_t state); int (*resume)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, char **, int, char *, int); struct device_driver driver; diff --git a/ipc/shm.c b/ipc/shm.c index 560818353599..15dd238e5338 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -968,10 +968,13 @@ SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *next; + struct vm_area_struct *vma; unsigned long addr = (unsigned long)shmaddr; - loff_t size = 0; int retval = -EINVAL; +#ifdef CONFIG_MMU + loff_t size = 0; + struct vm_area_struct *next; +#endif if (addr & ~PAGE_MASK) return retval; diff --git a/kernel/futex.c b/kernel/futex.c index d546b2d53a62..80b5ce716596 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@ * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> * + * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -96,8 +100,8 @@ struct futex_pi_state { */ struct futex_q { struct plist_node list; - /* There can only be a single waiter */ - wait_queue_head_t waiter; + /* Waiter reference */ + struct task_struct *task; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -107,7 +111,9 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - struct task_struct *task; + + /* rt_waiter storage for requeue_pi: */ + struct rt_mutex_waiter *rt_waiter; /* Bitset for the optional bitmasked wakeup */ u32 bitset; @@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, + union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (match_futex(&this->key, key)) + return this; + } + return NULL; +} + static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) { u32 curval; @@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +/** + * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Returns: + * 0 - ready to wait + * 1 - acquired the lock + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task, int set_waiters) +{ + int lock_taken, ret, ownerdied = 0; + u32 uval, newval, curval; + +retry: + ret = lock_taken = 0; + + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = task_pid_vnr(task); + if (set_waiters) + newval |= FUTEX_WAITERS; + + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + return -EDEADLK; + + /* + * Surprise - we got the lock. Just return to userspace: + */ + if (unlikely(!curval)) + return 1; + + uval = curval; + + /* + * Set the FUTEX_WAITERS flag, so the owner will know it has someone + * to wake at the next unlock. + */ + newval = curval | FUTEX_WAITERS; + + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED. We take over the futex in this + * case. We also do an unconditional take over, when the owner + * of the futex died. + * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED bit */ + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + ownerdied = 0; + lock_taken = 1; + } + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + + /* + * We took the lock due to owner died take over. + */ + if (unlikely(lock_taken)) + return 1; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, key, ps); + + if (unlikely(ret)) { + switch (ret) { + case -ESRCH: + /* + * No owner found for this futex. Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; + goto retry; + } + default: + break; + } + } + + return ret; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ static void wake_futex(struct futex_q *q) { - plist_del(&q->list, &q->list.plist); + struct task_struct *p = q->task; + /* - * The lock in wake_up_all() is a crucial memory barrier after the - * plist_del() and also before assigning to q->lock_ptr. + * We set q->lock_ptr = NULL _before_ we wake up the task. If + * a non futex wake up happens on another CPU then the task + * might exit and p would dereference a non existing task + * struct. Prevent this by holding a reference on p across the + * wake up. */ - wake_up(&q->waiter); + get_task_struct(p); + + plist_del(&q->list, &q->list.plist); /* - * The waiting task can free the futex_q as soon as this is written, - * without taking any locks. This must come last. - * - * A memory barrier is required here to prevent the following store to - * lock_ptr from getting ahead of the wakeup. Clearing the lock at the - * end of wake_up() does not prevent this store from moving. + * The waiting task can free the futex_q as soon as + * q->lock_ptr = NULL is written, without taking any locks. A + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. */ smp_wmb(); q->lock_ptr = NULL; + + wake_up_state(p, TASK_NORMAL); + put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) { + if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } @@ -802,24 +959,185 @@ out: return ret; } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb2->lock; +#endif + } + get_futex_key_refs(key2); + q->key = *key2; +} + +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q: the futex_q + * key: the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ + drop_futex_key_refs(&q->key); + get_futex_key_refs(key); + q->key = *key; + + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + wake_up_state(q->task, TASK_NORMAL); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * + * Returns: + * 0 - failed to acquire the lock atomicly + * 1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps, int set_waiters) +{ + struct futex_q *top_waiter = NULL; + u32 curval; + int ret; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + set_waiters); + if (ret == 1) + requeue_pi_wake_futex(top_waiter, key2); + + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) + int nr_wake, int nr_requeue, u32 *cmpval, + int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - int ret, drop_count = 0; + u32 curval2; + + if (requeue_pi) { + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + } retry: + if (pi_state != NULL) { + /* + * We will have to lookup the pi_state again, so free this one + * to keep the accounting correct. + */ + free_pi_state(pi_state); + pi_state = NULL; + } + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); + ret = get_futex_key(uaddr2, fshared, &key2, + requeue_pi ? VERIFY_WRITE : VERIFY_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -854,32 +1172,99 @@ retry_private: } } + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, nr_requeue); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. + */ + if (ret == 1) { + WARN_ON(pi_state); + task_count++; + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); + } + + switch (ret) { + case 0: + break; + case -EFAULT: + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval2, uaddr2); + if (!ret) + goto retry; + goto out; + case -EAGAIN: + /* The owner was exiting, try again. */ + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + head1 = &hb1->chain; plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) + if (task_count - nr_wake >= nr_requeue) + break; + + if (!match_futex(&this->key, &key1)) continue; - if (++ret <= nr_wake) { + + WARN_ON(!requeue_pi && this->rt_waiter); + WARN_ON(requeue_pi && !this->rt_waiter); + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { wake_futex(this); - } else { - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(head1 != &hb2->chain)) { - plist_del(&this->list, &hb1->chain); - plist_add(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - this->list.plist.lock = &hb2->lock; -#endif - } - this->key = key2; - get_futex_key_refs(&key2); - drop_count++; + continue; + } - if (ret - nr_wake >= nr_requeue) - break; + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* Prepare the waiter to take the rt_mutex. */ + atomic_inc(&pi_state->refcount); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task, 1); + if (ret == 1) { + /* We got the lock. */ + requeue_pi_wake_futex(this, &key2); + continue; + } else if (ret) { + /* -EDEADLK */ + this->pi_state = NULL; + free_pi_state(pi_state); + goto out_unlock; + } } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } out_unlock: @@ -899,7 +1284,9 @@ out_put_keys: out_put_key1: put_futex_key(fshared, &key1); out: - return ret; + if (pi_state != NULL) + free_pi_state(pi_state); + return ret ? ret : task_count; } /* The key must be already stored in q->key. */ @@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiter); - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1119,35 +1504,149 @@ handle_fault: */ #define FLAGS_SHARED 0x01 #define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @fshared: whether the futex is shared (1) or not (0) + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Returns: + * 1 - success, lock taken + * 0 - success, lock not taken + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, + int locked) { - struct task_struct *curr = current; - struct restart_block *restart; - DECLARE_WAITQUEUE(wait, curr); - struct futex_hash_bucket *hb; - struct futex_q q; - u32 uval; - int ret; - struct hrtimer_sleeper t; - int rem = 0; + struct task_struct *owner; + int ret = 0; - if (!bitset) - return -EINVAL; + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current, fshared); + goto out; + } - q.pi_state = NULL; - q.bitset = bitset; -retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); - if (unlikely(ret != 0)) + /* + * Catch the rare case, where the lock was released when we were on the + * way back before we locked the hash bucket. + */ + if (q->pi_state->owner == current) { + /* + * Try to get the rt_mutex now. This might fail as some other + * task acquired the rt_mutex after we removed ourself from the + * rt_mutex waiters list. + */ + if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { + locked = 1; + goto out; + } + + /* + * pi_state is incorrect, some other task did a lock steal and + * we returned due to timeout or signal without taking the + * rt_mutex. Too late. We can access the rt_mutex_owner without + * locking, as the other task is now blocked on the hash bucket + * lock. Fix the state up. + */ + owner = rt_mutex_owner(&q->pi_state->pi_mutex); + ret = fixup_pi_state_owner(uaddr, q, owner, fshared); goto out; + } -retry_private: - hb = queue_lock(&q); + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner, nor the pending owner, of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + +out: + return ret ? ret : locked; +} + +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout) +{ + queue_me(q, hb); + + /* + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults, and we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to + * rely on the futex_wake() code removing us from hash when it + * wakes us up. + */ + set_current_state(TASK_INTERRUPTIBLE); + + /* Arm the timer */ + if (timeout) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + /* + * !plist_node_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @fshared: whether the futex is shared (1) or not (0) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; /* * Access the page AFTER the hash-bucket is locked. @@ -1165,95 +1664,83 @@ retry_private: * A consequence is that futex_wait() can return zero and absorb * a wakeup when *uaddr != val on entry to the syscall. This is * rare, but normal. - * - * For shared futexes, we hold the mmap semaphore, so the mapping - * cannot have changed since we looked it up in get_futex_key. */ +retry: + q->key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); + if (unlikely(ret != 0)) + return ret; + +retry_private: + *hb = queue_lock(q); + ret = get_futex_value_locked(&uval, uaddr); - if (unlikely(ret)) { - queue_unlock(&q, hb); + if (ret) { + queue_unlock(q, *hb); ret = get_user(uval, uaddr); if (ret) - goto out_put_key; + goto out; if (!fshared) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(fshared, &q->key); goto retry; } - ret = -EWOULDBLOCK; - if (unlikely(uval != val)) { - queue_unlock(&q, hb); - goto out_put_key; - } - /* Only actually queue if *uaddr contained val. */ - queue_me(&q, hb); + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; + } - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. - */ +out: + if (ret) + put_futex_key(fshared, &q->key); + return ret; +} - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiter, &wait); - /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (likely(!plist_node_empty(&q.list))) { - if (!abs_time) - schedule(); - else { - hrtimer_init_on_stack(&t.timer, - clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, - current->timer_slack_ns); - - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; +static int futex_wait(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + struct futex_hash_bucket *hb; + struct futex_q q; + int ret; - /* - * the timer could have already expired, in which - * case current would be flagged for rescheduling. - * Don't bother calling schedule. - */ - if (likely(t.task)) - schedule(); + if (!bitset) + return -EINVAL; - hrtimer_cancel(&t.timer); + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = NULL; - /* Flag if a timeout occured */ - rem = (t.task == NULL); + if (abs_time) { + to = &timeout; - destroy_hrtimer_on_stack(&t.timer); - } + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); } - __set_current_state(TASK_RUNNING); - /* - * NOTE: we don't remove ourselves from the waitqueue because - * we are the only user of it. - */ + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out; + + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; if (!unqueue_me(&q)) goto out_put_key; ret = -ETIMEDOUT; - if (rem) + if (to && !to->task) goto out_put_key; /* @@ -1270,7 +1757,7 @@ retry_private: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = 0; + restart->futex.flags = FLAGS_HAS_TIMEOUT; if (fshared) restart->futex.flags |= FLAGS_SHARED; @@ -1282,6 +1769,10 @@ retry_private: out_put_key: put_futex_key(fshared, &q.key); out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } return ret; } @@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; - ktime_t t; + ktime_t t, *tp = NULL; - t.tv64 = restart->futex.time; + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, restart->futex.bitset, restart->futex.flags & FLAGS_CLOCKRT); } @@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; - struct task_struct *curr = current; struct futex_hash_bucket *hb; - u32 uval, newval, curval; + u32 uval; struct futex_q q; - int ret, lock_taken, ownerdied = 0; + int res, ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; + q.rt_waiter = NULL; retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); @@ -1339,81 +1833,15 @@ retry: retry_private: hb = queue_lock(&q); -retry_locked: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = task_pid_vnr(current); - - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - - /* - * Detect deadlocks. In case of REQUEUE_PI this is a valid - * situation and we return success to user space. - */ - if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { - ret = -EDEADLK; - goto out_unlock_put_key; - } - - /* - * Surprise - we got the lock. Just return to userspace: - */ - if (unlikely(!curval)) - goto out_unlock_put_key; - - uval = curval; - - /* - * Set the WAITERS flag, so the owner will know it has someone - * to wake at next unlock - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! - */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); - ownerdied = 0; - lock_taken = 1; - } - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - if (unlikely(curval != uval)) - goto retry_locked; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - goto out_unlock_put_key; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { switch (ret) { - + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; case -EAGAIN: /* * Task is exiting and we just wait for the @@ -1423,25 +1851,6 @@ retry_locked: put_futex_key(fshared, &q.key); cond_resched(); goto retry; - - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - goto uaddr_faulted; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry_locked; - } default: goto out_unlock_put_key; } @@ -1465,71 +1874,21 @@ retry_locked: } spin_lock(q.lock_ptr); - - if (!ret) { - /* - * Got the lock. We might not be the anticipated owner - * if we did a lock-steal - fix up the PI-state in - * that case: - */ - if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); - } else { - /* - * Catch the rare case, where the lock was released - * when we were on the way back before we locked the - * hash bucket. - */ - if (q.pi_state->owner == curr) { - /* - * Try to get the rt_mutex now. This might - * fail as some other task acquired the - * rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; - else { - /* - * pi_state is incorrect, some other - * task did a lock steal and we - * returned due to timeout or signal - * without taking the rt_mutex. Too - * late. We can access the - * rt_mutex_owner without locking, as - * the other task is now blocked on - * the hash bucket lock. Fix the state - * up. - */ - struct task_struct *owner; - int res; - - owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner, - fshared); - - /* propagate -EFAULT, if the fixup failed */ - if (res) - ret = res; - } - } else { - /* - * Paranoia check. If we did not take the lock - * in the trylock above, then we should not be - * the owner of the rtmutex, neither the real - * nor the pending one: - */ - if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) - printk(KERN_ERR "futex_lock_pi: ret = %d " - "pi-mutex: %p pi-state %p\n", ret, - q.pi_state->pi_mutex.owner, - q.pi_state->owner); - } - } + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock it and return the fault to userspace. + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. */ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) rt_mutex_unlock(&q.pi_state->pi_mutex); @@ -1537,9 +1896,7 @@ retry_locked: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? ret : -ERESTARTNOINTR; + goto out; out_unlock_put_key: queue_unlock(&q, hb); @@ -1549,7 +1906,7 @@ out_put_key: out: if (to) destroy_hrtimer_on_stack(&to->timer); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: /* @@ -1572,7 +1929,6 @@ uaddr_faulted: goto retry; } - /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -1674,6 +2030,229 @@ pi_faulted: return ret; } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &q->list.plist); + drop_futex_key_refs(&q->key); + + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else + ret = -ERESTARTNOINTR; + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initialyl wait on (non-pi) + * @fshared: whether the futexes are shared (1) or not (0). They must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we setup a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, + int clockrt, u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + struct futex_hash_bucket *hb; + union futex_key key2; + struct futex_q q; + int res, ret; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + + key2 = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out; + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) + goto out_key2; + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquition by the requeue code. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current, + fshared); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it + * acquired the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + /* + * We've already been requeued, but we have no way to + * restart by calling futex_lock_pi() directly. We + * could restart the syscall, but that will look at + * the user space value and return right away. So we + * drop back with EWOULDBLOCK to tell user space that + * "val" has been changed. That's the same what the + * restart of the syscall would do in + * futex_wait_setup(). + */ + ret = -EWOULDBLOCK; + } + +out_put_keys: + put_futex_key(fshared, &q.key); +out_key2: + put_futex_key(fshared, &key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, fshared = 1; clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET) + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; switch (cmd) { @@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); break; case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (futex_cmpxchg_enabled) ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; default: ret = -ENOSYS; } @@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_WAKE_OP) + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f52964..7d047808419d 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f2..13c68e71b726 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) { + if (desc->chip->eoi) desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); - } } void @@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { + if (desc->chip != &no_irq_chip) mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); - } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 26e08754744f..18041a254d32 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> @@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { - int node; void *ptr; - node = cpu_to_node(cpu); - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); + if (slab_is_available()) + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); + else + ptr = alloc_bootmem_node(NODE_DATA(node), + nr * sizeof(*desc->kstat_irqs)); /* * don't overwite if can not get new one * init_copy_kstat_irqs() could still use old one */ if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", - cpu, node); + printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); desc->kstat_irqs = ptr; } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP - desc->cpu = cpu; + desc->node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, cpu, nr_cpu_ids); + init_kstat_irqs(desc, node, nr_cpu_ids); if (!desc->kstat_irqs) { printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } - arch_init_chip_data(desc, cpu); + init_desc_masks(desc); + arch_init_chip_data(desc, node); } /* @@ -169,7 +173,8 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } @@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; - int node; if (irq >= nr_irqs) { WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) if (desc) goto out_unlock; - node = cpu_to_node(cpu); - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", - irq, cpu, node); + if (slab_is_available()) + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + else + desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + + printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - init_one_irq_desc(irq, desc, cpu); + init_one_irq_desc(irq, desc, node); irq_desc_ptrs[irq] = desc; @@ -256,7 +262,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq = i; - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; } return arch_early_irq_init(); @@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } @@ -453,11 +460,8 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - /* get new one */ - desc = irq_remap_to_desc(irq, desc); - } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -468,10 +472,8 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); - } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20eab38f..73468253143b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; @@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); +extern void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca59243..aaf5c9d05770 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -static void +void irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) { struct irqaction *action = desc->action; @@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) - desc->chip->set_affinity(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } + } else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(desc->affinity, cpumask); - desc->chip->set_affinity(irq, cpumask); + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } #endif - irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9be43b7..cfe767ca1545 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,5 +1,8 @@ #include <linux/irq.h> +#include <linux/interrupt.h> + +#include "internals.h" void move_masked_irq(int irq) { @@ -39,11 +42,12 @@ void move_masked_irq(int irq) * masking the irqs. */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - cpumask_and(desc->affinity, - desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, desc->affinity); - } + < nr_cpu_ids)) + if (!desc->chip->set_affinity(irq, desc->pending_mask)) { + cpumask_copy(desc->affinity, desc->pending_mask); + irq_set_thread_affinity(desc, desc->pending_mask); + } + cpumask_clear(desc->pending_mask); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d2..2f69bee57bf2 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, - int cpu, int nr) + int node, int nr) { - init_kstat_irqs(desc, cpu, nr); + init_kstat_irqs(desc, node, nr); if (desc->kstat_irqs != old_desc->kstat_irqs) memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) } static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; } spin_lock_init(&desc->lock); - desc->cpu = cpu; + desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, cpu); + arch_init_copy_chip_data(old_desc, desc, node); return true; } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) } static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int cpu) + int node) { struct irq_desc *desc; unsigned int irq; unsigned long flags; - int node; irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, if (desc && old_desc != desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { /* still use old one */ kfree(desc); desc = old_desc; @@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, /* free the old one */ free_one_irq_desc(old_desc, desc); - spin_unlock(&old_desc->lock); kfree(old_desc); - spin_lock(&desc->lock); return desc; @@ -109,24 +105,14 @@ out_unlock: return desc; } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - int old_cpu; - int node, old_node; - /* those all static, do move them */ if (desc->irq < NR_IRQS_LEGACY) return desc; - old_cpu = desc->cpu; - if (old_cpu != cpu) { - node = cpu_to_node(cpu); - old_node = cpu_to_node(old_cpu); - if (old_node != node) - desc = __real_move_irq_desc(desc, cpu); - else - desc->cpu = cpu; - } + if (desc->node != node) + desc = __real_move_irq_desc(desc, node); return desc; } diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f1..e5cc0cd28d54 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - __schedule(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } @@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock) return ret; } - EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ffa..820c5af44f3e 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * assigned pending owner [which might not have taken the * lock yet]: */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, + struct task_struct *task) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; @@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) if (!rt_mutex_owner_pending(lock)) return 0; - if (pendowner == current) + if (pendowner == task) return 1; spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { + if (task->prio >= pendowner->prio) { spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * We are going to steal the lock and a waiter was * enqueued on the pending owners pi_waiters queue. So * we have to enqueue this waiter into - * current->pi_waiters list. This covers the case, - * where current is boosted because it holds another + * task->pi_waiters list. This covers the case, + * where task is boosted because it holds another * lock and gets unboosted because the booster is * interrupted, so we would delay a waiter with higher - * priority as current->normal_prio. + * priority as task->normal_prio. * * Note: in the rare case of a SCHED_OTHER task changing * its priority and thus stealing the lock, next->task - * might be current: + * might be task: */ - if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); - plist_add(&next->pi_list_entry, ¤t->pi_waiters); - __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); + if (likely(next->task != task)) { + spin_lock_irqsave(&task->pi_lock, flags); + plist_add(&next->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) return 0; /* We got the lock. */ @@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(¤t->pi_lock, flags); - __rt_mutex_adjust_prio(current); - waiter->task = current; + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); plist_add(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = waiter; + task->pi_blocked_on = waiter; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { spin_lock_irqsave(&owner->pi_lock, flags); @@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); + task); spin_lock(&lock->wait_lock); @@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task) rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } -/* - * Slow path lock function: +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: passed to task_blocks_on_rt_mutex + * + * lock->wait_lock must be held by the caller. */ static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter, + int detect_deadlock) { - struct rt_mutex_waiter waiter; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; - - spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - for (;;) { /* Try to acquire the lock: */ if (try_to_take_rt_mutex(lock)) @@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, } /* - * waiter.task is NULL the first time we come here and + * waiter->task is NULL the first time we come here and * when we have been woken up by the previous owner * but the lock got stolen by a higher prio task. */ - if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, + if (!waiter->task) { + ret = task_blocks_on_rt_mutex(lock, waiter, current, detect_deadlock); /* * If we got woken up by the owner then start loop * all over without going into schedule to try * to get the lock now: */ - if (unlikely(!waiter.task)) { + if (unlikely(!waiter->task)) { /* * Reset the return value. We might * have returned with -EDEADLK and the @@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, spin_unlock(&lock->wait_lock); - debug_rt_mutex_print_deadlock(&waiter); + debug_rt_mutex_print_deadlock(waiter); - if (waiter.task) + if (waiter->task) schedule_rt_mutex(lock); spin_lock(&lock->wait_lock); set_current_state(state); } + return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, + detect_deadlock); + set_current_state(TASK_RUNNING); if (unlikely(waiter.task)) @@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller + * rt_mutex_timed_lock - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller * * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) @@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/*** +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed * @@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, } /** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + mark_rt_mutex_waiters(lock); + + if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + /* We got the lock for task. */ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task, 0); + + rt_mutex_deadlock_account_lock(lock, task); + return 1; + } + + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + + if (ret && !waiter->task) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + return ret; +} + +/** * rt_mutex_next_owner - return the next owner of the lock * * @lock: the rt lock query @@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) return rt_mutex_top_waiter(lock)->task; } + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + set_current_state(TASK_INTERRUPTIBLE); + + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, + detect_deadlock); + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter->task)) + remove_waiter(lock, waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* + * Readjust priority, when we did not get the lock. We might have been + * the pending owner and boosted. Since we did not take the lock, the + * PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + return ret; +} diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800ea..97a2f81866af 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/sched.c b/kernel/sched.c index 26efa475bdc1..076e403b9c88 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2458,6 +2464,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -2766,7 +2783,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; @@ -2856,19 +2873,72 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) { - unsigned long i, running = 0, uninterruptible = 0; + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - return running + uninterruptible; + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } } /* @@ -2899,6 +2969,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -4240,10 +4315,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return cpumask_first(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4298,8 +4489,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; @@ -4468,15 +4675,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -5007,13 +5206,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5070,15 +5271,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); @@ -5221,7 +5416,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; @@ -5241,6 +5436,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5477,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5516,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5536,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete_all(struct completion *x) { @@ -6490,8 +6697,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } @@ -6970,6 +7178,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7204,6 +7420,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7243,7 +7461,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -7753,8 +7971,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; @@ -7875,7 +8094,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -7953,7 +8172,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; @@ -8938,6 +9157,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9045,6 +9266,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ @@ -9055,6 +9279,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ @@ -9800,6 +10025,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89574cd..344712a5e3ed 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) vec->count = 0; if (bootmem) alloc_bootmem_cpumask_var(&vec->mask); - else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f119..5f9650e8fe75 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) find_matching_se(&se, &pse); - while (se) { - BUG_ON(!pse); + BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - break; - } - - se = parent_entity(se); - pse = parent_entity(pse); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c13..499672c10cbd 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8f9712..9bf0d2a73045 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index 858baac568ee..ad63d8501207 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return NOTIFY_BAD; break; diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd348511..f674f332a024 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void) return 0; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 45bd711a242e..944ba03cae19 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -743,6 +743,14 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", .data = &kstack_depth_to_print, .maxlen = sizeof(int), diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e7..52a8bf8931f3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c31..a26ed294f938 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1123,47 +1123,6 @@ void update_process_times(int user_tick) } /* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - -/* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) @@ -1187,16 +1146,6 @@ void run_local_timers(void) } /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - -/* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. * jiffies is defined in the linker script... @@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; - unsigned long seq; + struct timespec tp; memset(info, 0, sizeof(struct sysinfo)); - do { - struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ - - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c8..ea7c3b4275cf 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/lib/cpumask.c b/lib/cpumask.c index 1f71b97de0f9..eb23aaa0c7b8 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -119,6 +119,12 @@ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) } EXPORT_SYMBOL(alloc_cpumask_var_node); +bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) +{ + return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); +} +EXPORT_SYMBOL(zalloc_cpumask_var_node); + /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned @@ -135,6 +141,12 @@ bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) } EXPORT_SYMBOL(alloc_cpumask_var); +bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +{ + return alloc_cpumask_var(mask, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(zalloc_cpumask_var); + /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 69da09a085a1..ad65fc0317d9 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -23,9 +23,11 @@ #include <linux/dma-debug.h> #include <linux/spinlock.h> #include <linux/debugfs.h> +#include <linux/uaccess.h> #include <linux/device.h> #include <linux/types.h> #include <linux/sched.h> +#include <linux/ctype.h> #include <linux/list.h> #include <linux/slab.h> @@ -85,6 +87,7 @@ static u32 show_num_errors = 1; static u32 num_free_entries; static u32 min_free_entries; +static u32 nr_total_entries; /* number of preallocated entries requested by kernel cmdline */ static u32 req_entries; @@ -97,6 +100,16 @@ static struct dentry *show_all_errors_dent __read_mostly; static struct dentry *show_num_errors_dent __read_mostly; static struct dentry *num_free_entries_dent __read_mostly; static struct dentry *min_free_entries_dent __read_mostly; +static struct dentry *filter_dent __read_mostly; + +/* per-driver filter related state */ + +#define NAME_MAX_LEN 64 + +static char current_driver_name[NAME_MAX_LEN] __read_mostly; +static struct device_driver *current_driver __read_mostly; + +static DEFINE_RWLOCK(driver_name_lock); static const char *type2name[4] = { "single", "page", "scather-gather", "coherent" }; @@ -104,6 +117,11 @@ static const char *type2name[4] = { "single", "page", static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; +/* little merge helper - remove it after the merge window */ +#ifndef BUS_NOTIFY_UNBOUND_DRIVER +#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 +#endif + /* * The access to some variables in this macro is racy. We can't use atomic_t * here because all these variables are exported to debugfs. Some of them even @@ -121,15 +139,54 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) { #ifdef CONFIG_STACKTRACE if (entry) { - printk(KERN_WARNING "Mapped at:\n"); + pr_warning("Mapped at:\n"); print_stack_trace(&entry->stacktrace, 0); } #endif } +static bool driver_filter(struct device *dev) +{ + struct device_driver *drv; + unsigned long flags; + bool ret; + + /* driver filter off */ + if (likely(!current_driver_name[0])) + return true; + + /* driver filter on and initialized */ + if (current_driver && dev->driver == current_driver) + return true; + + if (current_driver || !current_driver_name[0]) + return false; + + /* driver filter on but not yet initialized */ + drv = get_driver(dev->driver); + if (!drv) + return false; + + /* lock to protect against change of current_driver_name */ + read_lock_irqsave(&driver_name_lock, flags); + + ret = false; + if (drv->name && + strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) { + current_driver = drv; + ret = true; + } + + read_unlock_irqrestore(&driver_name_lock, flags); + put_driver(drv); + + return ret; +} + #define err_printk(dev, entry, format, arg...) do { \ error_count += 1; \ - if (show_all_errors || show_num_errors > 0) { \ + if (driver_filter(dev) && \ + (show_all_errors || show_num_errors > 0)) { \ WARN(1, "%s %s: " format, \ dev_driver_string(dev), \ dev_name(dev) , ## arg); \ @@ -185,15 +242,50 @@ static void put_hash_bucket(struct hash_bucket *bucket, static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket, struct dma_debug_entry *ref) { - struct dma_debug_entry *entry; + struct dma_debug_entry *entry, *ret = NULL; + int matches = 0, match_lvl, last_lvl = 0; list_for_each_entry(entry, &bucket->list, list) { - if ((entry->dev_addr == ref->dev_addr) && - (entry->dev == ref->dev)) + if ((entry->dev_addr != ref->dev_addr) || + (entry->dev != ref->dev)) + continue; + + /* + * Some drivers map the same physical address multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therfore we implement a + * best-fit algorithm here which returns the entry from + * the hash which fits best to the reference value + * instead of the first-fit. + */ + matches += 1; + match_lvl = 0; + entry->size == ref->size ? ++match_lvl : match_lvl; + entry->type == ref->type ? ++match_lvl : match_lvl; + entry->direction == ref->direction ? ++match_lvl : match_lvl; + + if (match_lvl == 3) { + /* perfect-fit - return the result */ return entry; + } else if (match_lvl > last_lvl) { + /* + * We found an entry that fits better then the + * previous one + */ + last_lvl = match_lvl; + ret = entry; + } } - return NULL; + /* + * If we have multiple matches but no perfect-fit, just return + * NULL. + */ + ret = (matches == 1) ? ret : NULL; + + return ret; } /* @@ -257,6 +349,21 @@ static void add_dma_entry(struct dma_debug_entry *entry) put_hash_bucket(bucket, &flags); } +static struct dma_debug_entry *__dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + + entry = list_entry(free_entries.next, struct dma_debug_entry, list); + list_del(&entry->list); + memset(entry, 0, sizeof(*entry)); + + num_free_entries -= 1; + if (num_free_entries < min_free_entries) + min_free_entries = num_free_entries; + + return entry; +} + /* struct dma_entry allocator * * The next two functions implement the allocator for @@ -270,15 +377,12 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_lock_irqsave(&free_entries_lock, flags); if (list_empty(&free_entries)) { - printk(KERN_ERR "DMA-API: debugging out of memory " - "- disabling\n"); + pr_err("DMA-API: debugging out of memory - disabling\n"); global_disable = true; goto out; } - entry = list_entry(free_entries.next, struct dma_debug_entry, list); - list_del(&entry->list); - memset(entry, 0, sizeof(*entry)); + entry = __dma_entry_alloc(); #ifdef CONFIG_STACKTRACE entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; @@ -286,9 +390,6 @@ static struct dma_debug_entry *dma_entry_alloc(void) entry->stacktrace.skip = 2; save_stack_trace(&entry->stacktrace); #endif - num_free_entries -= 1; - if (num_free_entries < min_free_entries) - min_free_entries = num_free_entries; out: spin_unlock_irqrestore(&free_entries_lock, flags); @@ -310,6 +411,53 @@ static void dma_entry_free(struct dma_debug_entry *entry) spin_unlock_irqrestore(&free_entries_lock, flags); } +int dma_debug_resize_entries(u32 num_entries) +{ + int i, delta, ret = 0; + unsigned long flags; + struct dma_debug_entry *entry; + LIST_HEAD(tmp); + + spin_lock_irqsave(&free_entries_lock, flags); + + if (nr_total_entries < num_entries) { + delta = num_entries - nr_total_entries; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + for (i = 0; i < delta; i++) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + break; + + list_add_tail(&entry->list, &tmp); + } + + spin_lock_irqsave(&free_entries_lock, flags); + + list_splice(&tmp, &free_entries); + nr_total_entries += i; + num_free_entries += i; + } else { + delta = nr_total_entries - num_entries; + + for (i = 0; i < delta && !list_empty(&free_entries); i++) { + entry = __dma_entry_alloc(); + kfree(entry); + } + + nr_total_entries -= i; + } + + if (nr_total_entries != num_entries) + ret = 1; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + return ret; +} +EXPORT_SYMBOL(dma_debug_resize_entries); + /* * DMA-API debugging init code * @@ -334,8 +482,7 @@ static int prealloc_memory(u32 num_entries) num_free_entries = num_entries; min_free_entries = num_entries; - printk(KERN_INFO "DMA-API: preallocated %d debug entries\n", - num_entries); + pr_info("DMA-API: preallocated %d debug entries\n", num_entries); return 0; @@ -349,11 +496,102 @@ out_err: return -ENOMEM; } +static ssize_t filter_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN + 1]; + unsigned long flags; + int len; + + if (!current_driver_name[0]) + return 0; + + /* + * We can't copy to userspace directly because current_driver_name can + * only be read under the driver_name_lock with irqs disabled. So + * create a temporary copy first. + */ + read_lock_irqsave(&driver_name_lock, flags); + len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name); + read_unlock_irqrestore(&driver_name_lock, flags); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t filter_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN]; + unsigned long flags; + size_t len; + int i; + + /* + * We can't copy from userspace directly. Access to + * current_driver_name is protected with a write_lock with irqs + * disabled. Since copy_from_user can fault and may sleep we + * need to copy to temporary buffer first + */ + len = min(count, (size_t)(NAME_MAX_LEN - 1)); + if (copy_from_user(buf, userbuf, len)) + return -EFAULT; + + buf[len] = 0; + + write_lock_irqsave(&driver_name_lock, flags); + + /* + * Now handle the string we got from userspace very carefully. + * The rules are: + * - only use the first token we got + * - token delimiter is everything looking like a space + * character (' ', '\n', '\t' ...) + * + */ + if (!isalnum(buf[0])) { + /* + * If the first character userspace gave us is not + * alphanumerical then assume the filter should be + * switched off. + */ + if (current_driver_name[0]) + pr_info("DMA-API: switching off dma-debug driver filter\n"); + current_driver_name[0] = 0; + current_driver = NULL; + goto out_unlock; + } + + /* + * Now parse out the first token and use it as the name for the + * driver to filter for. + */ + for (i = 0; i < NAME_MAX_LEN; ++i) { + current_driver_name[i] = buf[i]; + if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) + break; + } + current_driver_name[i] = 0; + current_driver = NULL; + + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + +out_unlock: + write_unlock_irqrestore(&driver_name_lock, flags); + + return count; +} + +const struct file_operations filter_fops = { + .read = filter_read, + .write = filter_write, +}; + static int dma_debug_fs_init(void) { dma_debug_dent = debugfs_create_dir("dma-api", NULL); if (!dma_debug_dent) { - printk(KERN_ERR "DMA-API: can not create debugfs directory\n"); + pr_err("DMA-API: can not create debugfs directory\n"); return -ENOMEM; } @@ -392,6 +630,11 @@ static int dma_debug_fs_init(void) if (!min_free_entries_dent) goto out_err; + filter_dent = debugfs_create_file("driver_filter", 0644, + dma_debug_dent, NULL, &filter_fops); + if (!filter_dent) + goto out_err; + return 0; out_err: @@ -400,9 +643,64 @@ out_err: return -ENOMEM; } +static int device_dma_allocations(struct device *dev) +{ + struct dma_debug_entry *entry; + unsigned long flags; + int count = 0, i; + + local_irq_save(flags); + + for (i = 0; i < HASH_SIZE; ++i) { + spin_lock(&dma_entry_hash[i].lock); + list_for_each_entry(entry, &dma_entry_hash[i].list, list) { + if (entry->dev == dev) + count += 1; + } + spin_unlock(&dma_entry_hash[i].lock); + } + + local_irq_restore(flags); + + return count; +} + +static int dma_debug_device_change(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + int count; + + + switch (action) { + case BUS_NOTIFY_UNBOUND_DRIVER: + count = device_dma_allocations(dev); + if (count == 0) + break; + err_printk(dev, NULL, "DMA-API: device driver has pending " + "DMA allocations while released from device " + "[count=%d]\n", count); + break; + default: + break; + } + + return 0; +} + void dma_debug_add_bus(struct bus_type *bus) { - /* FIXME: register notifier */ + struct notifier_block *nb; + + nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); + if (nb == NULL) { + pr_err("dma_debug_add_bus: out of memory\n"); + return; + } + + nb->notifier_call = dma_debug_device_change; + + bus_register_notifier(bus, nb); } /* @@ -421,8 +719,7 @@ void dma_debug_init(u32 num_entries) } if (dma_debug_fs_init() != 0) { - printk(KERN_ERR "DMA-API: error creating debugfs entries " - "- disabling\n"); + pr_err("DMA-API: error creating debugfs entries - disabling\n"); global_disable = true; return; @@ -432,14 +729,15 @@ void dma_debug_init(u32 num_entries) num_entries = req_entries; if (prealloc_memory(num_entries) != 0) { - printk(KERN_ERR "DMA-API: debugging out of memory error " - "- disabled\n"); + pr_err("DMA-API: debugging out of memory error - disabled\n"); global_disable = true; return; } - printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n"); + nr_total_entries = num_free_entries; + + pr_info("DMA-API: debugging enabled by kernel config\n"); } static __init int dma_debug_cmdline(char *str) @@ -448,8 +746,7 @@ static __init int dma_debug_cmdline(char *str) return -EINVAL; if (strncmp(str, "off", 3) == 0) { - printk(KERN_INFO "DMA-API: debugging disabled on kernel " - "command line\n"); + pr_info("DMA-API: debugging disabled on kernel command line\n"); global_disable = true; } @@ -723,15 +1020,15 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; entry->paddr = sg_phys(s); - entry->size = s->length; - entry->dev_addr = s->dma_address; + entry->size = sg_dma_len(s); + entry->dev_addr = sg_dma_address(s); entry->direction = direction; entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; if (!PageHighMem(sg_page(s))) { check_for_stack(dev, sg_virt(s)); - check_for_illegal_area(dev, sg_virt(s), s->length); + check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); } add_dma_entry(entry); @@ -739,13 +1036,33 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, } EXPORT_SYMBOL(debug_dma_map_sg); +static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s) +{ + struct dma_debug_entry *entry, ref; + struct hash_bucket *bucket; + unsigned long flags; + int mapped_ents; + + ref.dev = dev; + ref.dev_addr = sg_dma_address(s); + ref.size = sg_dma_len(s), + + bucket = get_hash_bucket(&ref, &flags); + entry = hash_bucket_find(bucket, &ref); + mapped_ents = 0; + + if (entry) + mapped_ents = entry->sg_mapped_ents; + put_hash_bucket(bucket, &flags); + + return mapped_ents; +} + void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { - struct dma_debug_entry *entry; struct scatterlist *s; int mapped_ents = 0, i; - unsigned long flags; if (unlikely(global_disable)) return; @@ -756,8 +1073,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, .type = dma_debug_sg, .dev = dev, .paddr = sg_phys(s), - .dev_addr = s->dma_address, - .size = s->length, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), .direction = dir, .sg_call_ents = 0, }; @@ -765,14 +1082,9 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, if (mapped_ents && i >= mapped_ents) break; - if (mapped_ents == 0) { - struct hash_bucket *bucket; + if (!i) { ref.sg_call_ents = nelems; - bucket = get_hash_bucket(&ref, &flags); - entry = hash_bucket_find(bucket, &ref); - if (entry) - mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); + mapped_ents = get_nr_mapped_entries(dev, s); } check_unmap(&ref); @@ -874,14 +1186,20 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction) { struct scatterlist *s; - int i; + int mapped_ents = 0, i; if (unlikely(global_disable)) return; for_each_sg(sg, s, nelems, i) { - check_sync(dev, s->dma_address, s->dma_length, 0, - direction, true); + if (!i) + mapped_ents = get_nr_mapped_entries(dev, s); + + if (i >= mapped_ents) + break; + + check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0, + direction, true); } } EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); @@ -890,15 +1208,39 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction) { struct scatterlist *s; - int i; + int mapped_ents = 0, i; if (unlikely(global_disable)) return; for_each_sg(sg, s, nelems, i) { - check_sync(dev, s->dma_address, s->dma_length, 0, - direction, false); + if (!i) + mapped_ents = get_nr_mapped_entries(dev, s); + + if (i >= mapped_ents) + break; + + check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0, + direction, false); } } EXPORT_SYMBOL(debug_dma_sync_sg_for_device); +static int __init dma_debug_driver_setup(char *str) +{ + int i; + + for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) { + current_driver_name[i] = *str; + if (*str == 0) + break; + } + + if (current_driver_name[0]) + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + + + return 1; +} +__setup("dma_debug_driver=", dma_debug_driver_setup); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 2b0b5a7d2ced..bffe6d7ef9d9 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -60,8 +60,8 @@ enum dma_sync_target { int swiotlb_force; /* - * Used to do a quick range check in swiotlb_unmap_single and - * swiotlb_sync_single_*, to see if the memory was in fact allocated by this + * Used to do a quick range check in unmap_single and + * sync_single_*, to see if the memory was in fact allocated by this * API. */ static char *io_tlb_start, *io_tlb_end; @@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } @@ -140,9 +140,15 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); } -static void *swiotlb_bus_to_virt(dma_addr_t address) +void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address) { - return phys_to_virt(swiotlb_bus_to_phys(address)); + return phys_to_virt(swiotlb_bus_to_phys(hwdev, address)); +} + +int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, + dma_addr_t addr, size_t size) +{ + return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); } int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) @@ -309,10 +315,10 @@ cleanup1: return -ENOMEM; } -static int +static inline int address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) { - return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); + return swiotlb_arch_address_needs_mapping(hwdev, addr, size); } static inline int range_needs_mapping(phys_addr_t paddr, size_t size) @@ -341,7 +347,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, unsigned long flags; while (size) { - sz = min(PAGE_SIZE - offset, size); + sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); buffer = kmap_atomic(pfn_to_page(pfn), @@ -476,7 +482,7 @@ found: * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ static void -unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -560,7 +566,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, size)) { /* * The allocated memory isn't reachable by the device. - * Fall back on swiotlb_map_single(). */ free_pages((unsigned long) ret, order); ret = NULL; @@ -568,9 +573,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (!ret) { /* * We are either out of memory or the device can't DMA - * to GFP_DMA memory; fall back on - * swiotlb_map_single(), which will grab memory from - * the lowest available address range. + * to GFP_DMA memory; fall back on map_single(), which + * will grab memory from the lowest available address range. */ ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); if (!ret) @@ -587,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, (unsigned long long)dev_addr); /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); return NULL; } *dma_handle = dev_addr; @@ -604,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long) vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -634,7 +638,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) * physical address to use is returned. * * Once the device is given the dma address, the device owns this memory until - * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. + * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. */ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, @@ -642,18 +646,17 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { phys_addr_t phys = page_to_phys(page) + offset; - void *ptr = page_address(page) + offset; dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); void *map; BUG_ON(dir == DMA_NONE); /* - * If the pointer passed in happens to be in the device's DMA window, + * If the address happens to be in the device's DMA window, * we can safely return the device addr and not worry about bounce * buffering it. */ if (!address_needs_mapping(dev, dev_addr, size) && - !range_needs_mapping(virt_to_phys(ptr), size)) + !range_needs_mapping(phys, size)) return dev_addr; /* @@ -679,23 +682,35 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); /* * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous swiotlb_map_single call. All + * match what was provided for in a previous swiotlb_map_page call. All * other usages are undefined. * * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ +static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(dma_addr)) { + do_unmap_single(hwdev, dma_addr, size, dir); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(dma_addr, size); +} + void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); - - BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) - unmap_single(hwdev, dma_addr, size, dir); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + unmap_single(hwdev, dev_addr, size, dir); } EXPORT_SYMBOL_GPL(swiotlb_unmap_page); @@ -703,7 +718,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page); * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. * - * If you perform a swiotlb_map_single() but wish to interrogate the buffer + * If you perform a swiotlb_map_page() but wish to interrogate the buffer * using the cpu, yet do not wish to teardown the dma mapping, you must * call this function before doing so. At the next point you give the dma * address back to the card, you must first perform a @@ -713,13 +728,19 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) + + if (is_swiotlb_buffer(dma_addr)) { sync_single(hwdev, dma_addr, size, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(dma_addr, size); } void @@ -746,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset; - - BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) - sync_single(hwdev, dma_addr, size, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target); } void @@ -777,7 +792,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); /* * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above swiotlb_map_single + * This is the scatter-gather version of the above swiotlb_map_page * interface. Here the scatter gather list elements are each tagged with the * appropriate dma address and length. They are obtained via * sg_dma_{address,length}(SG). @@ -788,7 +803,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); * The routine returns the number of addr/length pairs actually * used, at most nents. * - * Device ownership issues as mentioned above for swiotlb_map_single are the + * Device ownership issues as mentioned above for swiotlb_map_page are the * same here. */ int @@ -836,7 +851,7 @@ EXPORT_SYMBOL(swiotlb_map_sg); /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_single() above. + * concerning calls here are the same as for swiotlb_unmap_page() above. */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, @@ -847,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) - unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), - sg->dma_length, dir); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); - } + for_each_sg(sgl, sg, nelems, i) + unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + } EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); @@ -879,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, struct scatterlist *sg; int i; - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) - sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), + for_each_sg(sgl, sg, nelems, i) + swiotlb_sync_single(hwdev, sg->dma_address, sg->dma_length, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); - } } void diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7536acea135b..756ccafa9cec 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -408,6 +408,8 @@ enum format_type { FORMAT_TYPE_LONG_LONG, FORMAT_TYPE_ULONG, FORMAT_TYPE_LONG, + FORMAT_TYPE_UBYTE, + FORMAT_TYPE_BYTE, FORMAT_TYPE_USHORT, FORMAT_TYPE_SHORT, FORMAT_TYPE_UINT, @@ -573,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec) } static char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec) + struct printf_spec spec, char ext) { unsigned long value = (unsigned long) ptr; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - sprint_symbol(sym, value); + if (ext != 'f') + sprint_symbol(sym, value); + else + kallsyms_lookup(value, NULL, NULL, NULL, sym); return string(buf, end, sym, spec); #else spec.field_width = 2*sizeof(void *); @@ -690,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, * * Right now we handle: * - * - 'F' For symbolic function descriptor pointers + * - 'F' For symbolic function descriptor pointers with offset + * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers * - 'R' For a struct resource pointer, it prints the range of * addresses (not the name nor the flags) @@ -713,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, switch (*fmt) { case 'F': + case 'f': ptr = dereference_function_descriptor(ptr); /* Fallthrough */ case 'S': - return symbol_string(buf, end, ptr, spec); + return symbol_string(buf, end, ptr, spec, *fmt); case 'R': return resource_string(buf, end, ptr, spec); case 'm': @@ -853,11 +860,15 @@ qualifier: spec->qualifier = -1; if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { - spec->qualifier = *fmt; - ++fmt; - if (spec->qualifier == 'l' && *fmt == 'l') { - spec->qualifier = 'L'; - ++fmt; + spec->qualifier = *fmt++; + if (unlikely(spec->qualifier == *fmt)) { + if (spec->qualifier == 'l') { + spec->qualifier = 'L'; + ++fmt; + } else if (spec->qualifier == 'h') { + spec->qualifier = 'H'; + ++fmt; + } } } @@ -919,6 +930,11 @@ qualifier: spec->type = FORMAT_TYPE_SIZE_T; } else if (spec->qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; + } else if (spec->qualifier == 'H') { + if (spec->flags & SIGN) + spec->type = FORMAT_TYPE_BYTE; + else + spec->type = FORMAT_TYPE_UBYTE; } else if (spec->qualifier == 'h') { if (spec->flags & SIGN) spec->type = FORMAT_TYPE_SHORT; @@ -943,7 +959,8 @@ qualifier: * * This function follows C99 vsnprintf, but has some extensions: * %pS output the name of a text symbol - * %pF output the name of a function pointer + * %pF output the name of a function pointer with its offset + * %pf output the name of a function pointer without its offset * %pR output the address range in a struct resource * * The return value is the number of characters which would @@ -1087,6 +1104,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) case FORMAT_TYPE_PTRDIFF: num = va_arg(args, ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + num = (unsigned char) va_arg(args, int); + break; + case FORMAT_TYPE_BYTE: + num = (signed char) va_arg(args, int); + break; case FORMAT_TYPE_USHORT: num = (unsigned short) va_arg(args, int); break; @@ -1363,6 +1386,10 @@ do { \ case FORMAT_TYPE_PTRDIFF: save_arg(ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + case FORMAT_TYPE_BYTE: + save_arg(char); + break; case FORMAT_TYPE_USHORT: case FORMAT_TYPE_SHORT: save_arg(short); @@ -1391,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf); * * The format follows C99 vsnprintf, but has some extensions: * %pS output the name of a text symbol - * %pF output the name of a function pointer + * %pF output the name of a function pointer with its offset + * %pf output the name of a function pointer without its offset * %pR output the address range in a struct resource * %n is ignored * @@ -1538,6 +1566,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) case FORMAT_TYPE_PTRDIFF: num = get_arg(ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + num = get_arg(unsigned char); + break; + case FORMAT_TYPE_BYTE: + num = get_arg(signed char); + break; case FORMAT_TYPE_USHORT: num = get_arg(unsigned short); break; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe753ecf2aa5..474c7e9dd51a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve; static int __meminitdata nr_nodemap_entries; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; - static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; @@ -3103,64 +3099,6 @@ void __init sparse_memory_present_with_active_regions(int nid) } /** - * push_node_boundaries - Push node boundaries to at least the requested boundary - * @nid: The nid of the node to push the boundary for - * @start_pfn: The start pfn of the node - * @end_pfn: The end pfn of the node - * - * In reserve-based hot-add, mem_map is allocated that is unused until hotadd - * time. Specifically, on x86_64, SRAT will report ranges that can potentially - * be hotplugged even though no physical memory exists. This function allows - * an arch to push out the node boundaries so mem_map is allocated that can - * be used later. - */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering push_node_boundaries(%u, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Initialise the boundary for this node if necessary */ - if (node_boundary_end_pfn[nid] == 0) - node_boundary_start_pfn[nid] = -1UL; - - /* Update the boundaries */ - if (node_boundary_start_pfn[nid] > start_pfn) - node_boundary_start_pfn[nid] = start_pfn; - if (node_boundary_end_pfn[nid] < end_pfn) - node_boundary_end_pfn[nid] = end_pfn; -} - -/* If necessary, push the node boundary out for reserve hotadd */ -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering account_node_boundary(%u, %lu, %lu)\n", - nid, *start_pfn, *end_pfn); - - /* Return if boundary information has not been provided */ - if (node_boundary_end_pfn[nid] == 0) - return; - - /* Check the boundaries and update if necessary */ - if (node_boundary_start_pfn[nid] < *start_pfn) - *start_pfn = node_boundary_start_pfn[nid]; - if (node_boundary_end_pfn[nid] > *end_pfn) - *end_pfn = node_boundary_end_pfn[nid]; -} -#else -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) {} - -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) {} -#endif - - -/** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. * @start_pfn: Passed by reference. On return, it will have the node start_pfn. @@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, if (*start_pfn == -1UL) *start_pfn = 0; - - /* Push the node boundaries out if requested */ - account_node_boundary(nid, start_pfn, end_pfn); } /* @@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void) { memset(early_node_map, 0, sizeof(early_node_map)); nr_nodemap_entries = 0; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); - memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ } /* Compare two active node_active_regions */ diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8fbca12..c0b2c1a76e81 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -23,7 +23,7 @@ * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers UNIT_SIZE apart. + * percpu base registers pcpu_unit_size apart. * * There are usually many small percpu allocations many of them as * small as 4 bytes. The allocator organizes chunks into lists @@ -38,8 +38,8 @@ * region and negative allocated. Allocation inside a chunk is done * by scanning this map sequentially and serving the first matching * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks are also linked into a rb tree to ease address to chunk - * mapping during free. + * Chunks can be determined from the address using the index field + * in the page struct. The index field contains a pointer to the chunk. * * To use this allocator, arch code should do the followings. * @@ -61,7 +61,6 @@ #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> -#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> @@ -88,7 +87,6 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ - struct rb_node rb_node; /* key is chunk->vm->addr */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ struct vm_struct *vm; /* mapped vmalloc region */ @@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* optional reserved chunk, only accessible for reserved allocations */ +/* + * The first chunk which always exists. Note that unlike other + * chunks, this one can be allocated and mapped in several different + * ways and thus often doesn't live in the vmalloc area. + */ +static struct pcpu_chunk *pcpu_first_chunk; + +/* + * Optional reserved chunk. This chunk reserves part of the first + * chunk and serves it for reserved allocations. The amount of + * reserved offset is in pcpu_reserved_chunk_limit. When reserved + * area doesn't exist, the following variables contain NULL and 0 + * respectively. + */ static struct pcpu_chunk *pcpu_reserved_chunk; -/* offset limit of the reserved chunk */ static int pcpu_reserved_chunk_limit; /* @@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit; * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former * protects allocation/reclaim paths, chunks and chunk->page arrays. * The latter is a spinlock and protects the index data structures - - * chunk slots, rbtree, chunks and area maps in chunks. + * chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ -static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); @@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; } +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; +} + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } } -static struct rb_node **pcpu_chunk_rb_search(void *addr, - struct rb_node **parentp) -{ - struct rb_node **p = &pcpu_addr_root.rb_node; - struct rb_node *parent = NULL; - struct pcpu_chunk *chunk; - - while (*p) { - parent = *p; - chunk = rb_entry(parent, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) - p = &(*p)->rb_left; - else if (addr > chunk->vm->addr) - p = &(*p)->rb_right; - else - break; - } - - if (parentp) - *parentp = parent; - return p; -} - /** - * pcpu_chunk_addr_search - search for chunk containing specified address - * @addr: address to search for - * - * Look for chunk which might contain @addr. More specifically, it - * searchs for the chunk with the highest start address which isn't - * beyond @addr. - * - * CONTEXT: - * pcpu_lock. + * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - struct rb_node *n, *parent; - struct pcpu_chunk *chunk; + void *first_start = pcpu_first_chunk->vm->addr; - /* is it in the reserved chunk? */ - if (pcpu_reserved_chunk) { - void *start = pcpu_reserved_chunk->vm->addr; - - if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + /* is it in the first chunk? */ + if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + /* is it in the reserved area? */ + if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; + return pcpu_first_chunk; } - /* nah... search the regular ones */ - n = *pcpu_chunk_rb_search(addr, &parent); - if (!n) { - /* no exactly matching chunk, the parent is the closest */ - n = parent; - BUG_ON(!n); - } - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) { - /* the parent was the next one, look for the previous one */ - n = rb_prev(n); - BUG_ON(!n); - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - } - - return chunk; -} - -/** - * pcpu_chunk_addr_insert - insert chunk into address rb tree - * @new: chunk to insert - * - * Insert @new into address rb tree. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) -{ - struct rb_node **p, *parent; - - p = pcpu_chunk_rb_search(new->vm->addr, &parent); - BUG_ON(*p); - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &pcpu_addr_root); + return pcpu_get_page_chunk(vmalloc_to_page(addr)); } /** @@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) alloc_mask, 0); if (!*pagep) goto err; + pcpu_set_page_chunk(*pagep, chunk); } } @@ -879,7 +834,6 @@ restart: spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); - pcpu_chunk_addr_insert(chunk); goto restart; area_found: @@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work) if (chunk == list_first_entry(head, struct pcpu_chunk, list)) continue; - rb_erase(&chunk->rb_node, &pcpu_addr_root); list_move(&chunk->list, &todo); } @@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (reserved_size) { schunk->free_size = reserved_size; - pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + pcpu_reserved_chunk = schunk; + pcpu_reserved_chunk_limit = static_size + reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ @@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; - pcpu_reserved_chunk_limit = static_size + schunk->free_size; - /* init dynamic chunk if necessary */ if (dyn_size) { dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); @@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - if (!dchunk) { - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); - } else { - pcpu_chunk_relocate(dchunk, -1); - pcpu_chunk_addr_insert(dchunk); - } + pcpu_first_chunk = dchunk ?: schunk; + pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index cc29b44b1500..e5becb92b3e7 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -167,6 +167,9 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, struct tcf_exts e; int err; + if (!tca[TCA_OPTIONS]) + return -EINVAL; + if (head == NULL) { if (!handle) return -EINVAL; diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index cba61ca403ca..2b706617c89a 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -188,20 +188,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@ # --------------------------------------------------------------------------- quiet_cmd_gzip = GZIP $@ -cmd_gzip = gzip -f -9 < $< > $@ +cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \ + (rm -f $@ ; false) # Bzip2 # --------------------------------------------------------------------------- -# Bzip2 does not include size in file... so we have to fake that -size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size - -quiet_cmd_bzip2 = BZIP2 $@ -cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false) +# Bzip2 and LZMA do not include size in file... so we have to fake that; +# append the size as a 32-bit littleendian number as gzip does. +size_append = echo -ne $(shell \ +dec_size=0; \ +for F in $1; do \ + fsize=$$(stat -c "%s" $$F); \ + dec_size=$$(expr $$dec_size + $$fsize); \ +done; \ +printf "%08x" $$dec_size | \ + sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g' \ +) + +quiet_cmd_bzip2 = BZIP2 $@ +cmd_bzip2 = (cat $(filter-out FORCE,$^) | \ + bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) # Lzma # --------------------------------------------------------------------------- quiet_cmd_lzma = LZMA $@ -cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false) +cmd_lzma = (cat $(filter-out FORCE,$^) | \ + lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) diff --git a/scripts/bin_size b/scripts/bin_size deleted file mode 100644 index 43e1b360cee6..000000000000 --- a/scripts/bin_size +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -if [ $# = 0 ] ; then - echo Usage: $0 file -fi - -size_dec=`stat -c "%s" $1` -size_hex_echo_string=`printf "%08x" $size_dec | - sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'` -/bin/echo -ne $size_hex_echo_string diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4293528200b3..4d0dd390aa50 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2301,7 +2301,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, bad_pfn = page_to_pfn(bad_page); - if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; } |