diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-17 00:20:36 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-17 00:20:36 +0200 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/i386 | |
download | linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.xz linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/i386')
264 files changed, 80767 insertions, 0 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig new file mode 100644 index 000000000000..17a0cbce6f30 --- /dev/null +++ b/arch/i386/Kconfig @@ -0,0 +1,1269 @@ +# +# For a description of the syntax of this configuration file, +# see Documentation/kbuild/kconfig-language.txt. +# + +mainmenu "Linux Kernel Configuration" + +config X86 + bool + default y + help + This is Linux's home port. Linux was originally native to the Intel + 386, and runs on all the later x86 processors including the Intel + 486, 586, Pentiums, and various instruction-set-compatible chips by + AMD, Cyrix, and others. + +config MMU + bool + default y + +config SBUS + bool + +config UID16 + bool + default y + +config GENERIC_ISA_DMA + bool + default y + +config GENERIC_IOMAP + bool + default y + +source "init/Kconfig" + +menu "Processor type and features" + +choice + prompt "Subarchitecture Type" + default X86_PC + +config X86_PC + bool "PC-compatible" + help + Choose this option if your computer is a standard PC or compatible. + +config X86_ELAN + bool "AMD Elan" + help + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + + If unsure, choose "PC-compatible" instead. + +config X86_VOYAGER + bool "Voyager (NCR)" + help + Voyager is an MCA-based 32-way capable SMP architecture proprietary + to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based. + + *** WARNING *** + + If you do not specifically know you have a Voyager based machine, + say N here, otherwise the kernel you build will not be bootable. + +config X86_NUMAQ + bool "NUMAQ (IBM/Sequent)" + select DISCONTIGMEM + select NUMA + help + This option is used for getting Linux to run on a (IBM/Sequent) NUMA + multiquad box. This changes the way that processors are bootstrapped, + and uses Clustered Logical APIC addressing mode instead of Flat Logical. + You will need a new lynxer.elf file to flash your firmware with - send + email to <Martin.Bligh@us.ibm.com>. 
+ +config X86_SUMMIT + bool "Summit/EXA (IBM x440)" + depends on SMP + help + This option is needed for IBM systems that use the Summit/EXA chipset. + In particular, it is needed for the x440. + + If you don't have one of these computers, you should say N here. + +config X86_BIGSMP + bool "Support for other sub-arch SMP systems with more than 8 CPUs" + depends on SMP + help + This option is needed for the systems that have more than 8 CPUs + and if the system is not of any sub-arch type above. + + If you don't have such a system, you should say N here. + +config X86_VISWS + bool "SGI 320/540 (Visual Workstation)" + help + The SGI Visual Workstation series is an IA32-based workstation + based on SGI systems chips with some legacy PC hardware attached. + + Say Y here to create a kernel to run on the SGI 320 or 540. + + A kernel compiled for the Visual Workstation will not run on PCs + and vice versa. See <file:Documentation/sgi-visws.txt> for details. + +config X86_GENERICARCH + bool "Generic architecture (Summit, bigsmp, ES7000, default)" + depends on SMP + help + This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. + It is intended for a generic binary kernel. + +config X86_ES7000 + bool "Support for Unisys ES7000 IA32 series" + depends on SMP + help + Support for Unisys ES7000 systems. Say 'Y' here if this kernel is + supposed to run on an IA32-based Unisys ES7000 system. + Only choose this option if you have such a system, otherwise you + should say N here. 
+ +endchoice + +config ACPI_SRAT + bool + default y + depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) + +config X86_SUMMIT_NUMA + bool + default y + depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) + +config X86_CYCLONE_TIMER + bool + default y + depends on X86_SUMMIT || X86_GENERICARCH + +config ES7000_CLUSTERED_APIC + bool + default y + depends on SMP && X86_ES7000 && MPENTIUMIII + +if !X86_ELAN + +choice + prompt "Processor family" + default M686 + +config M386 + bool "386" + ---help--- + This is the processor type of your CPU. This information is used for + optimizing purposes. In order to compile a kernel that can run on + all x86 CPU types (albeit not optimally fast), you can specify + "386" here. + + The kernel will not necessarily run on earlier architectures than + the one you have chosen, e.g. a Pentium optimized kernel will run on + a PPro, but not necessarily on a i486. + + Here are the settings recommended for greatest speed: + - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI + 486DLC/DLC2, UMC 486SX-S and NexGen Nx586. Only "386" kernels + will run on a 386 class machine. + - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or + SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. + - "586" for generic Pentium CPUs lacking the TSC + (time stamp counter) register. + - "Pentium-Classic" for the Intel Pentium. + - "Pentium-MMX" for the Intel Pentium MMX. + - "Pentium-Pro" for the Intel Pentium Pro. + - "Pentium-II" for the Intel Pentium II or pre-Coppermine Celeron. + - "Pentium-III" for the Intel Pentium III or Coppermine Celeron. + - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. + - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). + - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). + - "Crusoe" for the Transmeta Crusoe series. + - "Efficeon" for the Transmeta Efficeon series. + - "Winchip-C6" for original IDT Winchip. + - "Winchip-2" for IDT Winchip 2. + - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. 
+ - "MediaGX/Geode" for Cyrix MediaGX aka Geode. + - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. + - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). + + If you don't know what to do, choose "386". + +config M486 + bool "486" + help + Select this for a 486 series processor, either Intel or one of the + compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, + DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or + U5S. + +config M586 + bool "586/K5/5x86/6x86/6x86MX" + help + Select this for a 586 or 686 series processor such as the AMD K5, + the Cyrix 5x86, 6x86 and 6x86MX. This choice does not + assume the RDTSC (Read Time Stamp Counter) instruction. + +config M586TSC + bool "Pentium-Classic" + help + Select this for a Pentium Classic processor with the RDTSC (Read + Time Stamp Counter) instruction for benchmarking. + +config M586MMX + bool "Pentium-MMX" + help + Select this for a Pentium with the MMX graphics/multimedia + extended instructions. + +config M686 + bool "Pentium-Pro" + help + Select this for Intel Pentium Pro chips. This enables the use of + Pentium Pro extended instructions, and disables the init-time guard + against the f00f bug found in earlier Pentiums. + +config MPENTIUMII + bool "Pentium-II/Celeron(pre-Coppermine)" + help + Select this for Intel chips based on the Pentium-II and + pre-Coppermine Celeron core. This option enables an unaligned + copy optimization, compiles the kernel with optimization flags + tailored for the chip, and applies any applicable Pentium Pro + optimizations. + +config MPENTIUMIII + bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon" + help + Select this for Intel chips based on the Pentium-III and + Celeron-Coppermine core. This option enables use of some + extended prefetch instructions in addition to the Pentium II + extensions. + +config MPENTIUMM + bool "Pentium M" + help + Select this for Intel Pentium M (not Pentium-4 M) + notebook chips. 
+ +config MPENTIUM4 + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" + help + Select this for Intel Pentium 4 chips. This includes the + Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M + (not Pentium M) chips. This option enables compile flags + optimized for the chip, uses the correct cache shift, and + applies any applicable Pentium III optimizations. + +config MK6 + bool "K6/K6-II/K6-III" + help + Select this for an AMD K6-family processor. Enables use of + some extended instructions, and passes appropriate optimization + flags to GCC. + +config MK7 + bool "Athlon/Duron/K7" + help + Select this for an AMD Athlon K7-family processor. Enables use of + some extended instructions, and passes appropriate optimization + flags to GCC. + +config MK8 + bool "Opteron/Athlon64/Hammer/K8" + help + Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables + use of some extended instructions, and passes appropriate optimization + flags to GCC. + +config MCRUSOE + bool "Crusoe" + help + Select this for a Transmeta Crusoe processor. Treats the processor + like a 586 with TSC, and sets some GCC optimization flags (like a + Pentium Pro with no alignment requirements). + +config MEFFICEON + bool "Efficeon" + help + Select this for a Transmeta Efficeon processor. + +config MWINCHIPC6 + bool "Winchip-C6" + help + Select this for an IDT Winchip C6 chip. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. + +config MWINCHIP2 + bool "Winchip-2" + help + Select this for an IDT Winchip-2. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. + +config MWINCHIP3D + bool "Winchip-2A/Winchip-3" + help + Select this for an IDT Winchip-2A or 3. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. Also enable out of order memory + stores for this CPU, which can increase performance of some + operations. 
+ +config MGEODE + bool "MediaGX/Geode" + help + Select this for a Cyrix MediaGX aka Geode chip. Linux and GCC + treat this chip as a 586TSC with some extended instructions + and alignment requirements. + +config MCYRIXIII + bool "CyrixIII/VIA-C3" + help + Select this for a Cyrix III or C3 chip. Presently Linux and GCC + treat this chip as a generic 586. Whilst the CPU is 686 class, + it lacks the cmov extension which gcc assumes is present when + generating 686 code. + Note that Nehemiah (Model 9) and above will not boot with this + kernel due to them lacking the 3DNow! instructions used in earlier + incarnations of the CPU. + +config MVIAC3_2 + bool "VIA C3-2 (Nehemiah)" + help + Select this for a VIA C3 "Nehemiah". Selecting this enables usage + of SSE and tells gcc to treat the CPU as a 686. + Note, this kernel will not boot on older (pre model 9) C3s. + +endchoice + +config X86_GENERIC + bool "Generic x86 support" + help + Instead of just including optimizations for the selected + x86 variant (e.g. PII, Crusoe or Athlon), include some more + generic optimizations as well. This will make the kernel + perform better on x86 CPUs other than that selected. + + This is really intended for distributors who need more + generic optimizations. 
+ +endif + +# +# Define implied options from the CPU selection here +# +config X86_CMPXCHG + bool + depends on !M386 + default y + +config X86_XADD + bool + depends on !M386 + default y + +config X86_L1_CACHE_SHIFT + int + default "7" if MPENTIUM4 || X86_GENERIC + default "4" if X86_ELAN || M486 || M386 + default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE + default "6" if MK7 || MK8 || MPENTIUMM + +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + depends on !M386 + default y + +config GENERIC_CALIBRATE_DELAY + bool + default y + +config X86_PPRO_FENCE + bool + depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODE + default y + +config X86_F00F_BUG + bool + depends on M586MMX || M586TSC || M586 || M486 || M386 + default y + +config X86_WP_WORKS_OK + bool + depends on !M386 + default y + +config X86_INVLPG + bool + depends on !M386 + default y + +config X86_BSWAP + bool + depends on !M386 + default y + +config X86_POPAD_OK + bool + depends on !M386 + default y + +config X86_ALIGNMENT_16 + bool + depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODE + default y + +config X86_GOOD_APIC + bool + depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON + default y + +config X86_INTEL_USERCOPY + bool + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON + default y + +config X86_USE_PPRO_CHECKSUM + bool + depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON + default y + +config X86_USE_3DNOW + bool + depends on MCYRIXIII || MK7 + default y + 
+config X86_OOSTORE + bool + depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MGEODE) && MTRR + default y + +config HPET_TIMER + bool "HPET Timer Support" + help + This enables the use of the HPET for the kernel's internal timer. + HPET is the next generation timer replacing legacy 8254s. + You can safely choose Y here. However, HPET will only be + activated if the platform and the BIOS support this feature. + Otherwise the 8254 will be used for timing services. + + Choose N to continue using the legacy 8254 timer. + +config HPET_EMULATE_RTC + bool "Provide RTC interrupt" + depends on HPET_TIMER && RTC=y + +config SMP + bool "Symmetric multi-processing support" + ---help--- + This enables support for systems with more than one CPU. If you have + a system with only one CPU, like most personal computers, say N. If + you have a system with more than one CPU, say Y. + + If you say N here, the kernel will run on single and multiprocessor + machines, but will use only one CPU of a multiprocessor machine. If + you say Y here, the kernel will run on many, but not all, + singleprocessor machines. On a singleprocessor machine, the kernel + will run faster if you say N here. + + Note that if you say Y here and choose architecture "586" or + "Pentium" under "Processor family", the kernel will not work on 486 + architectures. Similarly, multiprocessor kernels for the "PPro" + architecture may not work on all Pentium based boards. + + People using multiprocessor machines who say Y here should also say + Y to "Enhanced Real Time Clock Support", below. The "Advanced Power + Management" code will be disabled if you say Y here. + + See also the <file:Documentation/smp.txt>, + <file:Documentation/i386/IO-APIC.txt>, + <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at + <http://www.tldp.org/docs.html#howto>. + + If you don't know what to do here, say N. 
+ +config NR_CPUS + int "Maximum number of CPUs (2-255)" + range 2 255 + depends on SMP + default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 + default "8" + help + This allows you to specify the maximum number of CPUs which this + kernel will support. The maximum supported value is 255 and the + minimum value which makes sense is 2. + + This is purely to save memory - each supported CPU adds + approximately eight kilobytes to the kernel image. + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default off + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + +config PREEMPT + bool "Preemptible Kernel" + help + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load. + + Say Y here if you are building a kernel for a desktop, embedded + or real-time system. Say N if you are unsure. + +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on PREEMPT + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. + +config X86_UP_APIC + bool "Local APIC support on uniprocessors" + depends on !SMP && !(X86_VISWS || X86_VOYAGER) + help + A local APIC (Advanced Programmable Interrupt Controller) is an + integrated interrupt controller in the CPU. If you have a single-CPU + system which has a processor with a local APIC, you can say Y here to + enable and use it. 
If you say Y here even though your machine doesn't + have a local APIC, then the kernel will still run with no slowdown at + all. The local APIC supports CPU-generated self-interrupts (timer, + performance counters), and the NMI watchdog which detects hard + lockups. + +config X86_UP_IOAPIC + bool "IO-APIC support on uniprocessors" + depends on X86_UP_APIC + help + An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an + SMP-capable replacement for PC-style interrupt controllers. Most + SMP systems and many recent uniprocessor systems have one. + + If you have a single-CPU system with an IO-APIC, you can say Y here + to use it. If you say Y here even though your machine doesn't have + an IO-APIC, then the kernel will still run with no slowdown at all. + +config X86_LOCAL_APIC + bool + depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) + default y + +config X86_IO_APIC + bool + depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) + default y + +config X86_VISWS_APIC + bool + depends on X86_VISWS + default y + +config X86_TSC + bool + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODE) && !X86_NUMAQ + default y + +config X86_MCE + bool "Machine Check Exception" + depends on !X86_VOYAGER + ---help--- + Machine Check Exception support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, component failure). + The action the kernel takes depends on the severity of the problem, + ranging from a warning message on the console, to halting the machine. + Your processor must be a Pentium or newer to support this - check the + flags in /proc/cpuinfo for mce. Note that some older Pentium systems + have a design flaw which leads to false MCE events - hence MCE is + disabled on all P5 processors, unless explicitly enabled with "mce" + as a boot argument. 
Similarly, if MCE is built in and creates a + problem on some new non-standard machine, you can boot with "nomce" + to disable it. MCE support simply ignores non-MCE processors like + the 386 and 486, so nearly everyone can say Y here. + +config X86_MCE_NONFATAL + tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" + depends on X86_MCE + help + Enabling this feature starts a timer that triggers every 5 seconds which + will look at the machine check registers to see if anything happened. + Non-fatal problems automatically get corrected (but still logged). + Disable this if you don't want to see these messages. + Seeing the messages this option prints out may be indicative of dying hardware, + or out-of-spec (ie, overclocked) hardware. + This option only does something on certain CPUs. + (AMD Athlon/Duron and Intel Pentium 4) + +config X86_MCE_P4THERMAL + bool "check for P4 thermal throttling interrupt." + depends on X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS + help + Enabling this feature will cause a message to be printed when the P4 + enters thermal throttling. + +config TOSHIBA + tristate "Toshiba Laptop support" + ---help--- + This adds a driver to safely access the System Management Mode of + the CPU on Toshiba portables with a genuine Toshiba BIOS. It does + not work on models with a Phoenix BIOS. The System Management Mode + is used to set the BIOS and power saving options on Toshiba portables. + + For information on utilities to make use of this driver see the + Toshiba Linux utilities web site at: + <http://www.buzzard.org.uk/toshiba/>. + + Say Y if you intend to run this kernel on a Toshiba portable. + Say N otherwise. + +config I8K + tristate "Dell laptop support" + ---help--- + This adds a driver to safely access the System Management Mode + of the CPU on the Dell Inspiron 8000. The System Management Mode + is used to read cpu temperature and cooling fan status and to + control the fans on the I8K portables. 
+ + This driver has been tested only on the Inspiron 8000 but it may + also work with other Dell laptops. You can force loading on other + models by passing the parameter `force=1' to the module. Use at + your own risk. + + For information on utilities to make use of this driver see the + I8K Linux utilities web site at: + <http://people.debian.org/~dz/i8k/> + + Say Y if you intend to run this kernel on a Dell Inspiron 8000. + Say N otherwise. + +config MICROCODE + tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + ---help--- + If you say Y here and also to "/dev file system support" in the + 'File systems' section, you will be able to update the microcode on + Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, + Pentium III, Pentium 4, Xeon etc. You will obviously need the + actual microcode binary data itself which is not shipped with the + Linux kernel. + + For latest news and information on obtaining all the required + ingredients for this driver, check: + <http://www.urbanmyth.org/microcode/>. + + To compile this driver as a module, choose M here: the + module will be called microcode. + +config X86_MSR + tristate "/dev/cpu/*/msr - Model-specific register support" + help + This device gives privileged processes access to the x86 + Model-Specific Registers (MSRs). It is a character device with + major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. + MSR accesses are directed to a specific CPU on multi-processor + systems. + +config X86_CPUID + tristate "/dev/cpu/*/cpuid - CPU information support" + help + This device gives processes access to the x86 CPUID instruction to + be executed on a specific processor. It is a character device + with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to + /dev/cpu/31/cpuid. 
+ +source "drivers/firmware/Kconfig" + +choice + prompt "High Memory Support" + default NOHIGHMEM + +config NOHIGHMEM + bool "off" + ---help--- + Linux can use up to 64 Gigabytes of physical memory on x86 systems. + However, the address space of 32-bit x86 processors is only 4 + Gigabytes large. That means that, if you have a large amount of + physical memory, not all of it can be "permanently mapped" by the + kernel. The physical memory that's not permanently mapped is called + "high memory". + + If you are compiling a kernel which will never run on a machine with + more than 1 Gigabyte total physical RAM, answer "off" here (default + choice and suitable for most users). This will result in a "3GB/1GB" + split: 3GB are mapped so that each process sees a 3GB virtual memory + space and the remaining part of the 4GB virtual memory space is used + by the kernel to permanently map as much physical memory as + possible. + + If the machine has between 1 and 4 Gigabytes physical RAM, then + answer "4GB" here. + + If more than 4 Gigabytes is used then answer "64GB" here. This + selection turns Intel PAE (Physical Address Extension) mode on. + PAE implements 3-level paging on IA32 processors. PAE is fully + supported by Linux, PAE mode is implemented on all recent Intel + processors (Pentium Pro and better). NOTE: If you say "64GB" here, + then the kernel will not boot on CPUs that don't support PAE! + + The actual amount of total physical memory will either be + auto detected or can be forced by using a kernel command line option + such as "mem=256M". (Try "man bootparam" or see the documentation of + your boot loader (lilo or loadlin) about how to pass options to the + kernel at boot time.) + + If unsure, say "off". + +config HIGHMEM4G + bool "4GB" + help + Select this if you have a 32-bit processor and between 1 and 4 + gigabytes of physical RAM. 
+ +config HIGHMEM64G + bool "64GB" + help + Select this if you have a 32-bit processor and more than 4 + gigabytes of physical RAM. + +endchoice + +config HIGHMEM + bool + depends on HIGHMEM64G || HIGHMEM4G + default y + +config X86_PAE + bool + depends on HIGHMEM64G + default y + +# Common NUMA Features +config NUMA + bool "Numa Memory Allocation and Scheduler Support" + depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) + default n if X86_PC + default y if (X86_NUMAQ || X86_SUMMIT) + +# Need comments to help the hapless user trying to turn on NUMA support +comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" + depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) + +comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" + depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI) + +config DISCONTIGMEM + bool + depends on NUMA + default y + +config HAVE_ARCH_BOOTMEM_NODE + bool + depends on NUMA + default y + +config HAVE_MEMORY_PRESENT + bool + depends on DISCONTIGMEM + default y + +config NEED_NODE_MEMMAP_SIZE + bool + depends on DISCONTIGMEM + default y + +config HIGHPTE + bool "Allocate 3rd-level pagetables from highmem" + depends on HIGHMEM4G || HIGHMEM64G + help + The VM uses one page table entry for each page of physical memory. + For systems with a lot of RAM, this can be wasteful of precious + low memory. Setting this option will put user-space page table + entries in high memory. + +config MATH_EMULATION + bool "Math emulation" + ---help--- + Linux can emulate a math coprocessor (used for floating point + operations) if you don't have one. 486DX and Pentium processors have + a math coprocessor built in, 486SX and 386 do not, unless you added + a 487DX or 387, respectively. (The messages during boot time can + give you some hints here ["man dmesg"].) Everyone needs either a + coprocessor or this emulation. 
+ + If you don't have a math coprocessor, you need to say Y here; if you + say Y here even though you have a coprocessor, the coprocessor will + be used nevertheless. (This behavior can be changed with the kernel + command line option "no387", which comes handy if your coprocessor + is broken. Try "man bootparam" or see the documentation of your boot + loader (lilo or loadlin) about how to pass options to the kernel at + boot time.) This means that it is a good idea to say Y here if you + intend to use this kernel on different machines. + + More information about the internals of the Linux math coprocessor + emulation can be found in <file:arch/i386/math-emu/README>. + + If you are not sure, say Y; apart from resulting in a 66 KB bigger + kernel, it won't hurt. + +config MTRR + bool "MTRR (Memory Type Range Register) support" + ---help--- + On Intel P6 family processors (Pentium Pro, Pentium II and later) + the Memory Type Range Registers (MTRRs) may be used to control + processor access to memory ranges. This is most useful if you have + a video (VGA) card on a PCI or AGP bus. Enabling write-combining + allows bus write transfers to be combined into a larger transfer + before bursting over the PCI/AGP bus. This can increase performance + of image write operations 2.5 times or more. Saying Y here creates a + /proc/mtrr file which may be used to manipulate your processor's + MTRRs. Typically the X server should use this. + + This code has a reasonably generic interface so that similar + control registers on other processors can be easily supported + as well: + + The Cyrix 6x86, 6x86MX and M II processors have Address Range + Registers (ARRs) which provide a similar functionality to MTRRs. For + these, the ARRs are used to emulate the MTRRs. + The AMD K6-2 (stepping 8 and above) and K6-3 processors have two + MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing + write-combining. 
All of these processors are supported by this code + and it makes sense to say Y here if you have one of them. + + Saying Y here also fixes a problem with buggy SMP BIOSes which only + set the MTRRs for the boot CPU and not for the secondary CPUs. This + can lead to all sorts of problems, so it's good to say Y here. + + You can safely say Y even if your machine doesn't have MTRRs, you'll + just add about 9 KB to your kernel. + + See <file:Documentation/mtrr.txt> for more information. + +config EFI + bool "Boot from EFI support (EXPERIMENTAL)" + depends on ACPI + default n + ---help--- + This enables the kernel to boot on EFI platforms using + system configuration information passed to it from the firmware. + This also enables the kernel to use any EFI runtime services that are + available (such as the EFI variable services). + + This option is only useful on systems that have EFI firmware + and will result in a kernel image that is ~8k larger. In addition, + you must use the latest ELILO loader available at + <http://elilo.sourceforge.net> in order to take advantage of + kernel initialization using EFI information (neither GRUB nor LILO know + anything about EFI). However, even with this option, the resultant + kernel should continue to boot on existing non-EFI platforms. + +config IRQBALANCE + bool "Enable kernel irq balancing" + depends on SMP && X86_IO_APIC + default y + help + The default yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. + +config HAVE_DEC_LOCK + bool + depends on (SMP || PREEMPT) && X86_CMPXCHG + default y + +# turning this on wastes a bunch of space. +# Summit needs it only when NUMA is on +config BOOT_IOREMAP + bool + depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) + default y + +config REGPARM + bool "Use register arguments (EXPERIMENTAL)" + depends on EXPERIMENTAL + default n + help + Compile the kernel with -mregparm=3. 
This uses a different ABI + and passes the first three arguments of a function call in registers. + This will probably break binary only modules. + + This feature is only enabled for gcc-3.0 and later - earlier compilers + generate incorrect output with certain kernel constructs when + -mregparm=3 is used. + +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + +endmenu + + +menu "Power management options (ACPI, APM)" + depends on !X86_VOYAGER + +source kernel/power/Kconfig + +source "drivers/acpi/Kconfig" + +menu "APM (Advanced Power Management) BIOS Support" +depends on PM && !X86_VISWS + +config APM + tristate "APM (Advanced Power Management) BIOS support" + depends on PM + ---help--- + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + If you select "Y" here, you can disable actual use of the APM + BIOS by passing the "apm=off" option to the kernel at boot time. + + Note that the APM support is almost completely disabled for + machines with more than one CPU. 
+ + In order to use APM, you will need supporting software. For location + and more information, read <file:Documentation/pm.txt> and the + Battery Powered Linux mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. + + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. + + This driver does not support the TI 4000M TravelMate and the ACER + 486/DX4/75 because they don't have compliant BIOSes. Many "green" + desktop machines also don't have compliant BIOSes, and this driver + may cause those machines to panic during the boot phase. + + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). + + Some other things you should try when experiencing seemingly random, + "weird" problems: + + 1) make sure that you have enough swap space and that it is + enabled. + 2) pass the "no-hlt" option to the kernel + 3) switch on floating point emulation in the kernel and pass + the "no387" option to the kernel + 4) pass the "floppy=nodma" option to the kernel + 5) pass the "mem=4M" option to the kernel (thereby disabling + all but the first 4 MB of RAM) + 6) make sure that the CPU is not over clocked. + 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/> + 8) disable the cache from your BIOS settings + 9) install a fan for the video card or exchange video RAM + 10) install a better fan for the CPU + 11) exchange RAM chips + 12) exchange the motherboard. + + To compile this driver as a module, choose M here: the + module will be called apm. + +config APM_IGNORE_USER_SUSPEND + bool "Ignore USER SUSPEND" + depends on APM + help + This option will ignore USER SUSPEND requests. 
On machines with a + compliant APM BIOS, you want to say N. However, on the NEC Versa M + series notebooks, it is necessary to say Y because of a BIOS bug. + +config APM_DO_ENABLE + bool "Enable PM at boot time" + depends on APM + ---help--- + Enable APM features at boot time. From page 36 of the APM BIOS + specification: "When disabled, the APM BIOS does not automatically + power manage devices, enter the Standby State, enter the Suspend + State, or take power saving steps in response to CPU Idle calls." + This driver will make CPU Idle calls when Linux is idle (unless this + feature is turned off -- see "Do CPU IDLE calls", below). This + should always save battery power, but more complicated APM features + will be dependent on your BIOS implementation. You may need to turn + this option off if your computer hangs at boot time when using APM + support, or if it beeps continuously instead of suspending. Turn + this off if you have a NEC UltraLite Versa 33/C or a Toshiba + T400CDT. This is off by default since most machines do fine without + this feature. + +config APM_CPU_IDLE + bool "Make CPU Idle calls when idle" + depends on APM + help + Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. + On some machines, this can activate improved power savings, such as + a slowed CPU clock rate, when the machine is idle. These idle calls + are made after the idle loop has run for some length of time (e.g., + 333 mS). On some machines, this will cause a hang at boot time or + whenever the CPU becomes idle. (On machines with more than one CPU, + this option does nothing.) + +config APM_DISPLAY_BLANK + bool "Enable console blanking using APM" + depends on APM + help + Enable console blanking using the APM. Some laptops can use this to + turn off the LCD backlight when the screen blanker of the Linux + virtual console blanks the screen. 
Note that this is only used by + the virtual console screen blanker, and won't turn off the backlight + when using the X Window system. This also doesn't have anything to + do with your VESA-compliant power-saving monitor. Further, this + option doesn't work for all laptops -- it might not turn off your + backlight at all, or it might print a lot of errors to the console, + especially if you are using gpm. + +config APM_RTC_IS_GMT + bool "RTC stores time in GMT" + depends on APM + help + Say Y here if your RTC (Real Time Clock a.k.a. hardware clock) + stores the time in GMT (Greenwich Mean Time). Say N if your RTC + stores localtime. + + It is in fact recommended to store GMT in your RTC, because then you + don't have to worry about daylight savings time changes. The only + reason not to use GMT in your RTC is if you also run a broken OS + that doesn't understand GMT. + +config APM_ALLOW_INTS + bool "Allow interrupts during APM BIOS calls" + depends on APM + help + Normally we disable external interrupts while we are making calls to + the APM BIOS as a measure to lessen the effects of a badly behaving + BIOS implementation. The BIOS should reenable interrupts if it + needs to. Unfortunately, some BIOSes do not -- especially those in + many of the newer IBM Thinkpads. If you experience hangs when you + suspend, try setting this to Y. Otherwise, say N. + +config APM_REAL_MODE_POWER_OFF + bool "Use real mode APM BIOS call to power off" + depends on APM + help + Use real mode APM BIOS calls to switch off the computer. This is + a work-around for a number of buggy BIOSes. Switch this option on if + your computer crashes instead of powering off properly. + +endmenu + +source "arch/i386/kernel/cpu/cpufreq/Kconfig" + +endmenu + +menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" + +config PCI + bool "PCI support" if !X86_VISWS + depends on !X86_VOYAGER + default y if X86_VISWS + help + Find out whether you have a PCI motherboard. PCI is the name of a + bus system, i.e. 
the way the CPU talks to the other stuff inside + your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or + VESA. If you have PCI, say Y, otherwise N. + + The PCI-HOWTO, available from + <http://www.tldp.org/docs.html#howto>, contains valuable + information about which PCI hardware does work under Linux and which + doesn't. + +choice + prompt "PCI access mode" + depends on PCI && !X86_VISWS + default PCI_GOANY + ---help--- + On PCI systems, the BIOS can be used to detect the PCI devices and + determine their configuration. However, some old PCI motherboards + have BIOS bugs and may crash if this is done. Also, some embedded + PCI-based systems don't have any BIOS at all. Linux can also try to + detect the PCI hardware directly without using the BIOS. + + With this option, you can specify how Linux should detect the + PCI devices. If you choose "BIOS", the BIOS will be used, + if you choose "Direct", the BIOS won't be used, and if you + choose "MMConfig", then PCI Express MMCONFIG will be used. + If you choose "Any", the kernel will try MMCONFIG, then the + direct access method and falls back to the BIOS if that doesn't + work. If unsure, go with the default, which is "Any". + +config PCI_GOBIOS + bool "BIOS" + +config PCI_GOMMCONFIG + bool "MMConfig" + +config PCI_GODIRECT + bool "Direct" + +config PCI_GOANY + bool "Any" + +endchoice + +config PCI_BIOS + bool + depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) + default y + +config PCI_DIRECT + bool + depends on PCI && ((PCI_GODIRECT || PCI_GOANY) || X86_VISWS) + default y + +config PCI_MMCONFIG + bool + depends on PCI && (PCI_GOMMCONFIG || (PCI_GOANY && ACPI)) + select ACPI_BOOT + default y + +source "drivers/pci/pcie/Kconfig" + +source "drivers/pci/Kconfig" + +config ISA + bool "ISA support" + depends on !(X86_VOYAGER || X86_VISWS) + help + Find out whether you have ISA slots on your motherboard. ISA is the + name of a bus system, i.e. the way the CPU talks to the other stuff + inside your box. 
Other bus systems are PCI, EISA, MicroChannel + (MCA) or VESA. ISA is an older system, now being displaced by PCI; + newer boards don't support it. If you have ISA, say Y, otherwise N. + +config EISA + bool "EISA support" + depends on ISA + ---help--- + The Extended Industry Standard Architecture (EISA) bus was + developed as an open alternative to the IBM MicroChannel bus. + + The EISA bus provided some of the features of the IBM MicroChannel + bus while maintaining backward compatibility with cards made for + the older ISA bus. The EISA bus saw limited use between 1988 and + 1995 when it was made obsolete by the PCI bus. + + Say Y here if you are building a kernel for an EISA-based machine. + + Otherwise, say N. + +source "drivers/eisa/Kconfig" + +config MCA + bool "MCA support" if !(X86_VISWS || X86_VOYAGER) + default y if X86_VOYAGER + help + MicroChannel Architecture is found in some IBM PS/2 machines and + laptops. It is a bus system similar to PCI or ISA. See + <file:Documentation/mca.txt> (and especially the web page given + there) before attempting to build an MCA bus kernel. + +source "drivers/mca/Kconfig" + +config SCx200 + tristate "NatSemi SCx200 support" + depends on !X86_VOYAGER + help + This provides basic support for the National Semiconductor SCx200 + processor. Right now this is just a driver for the GPIO pins. + + If you don't know what to do here, say N. + + This support is also available as a module. If compiled as a + module, it will be called scx200. 
+ +source "drivers/pcmcia/Kconfig" + +source "drivers/pci/hotplug/Kconfig" + +endmenu + +menu "Executable file formats" + +source "fs/Kconfig.binfmt" + +endmenu + +source "drivers/Kconfig" + +source "fs/Kconfig" + +source "arch/i386/oprofile/Kconfig" + +source "arch/i386/Kconfig.debug" + +source "security/Kconfig" + +source "crypto/Kconfig" + +source "lib/Kconfig" + +# +# Use the generic interrupt handling code in kernel/irq/: +# +config GENERIC_HARDIRQS + bool + default y + +config GENERIC_IRQ_PROBE + bool + default y + +config X86_SMP + bool + depends on SMP && !X86_VOYAGER + default y + +config X86_HT + bool + depends on SMP && !(X86_VISWS || X86_VOYAGER) + default y + +config X86_BIOS_REBOOT + bool + depends on !(X86_VISWS || X86_VOYAGER) + default y + +config X86_TRAMPOLINE + bool + depends on X86_SMP || (X86_VOYAGER && SMP) + default y + +config PC + bool + depends on X86 && !EMBEDDED + default y diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug new file mode 100644 index 000000000000..bfb2064f7104 --- /dev/null +++ b/arch/i386/Kconfig.debug @@ -0,0 +1,72 @@ +menu "Kernel hacking" + +source "lib/Kconfig.debug" + +config EARLY_PRINTK + bool "Early printk" if EMBEDDED && DEBUG_KERNEL + default y + help + Write kernel log output directly into the VGA buffer or to a serial + port. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server. You should normally say N here, + unless you want to debug such a crash. + +config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL + +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback.
Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". + +config DEBUG_STACK_USAGE + bool "Stack utilization instrumentation" + depends on DEBUG_KERNEL + help + Enables the display of the minimum amount of free stack which each + task has ever had available in the sysrq-T and sysrq-P debug output. + + This option will slow down process creation somewhat. + +comment "Page alloc debug is incompatible with Software Suspend on i386" + depends on DEBUG_KERNEL && SOFTWARE_SUSPEND + +config DEBUG_PAGEALLOC + bool "Page alloc debugging" + depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND + help + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + +config 4KSTACKS + bool "Use 4Kb for kernel stacks instead of 8Kb" + depends on DEBUG_KERNEL + help + If you say Y here the kernel will use a 4Kb stacksize for the + kernel stack attached to each process/thread. This facilitates + running more threads on a system and also reduces the pressure + on the VM subsystem for higher order allocations. This option + will also use IRQ stacks to compensate for the reduced stackspace. + +config X86_FIND_SMP_CONFIG + bool + depends on X86_LOCAL_APIC || X86_VOYAGER + default y + +config X86_MPPARSE + bool + depends on X86_LOCAL_APIC && !X86_VISWS + default y + +endmenu diff --git a/arch/i386/Makefile b/arch/i386/Makefile new file mode 100644 index 000000000000..314c7146e9bf --- /dev/null +++ b/arch/i386/Makefile @@ -0,0 +1,173 @@ +# +# i386/Makefile +# +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. Remember to have actions +# for "archclean" cleaning up for this architecture. +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details.
+# +# Copyright (C) 1994 by Linus Torvalds +# +# 19990713 Artur Skawina <skawina@geocities.com> +# Added '-march' and '-mpreferred-stack-boundary' support +# +# Kianusch Sayah Karadji <kianusch@sk-tech.net> +# Added support for GEODE CPU + +LDFLAGS := -m elf_i386 +OBJCOPYFLAGS := -O binary -R .note -R .comment -S +LDFLAGS_vmlinux := +CHECKFLAGS += -D__i386__ + +CFLAGS += -pipe -msoft-float + +# prevent gcc from keeping the stack 16 byte aligned +CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + +align := $(cc-option-align) +cflags-$(CONFIG_M386) += -march=i386 +cflags-$(CONFIG_M486) += -march=i486 +cflags-$(CONFIG_M586) += -march=i586 +cflags-$(CONFIG_M586TSC) += -march=i586 +cflags-$(CONFIG_M586MMX) += $(call cc-option,-march=pentium-mmx,-march=i586) +cflags-$(CONFIG_M686) += -march=i686 +cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call cc-option,-mtune=pentium2) +cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call cc-option,-mtune=pentium3) +cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call cc-option,-mtune=pentium3) +cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call cc-option,-mtune=pentium4) +cflags-$(CONFIG_MK6) += -march=k6 +# Please note, that patches that add -march=athlon-xp and friends are pointless. +# They make zero difference whatsoever to performance at this time.
+cflags-$(CONFIG_MK7) += $(call cc-option,-march=athlon,-march=i686 $(align)-functions=4) +cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,$(call cc-option,-march=athlon,-march=i686 $(align)-functions=4)) +cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 +cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call cc-option,-mtune=pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 +cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) +cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) +cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) +cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 +cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) + +# AMD Elan support +cflags-$(CONFIG_X86_ELAN) += -march=i486 + +# MediaGX aka Geode support +cflags-$(CONFIG_MGEODE) += $(call cc-option,-march=pentium-mmx,-march=i586) + +# -mregparm=3 works ok on gcc-3.0 and later +# +GCC_VERSION := $(call cc-version) +cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;) + +# Disable unit-at-a-time mode, it makes gcc use a lot more stack +# due to the lack of sharing of stack slots.
+CFLAGS += $(call cc-option,-fno-unit-at-a-time) + +CFLAGS += $(cflags-y) + +# Default subarch .c files +mcore-y := mach-default + +# Voyager subarch support +mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-i386/mach-voyager +mcore-$(CONFIG_X86_VOYAGER) := mach-voyager + +# VISWS subarch support +mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-i386/mach-visws +mcore-$(CONFIG_X86_VISWS) := mach-visws + +# NUMAQ subarch support +mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-i386/mach-numaq +mcore-$(CONFIG_X86_NUMAQ) := mach-default + +# BIGSMP subarch support +mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-i386/mach-bigsmp +mcore-$(CONFIG_X86_BIGSMP) := mach-default + +#Summit subarch support +mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit +mcore-$(CONFIG_X86_SUMMIT) := mach-default + +# generic subarchitecture +mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic +mcore-$(CONFIG_X86_GENERICARCH) := mach-default +core-$(CONFIG_X86_GENERICARCH) += arch/i386/mach-generic/ + +# ES7000 subarch support +mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 +mcore-$(CONFIG_X86_ES7000) := mach-default +core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ + +# default subarch .h files +mflags-y += -Iinclude/asm-i386/mach-default + +head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o + +libs-y += arch/i386/lib/ +core-y += arch/i386/kernel/ \ + arch/i386/mm/ \ + arch/i386/$(mcore-y)/ \ + arch/i386/crypto/ +drivers-$(CONFIG_MATH_EMULATION) += arch/i386/math-emu/ +drivers-$(CONFIG_PCI) += arch/i386/pci/ +# must be linked after kernel/ +drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/ +drivers-$(CONFIG_PM) += arch/i386/power/ + +CFLAGS += $(mflags-y) +AFLAGS += $(mflags-y) + +boot := arch/i386/boot + +.PHONY: zImage bzImage compressed zlilo bzlilo \ + zdisk bzdisk fdimage fdimage144 fdimage288 install + +all: bzImage + +# KBUILD_IMAGE specify target image being built + KBUILD_IMAGE := $(boot)/bzImage +zImage zlilo zdisk: KBUILD_IMAGE := 
arch/i386/boot/zImage + +zImage bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + +compressed: zImage + +zlilo bzlilo: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo + +zdisk bzdisk: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk + +fdimage fdimage144 fdimage288: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ + +install: + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ + +prepare: include/asm-$(ARCH)/asm_offsets.h +CLEAN_FILES += include/asm-$(ARCH)/asm_offsets.h + +arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \ + include/config/MARKER + +include/asm-$(ARCH)/asm_offsets.h: arch/$(ARCH)/kernel/asm-offsets.s + $(call filechk,gen-asm-offsets) + +archclean: + $(Q)$(MAKE) $(clean)=arch/i386/boot + +define archhelp + echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)' + echo ' install - Install kernel using' + echo ' (your) ~/bin/installkernel or' + echo ' (distribution) /sbin/installkernel or' + echo ' install to $$(INSTALL_PATH) and run lilo' + echo ' bzdisk - Create a boot floppy in /dev/fd0' + echo ' fdimage - Create a boot floppy image' +endef + +CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile new file mode 100644 index 000000000000..aa7064a75ee6 --- /dev/null +++ b/arch/i386/boot/Makefile @@ -0,0 +1,104 @@ +# +# arch/i386/boot/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1994 by Linus Torvalds +# + +# ROOT_DEV specifies the default root-device when making the image. +# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case +# the default of FLOPPY is used by 'build'. 
+ +ROOT_DEV := CURRENT + +# If you want to preset the SVGA mode, uncomment the next line and +# set SVGA_MODE to whatever number you want. +# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. +# The number is the same as you would ordinarily press at bootup. + +SVGA_MODE := -DSVGA_MODE=NORMAL_VGA + +# If you want the RAM disk device, define this to be the size in blocks. + +#RAMDISK := -DRAMDISK=512 + +targets := vmlinux.bin bootsect bootsect.o setup setup.o \ + zImage bzImage +subdir- := compressed + +hostprogs-y := tools/build + +HOSTCFLAGS_build.o := $(LINUXINCLUDE) + +# --------------------------------------------------------------------------- + +$(obj)/zImage: IMAGE_OFFSET := 0x1000 +$(obj)/zImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) +$(obj)/bzImage: IMAGE_OFFSET := 0x100000 +$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: BUILDFLAGS := -b + +quiet_cmd_image = BUILD $@ +cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \ + $(obj)/vmlinux.bin $(ROOT_DEV) > $@ + +$(obj)/zImage $(obj)/bzImage: $(obj)/bootsect $(obj)/setup \ + $(obj)/vmlinux.bin $(obj)/tools/build FORCE + $(call if_changed,image) + @echo 'Kernel: $@ is ready' + +$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE + $(call if_changed,objcopy) + +LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary +LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext + +$(obj)/setup $(obj)/bootsect: %: %.o FORCE + $(call if_changed,ld) + +$(obj)/compressed/vmlinux: FORCE + $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ + +# Set this if you want to pass append arguments to the zdisk/fdimage kernel +FDARGS = + +$(obj)/mtools.conf: $(src)/mtools.conf.in + sed -e 's|@OBJ@|$(obj)|g' < $< > $@ + +# This requires write access to /dev/fd0 +zdisk: $(BOOTIMAGE) $(obj)/mtools.conf + MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync + syslinux /dev/fd0 ; sync + echo 'default linux 
$(FDARGS)' | \ + MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg + MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync + +# These require being root or having syslinux 2.02 or higher installed +fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf + dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 + MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync + syslinux $(obj)/fdimage ; sync + echo 'default linux $(FDARGS)' | \ + MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg + MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync + +fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf + dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 + MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync + syslinux $(obj)/fdimage ; sync + echo 'default linux $(FDARGS)' | \ + MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg + MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync + +zlilo: $(BOOTIMAGE) + if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi + if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi + cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz + cp System.map $(INSTALL_PATH)/ + if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi + +install: $(BOOTIMAGE) + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $< System.map "$(INSTALL_PATH)" diff --git a/arch/i386/boot/bootsect.S b/arch/i386/boot/bootsect.S new file mode 100644 index 000000000000..ba9fe14db6a9 --- /dev/null +++ b/arch/i386/boot/bootsect.S @@ -0,0 +1,98 @@ +/* + * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds + * + * modified by Drew Eckhardt + * modified by Bruce Evans (bde) + * modified by Chris Noe (May 1999) (as86 -> gas) + * gutted by H. Peter Anvin (Jan 2003) + * + * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment + * addresses must be multiplied by 16 to obtain their respective linear + * addresses. 
To avoid confusion, linear addresses are written using leading + * hex while segment addresses are written as segment:offset. + * + */ + +#include <asm/boot.h> + +SETUPSECTS = 4 /* default nr of setup-sectors */ +BOOTSEG = 0x07C0 /* original address of boot-sector */ +INITSEG = DEF_INITSEG /* we move boot here - out of the way */ +SETUPSEG = DEF_SETUPSEG /* setup starts here */ +SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ +SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ + /* to be loaded */ +ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ +SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ + +#ifndef SVGA_MODE +#define SVGA_MODE ASK_VGA +#endif + +#ifndef RAMDISK +#define RAMDISK 0 +#endif + +#ifndef ROOT_RDONLY +#define ROOT_RDONLY 1 +#endif + +.code16 +.text + +.global _start +_start: + + # Normalize the start address + jmpl $BOOTSEG, $start2 + +start2: + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + movw $0x7c00, %sp + sti + cld + + movw $bugger_off_msg, %si + +msg_loop: + lodsb + andb %al, %al + jz die + movb $0xe, %ah + movw $7, %bx + int $0x10 + jmp msg_loop + +die: + # Allow the user to press a key, then reboot + xorw %ax, %ax + int $0x16 + int $0x19 + + # int 0x19 should never return. In case it does anyway, + # invoke the BIOS reset code... + ljmp $0xf000,$0xfff0 + + +bugger_off_msg: + .ascii "Direct booting from floppy is no longer supported.\r\n" + .ascii "Please use a boot loader program instead.\r\n" + .ascii "\n" + .ascii "Remove disk and press any key to reboot . . 
.\r\n" + .byte 0 + + + # Kernel attributes; used by setup + + .org 497 +setup_sects: .byte SETUPSECTS +root_flags: .word ROOT_RDONLY +syssize: .word SYSSIZE +swap_dev: .word SWAP_DEV +ram_size: .word RAMDISK +vid_mode: .word SVGA_MODE +root_dev: .word ROOT_DEV +boot_flag: .word 0xAA55 diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile new file mode 100644 index 000000000000..258ea95224f6 --- /dev/null +++ b/arch/i386/boot/compressed/Makefile @@ -0,0 +1,25 @@ +# +# linux/arch/i386/boot/compressed/Makefile +# +# create a compressed vmlinux image from the original vmlinux +# + +targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o +EXTRA_AFLAGS := -traditional + +LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 + +$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE + $(call if_changed,ld) + @: + +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) + +LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T + +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE + $(call if_changed,ld) diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S new file mode 100644 index 000000000000..c5e80b69e7d4 --- /dev/null +++ b/arch/i386/boot/compressed/head.S @@ -0,0 +1,128 @@ +/* + * linux/boot/head.S + * + * Copyright (C) 1991, 1992, 1993 Linus Torvalds + */ + +/* + * head.S contains the 32-bit startup code. + * + * NOTE!!! Startup happens at absolute address 0x00001000, which is also where + * the page directory will exist. The startup code will be overwritten by + * the page directory. [According to comments etc elsewhere on a compressed + * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] + * + * Page 0 is deliberately kept safe, since System Management Mode code in + * laptops may need to access the BIOS data stored there. 
This is also + * useful for future device drivers that access the BIOS via VM86 + * mode. + */ + +/* + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + */ +.text + +#include <linux/linkage.h> +#include <asm/segment.h> + + .globl startup_32 + +startup_32: + cld + cli + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + + lss stack_start,%esp + xorl %eax,%eax +1: incl %eax # check that A20 really IS enabled + movl %eax,0x000000 # loop forever if it isn't + cmpl %eax,0x100000 + je 1b + +/* + * Initialize eflags. Some BIOS's leave bits like NT set. This would + * confuse the debugger if this code is traced. + * XXX - best to initialize before switching to protected mode. + */ + pushl $0 + popfl +/* + * Clear BSS + */ + xorl %eax,%eax + movl $_edata,%edi + movl $_end,%ecx + subl %edi,%ecx + cld + rep + stosb +/* + * Do the decompression, and jump to the new kernel.. + */ + subl $16,%esp # place for structure on the stack + movl %esp,%eax + pushl %esi # real mode pointer as second arg + pushl %eax # address of structure as first arg + call decompress_kernel + orl %eax,%eax + jnz 3f + popl %esi # discard address + popl %esi # real mode pointer + xorl %ebx,%ebx + ljmp $(__BOOT_CS), $0x100000 + +/* + * We come here, if we were loaded high. + * We need to move the move-in-place routine down to 0x1000 + * and then start it with the buffer addresses in registers, + * which we got from the stack.
+ */ +3: + movl $move_routine_start,%esi + movl $0x1000,%edi + movl $move_routine_end,%ecx + subl %esi,%ecx + addl $3,%ecx + shrl $2,%ecx + cld + rep + movsl + + popl %esi # discard the address + popl %ebx # real mode pointer + popl %esi # low_buffer_start + popl %ecx # lcount + popl %edx # high_buffer_start + popl %eax # hcount + movl $0x100000,%edi + cli # make sure we don't get interrupted + ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine + +/* + * Routine (template) for moving the decompressed kernel in place, + * if we were high loaded. This _must_ be PIC code! + */ +move_routine_start: + movl %ecx,%ebp + shrl $2,%ecx + rep + movsl + movl %ebp,%ecx + andl $3,%ecx + rep + movsb + movl %edx,%esi + movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 + addl $3,%ecx + shrl $2,%ecx + rep + movsl + movl %ebx,%esi # Restore setup pointer + xorl %ebx,%ebx + ljmp $(__BOOT_CS), $0x100000 +move_routine_end: diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c new file mode 100644 index 000000000000..fa67045234a3 --- /dev/null +++ b/arch/i386/boot/compressed/misc.c @@ -0,0 +1,382 @@ +/* + * misc.c + * + * This is a collection of several routines from gzip-1.0.3 + * adapted for Linux. + * + * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 + * puts by Nick Holloway 1993, better puts by Martin Mares 1995 + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + */ + +#include <linux/linkage.h> +#include <linux/vmalloc.h> +#include <linux/tty.h> +#include <video/edid.h> +#include <asm/io.h> + +/* + * gzip declarations + */ + +#define OF(args) args +#define STATIC static + +#undef memset +#undef memcpy + +/* + * Why do we do this? Don't ask me.. + * + * Incomprehensible are the ways of bootloaders.
+ */ +static void* memset(void *, int, size_t); +static void* memcpy(void *, __const void *, size_t); +#define memzero(s, n) memset ((s), 0, (n)) + +typedef unsigned char uch; +typedef unsigned short ush; +typedef unsigned long ulg; + +#define WSIZE 0x8000 /* Window size must be at least 32k, */ + /* and a power of two */ + +static uch *inbuf; /* input buffer */ +static uch window[WSIZE]; /* Sliding window buffer */ + +static unsigned insize = 0; /* valid bytes in inbuf */ +static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ +static unsigned outcnt = 0; /* bytes in output buffer */ + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ +#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ +#define RESERVED 0xC0 /* bit 6,7: reserved */ + +#define get_byte() (inptr < insize ? 
inbuf[inptr++] : fill_inbuf()) + +/* Diagnostic functions */ +#ifdef DEBUG +# define Assert(cond,msg) {if(!(cond)) error(msg);} +# define Trace(x) fprintf x +# define Tracev(x) {if (verbose) fprintf x ;} +# define Tracevv(x) {if (verbose>1) fprintf x ;} +# define Tracec(c,x) {if (verbose && (c)) fprintf x ;} +# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} +#else +# define Assert(cond,msg) +# define Trace(x) +# define Tracev(x) +# define Tracevv(x) +# define Tracec(c,x) +# define Tracecv(c,x) +#endif + +static int fill_inbuf(void); +static void flush_window(void); +static void error(char *m); +static void gzip_mark(void **); +static void gzip_release(void **); + +/* + * This is set up by the setup-routine at boot-time + */ +static unsigned char *real_mode; /* Pointer to real-mode data */ + +#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) +#ifndef STANDARD_MEMORY_BIOS_CALL +#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) +#endif +#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) + +extern char input_data[]; +extern int input_len; + +static long bytes_out = 0; +static uch *output_data; +static unsigned long output_ptr = 0; + +static void *malloc(int size); +static void free(void *where); + +static void putstr(const char *); + +extern int end; +static long free_mem_ptr = (long)&end; +static long free_mem_end_ptr; + +#define INPLACE_MOVE_ROUTINE 0x1000 +#define LOW_BUFFER_START 0x2000 +#define LOW_BUFFER_MAX 0x90000 +#define HEAP_SIZE 0x3000 +static unsigned int low_buffer_end, low_buffer_size; +static int high_loaded =0; +static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; + +static char *vidmem = (char *)0xb8000; +static int vidport; +static int lines, cols; + +#ifdef CONFIG_X86_NUMAQ +static void * xquad_portio = NULL; +#endif + +#include "../../../../lib/inflate.c" + +static void *malloc(int size) +{ + void *p; + + if (size <0) error("Malloc error"); + if (free_mem_ptr <= 0) error("Memory error"); + 
+ free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ + + p = (void *)free_mem_ptr; + free_mem_ptr += size; + + if (free_mem_ptr >= free_mem_end_ptr) + error("Out of memory"); + + return p; +} + +/* Intentionally empty: single blocks are never handed back. In the code visible here the heap is only rewound as a whole, through gzip_mark() and gzip_release(). */ +static void free(void *where) +{ /* Don't care */ +} + +/* Save the current heap position (free_mem_ptr) in *ptr. */ +static void gzip_mark(void **ptr) +{ + *ptr = (void *) free_mem_ptr; +} + +/* Reset the heap position to the value stored in *ptr, which releases everything malloc() handed out after that value was saved. */ +static void gzip_release(void **ptr) +{ + free_mem_ptr = (long) *ptr; +} + +/* Scroll the text screen up by one row. Every cell takes 2 bytes in vidmem: rows 1..lines-1 are copied over rows 0..lines-2, then the even (character) bytes of the last row are set to a blank. The odd bytes, presumably the attribute bytes, keep their old values. */ +static void scroll(void) +{ + int i; + + memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); + for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 ) + vidmem[i] = ' '; +} + +/* Print the NUL-terminated string s starting at the position stored in RM_SCREEN_INFO.orig_x/orig_y. A newline moves to column 0 of the next row; any other character is stored into vidmem and the column wraps at cols. scroll() is called whenever the row would reach lines. Afterwards the new position is written back to RM_SCREEN_INFO and sent to the video hardware: the values 14 and 15 are written to vidport, each followed on vidport+1 by the high or low byte of the cell index (pos / 2). */ +static void putstr(const char *s) +{ + int x,y,pos; + char c; + + x = RM_SCREEN_INFO.orig_x; + y = RM_SCREEN_INFO.orig_y; + + while ( ( c = *s++ ) != '\0' ) { + if ( c == '\n' ) { + x = 0; + if ( ++y >= lines ) { + scroll(); + y--; + } + } else { + vidmem [ ( x + cols * y ) * 2 ] = c; + if ( ++x >= cols ) { + x = 0; + if ( ++y >= lines ) { + scroll(); + y--; + } + } + } + } + + RM_SCREEN_INFO.orig_x = x; + RM_SCREEN_INFO.orig_y = y; + + pos = (x + cols * y) * 2; /* Update cursor position */ + outb_p(14, vidport); + outb_p(0xff & (pos >> 9), vidport+1); + outb_p(15, vidport); + outb_p(0xff & (pos >> 1), vidport+1); +} + +/* Byte-wise fill: store c into each of the n bytes starting at s and return s. Note that the index i is a signed int compared against the size_t n. */ +static void* memset(void* s, int c, size_t n) +{ + int i; + char *ss = (char*)s; + + for (i=0;i<n;i++) ss[i] = c; + return s; +} + +/* Byte-wise copy of __n bytes from __src to __dest, lowest address first; returns __dest. Because it copies upwards, the overlapping move done by scroll() (destination below source) comes out right. */ +static void* memcpy(void* __dest, __const void* __src, + size_t __n) +{ + int i; + char *d = (char *)__dest, *s = (char *)__src; + + for (i=0;i<__n;i++) d[i] = s[i]; + return __dest; +} + +/* =========================================================================== + * Fill the input buffer. This is called only when the buffer is empty + * and at least one byte is really needed. 
+ */ +static int fill_inbuf(void) +{ + /* The complete input (input_len bytes at input_data) is handed out in one go below, so a non-zero insize here means it has already been used up. */ + if (insize != 0) { + error("ran out of input data"); + } + + inbuf = input_data; + insize = input_len; + inptr = 1; + return inbuf[0]; +} + +/* =========================================================================== + * Write the output window window[0..outcnt-1] and update crc and bytes_out. + * (Used for the decompressed data only.) + */ +static void flush_window_low(void) +{ + ulg c = crc; /* temporary variable */ + unsigned n; + uch *in, *out, ch; + + in = window; + out = &output_data[output_ptr]; + for (n = 0; n < outcnt; n++) { + ch = *out++ = *in++; + c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + output_ptr += (ulg)outcnt; + outcnt = 0; +} + +/* Same job as flush_window_low(), used when high_loaded is set: bytes are stored through output_data itself, and as soon as output_data reaches low_buffer_end it is redirected to high_buffer_start. output_ptr is not touched here. */ +static void flush_window_high(void) +{ + ulg c = crc; /* temporary variable */ + unsigned n; + uch *in, ch; + in = window; + for (n = 0; n < outcnt; n++) { + ch = *output_data++ = *in++; + if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; + c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + outcnt = 0; +} + +/* Write out the window using the variant selected by high_loaded. */ +static void flush_window(void) +{ + if (high_loaded) flush_window_high(); + else flush_window_low(); +} + +/* Print the message x between blank lines, followed by the System halted notice, then loop forever. Does not return. */ +static void error(char *x) +{ + putstr("\n\n"); + putstr(x); + putstr("\n\n -- System halted"); + + while(1); /* Halt */ +} + +/* Stack area of STACK_SIZE longs. stack_start pairs the address just past the end of user_stack with the __BOOT_DS selector; it is presumably loaded as the initial stack pointer by startup code that is not visible here. */ +#define STACK_SIZE (4096) + +long user_stack [STACK_SIZE]; + +struct { + long * a; + short b; + } stack_start = { & user_stack [STACK_SIZE] , __BOOT_DS }; + +/* Output setup for the low-loaded case, chosen in decompress_kernel() when free_mem_ptr is below 0x100000: halts unless the reported memory value is at least 1024, then decompresses straight to address 0x100000 and lets the heap grow up to real_mode. */ +static void setup_normal_output_buffer(void) +{ +#ifdef STANDARD_MEMORY_BIOS_CALL + if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); +#else + if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? 
RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); +#endif + output_data = (char *)0x100000; /* Points to 1M */ + free_mem_end_ptr = (long)real_mode; +} + +/* Filled in through the mv argument of decompress_kernel() when running high: start addresses of the low and the high output buffer plus the number of bytes in each (lcount, hcount). hcount is 0 when the high part does not have to be moved. */ +struct moveparams { + uch *low_buffer_start; int lcount; + uch *high_buffer_start; int hcount; +}; + +/* Output setup for the high-loaded case (free_mem_ptr at or above 0x100000). Halts unless the reported memory value is at least 3*1024. Output starts in the low buffer, which runs from LOW_BUFFER_START to low_buffer_end, i.e. the smaller of real_mode and LOW_BUFFER_MAX rounded down to a 4 KB boundary, and continues at high_buffer_start. high_buffer_start is initially end + HEAP_SIZE, and that address also becomes the heap limit. If 0x100000 + low_buffer_size lies above it, the high buffer is placed there instead and hcount is set to 0 (no move needed, per the existing comment); otherwise hcount is preset to -1 and later replaced by close_output_buffer_if_we_run_high(). */ +static void setup_output_buffer_if_we_run_high(struct moveparams *mv) +{ + high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); +#ifdef STANDARD_MEMORY_BIOS_CALL + if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); +#else + if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < + (3*1024)) + error("Less than 4MB of memory"); +#endif + mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START; + low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX + ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; + low_buffer_size = low_buffer_end - LOW_BUFFER_START; + high_loaded = 1; + free_mem_end_ptr = (long)high_buffer_start; + if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(0x100000 + low_buffer_size); + mv->hcount = 0; /* say: we need not to move high_buffer */ + } + else mv->hcount = -1; + mv->high_buffer_start = high_buffer_start; +} + +/* Record the final byte counts in *mv: lcount is the number of bytes in the low buffer; hcount becomes the number of bytes beyond it unless it was preset to 0, and is forced to 0 when everything fitted into the low buffer. */ +static void close_output_buffer_if_we_run_high(struct moveparams *mv) +{ + if (bytes_out > low_buffer_size) { + mv->lcount = low_buffer_size; + if (mv->hcount) + mv->hcount = bytes_out - low_buffer_size; + } else { + mv->lcount = bytes_out; + mv->hcount = 0; + } +} + + +/* Entry point of the decompressor; its caller is not visible in this file. Saves the real-mode data pointer, selects video memory and port (0xb0000/0x3b4 when orig_video_mode is 7, else 0xb8000/0x3d4), reads the screen dimensions, picks the output-buffer strategy from free_mem_ptr, runs gunzip() and returns high_loaded, which is non-zero when the high-load path was taken and *mv was filled in. */ +asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode) +{ + real_mode = rmode; + + if (RM_SCREEN_INFO.orig_video_mode == 7) { + vidmem = (char *) 0xb0000; + vidport = 0x3b4; + } else { + vidmem = (char *) 0xb8000; + vidport = 0x3d4; + } + + lines = RM_SCREEN_INFO.orig_video_lines; + cols = RM_SCREEN_INFO.orig_video_cols; + + if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); + else setup_output_buffer_if_we_run_high(mv); + + makecrc(); + putstr("Uncompressing Linux... 
"); + gunzip(); + putstr("Ok, booting the kernel.\n"); + if (high_loaded) close_output_buffer_if_we_run_high(mv); + return high_loaded; +} diff --git a/arch/i386/boot/compressed/vmlinux.scr b/arch/i386/boot/compressed/vmlinux.scr new file mode 100644 index 000000000000..1ed9d791f863 --- /dev/null +++ b/arch/i386/boot/compressed/vmlinux.scr @@ -0,0 +1,9 @@ +SECTIONS +{ + .data : { + input_len = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + input_data_end = .; + } +} diff --git a/arch/i386/boot/edd.S b/arch/i386/boot/edd.S new file mode 100644 index 000000000000..027d6b354ffb --- /dev/null +++ b/arch/i386/boot/edd.S @@ -0,0 +1,176 @@ +/* + * BIOS Enhanced Disk Drive support + * Copyright (C) 2002, 2003, 2004 Dell, Inc. + * by Matt Domsch <Matt_Domsch@dell.com> October 2002 + * conformant to T13 Committee www.t13.org + * projects 1572D, 1484D, 1386D, 1226DT + * disk signature read by Matt Domsch <Matt_Domsch@dell.com> + * and Andrew Wilks <Andrew_Wilks@dell.com> September 2003, June 2004 + * legacy CHS retreival by Patrick J. 
LoPresti <patl@users.sourceforge.net> + * March 2004 + * Command line option parsing, Matt Domsch, November 2004 + */ + +#include <linux/edd.h> +#include <asm/setup.h> + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) + movb $0, (EDD_MBR_SIG_NR_BUF) + movb $0, (EDDNR) + +# Check the command line for two options: +# edd=of disables EDD completely (edd=off) +# edd=sk skips the MBR test (edd=skipmbr) + pushl %esi + cmpl $0, %cs:cmd_line_ptr + jz done_cl + movl %cs:(cmd_line_ptr), %esi +# ds:esi has the pointer to the command line now + movl $(COMMAND_LINE_SIZE-7), %ecx +# loop through kernel command line one byte at a time +cl_loop: + cmpl $EDD_CL_EQUALS, (%si) + jz found_edd_equals + incl %esi + loop cl_loop + jmp done_cl +found_edd_equals: +# only looking at first two characters after equals + addl $4, %esi + cmpw $EDD_CL_OFF, (%si) # edd=of + jz do_edd_off + cmpw $EDD_CL_SKIP, (%si) # edd=sk + jz do_edd_skipmbr + jmp done_cl +do_edd_skipmbr: + popl %esi + jmp edd_start +do_edd_off: + popl %esi + jmp edd_done +done_cl: + popl %esi + + +# Read the first sector of each BIOS disk device and store the 4-byte signature +edd_mbr_sig_start: + movb $0x80, %dl # from device 80 + movw $EDD_MBR_SIG_BUF, %bx # store buffer ptr in bx +edd_mbr_sig_read: + movl $0xFFFFFFFF, %eax + movl %eax, (%bx) # assume failure + pushw %bx + movb $READ_SECTORS, %ah + movb $1, %al # read 1 sector + movb $0, %dh # at head 0 + movw $1, %cx # cylinder 0, sector 1 (BIOS sector numbers are 1-based) + pushw %es + pushw %ds + popw %es + movw $EDDBUF, %bx # disk's data goes into EDDBUF + pushw %dx # work around buggy BIOSes + stc # work around buggy BIOSes + int $0x13 + sti # work around buggy BIOSes + popw %dx + popw %es + popw %bx + jc edd_mbr_sig_done # on failure, we're done. 
+ movl (EDDBUF+EDD_MBR_SIG_OFFSET), %eax # read sig out of the MBR + movl %eax, (%bx) # store success + incb (EDD_MBR_SIG_NR_BUF) # note that we stored something + incb %dl # increment to next device + addw $4, %bx # increment sig buffer ptr + cmpb $EDD_MBR_SIG_MAX, (EDD_MBR_SIG_NR_BUF) # Out of space? + jb edd_mbr_sig_read # keep looping +edd_mbr_sig_done: + +# Do the BIOS Enhanced Disk Drive calls +# This consists of three calls: +# int 13h ah=41h "Check Extensions Present" +# int 13h ah=48h "Get Device Parameters" +# int 13h ah=08h "Legacy Get Device Parameters" +# +# A buffer of size EDDMAXNR*(EDDEXTSIZE+EDDPARMSIZE) is reserved for our use +# in the boot_params at EDDBUF. The first four bytes of which are +# used to store the device number, interface support map and version +# results from fn41. The next four bytes are used to store the legacy +# cylinders, heads, and sectors from fn08. The following 74 bytes are used to +# store the results from fn48. Starting from device 80h, fn41, fn48, then fn08 +# are called and their results stored in EDDBUF+n*(EDDEXTSIZE+EDDPARMSIZE). +# Then the pointer is incremented to store the data for the next call. +# This repeats until either a device doesn't exist, or until EDDMAXNR +# devices have been stored. +# The one tricky part is that ds:si always points EDDEXTSIZE bytes into +# the structure, and the fn41 and fn08 results are stored at offsets +# from there. This removes the need to increment the pointer for +# every store, and leaves it ready for the fn48 call. +# A second one-byte buffer, EDDNR, in the boot_params stores +# the number of BIOS devices which exist, up to EDDMAXNR. +# In setup.c, copy_edd() stores both boot_params buffers away +# for later use, as they would get overwritten otherwise. 
+# This code is sensitive to the size of the structs in edd.h +edd_start: + # %ds points to the bootsector + # result buffer for fn48 + movw $EDDBUF+EDDEXTSIZE, %si # in ds:si, fn41 results + # kept just before that + movb $0x80, %dl # BIOS device 0x80 + +edd_check_ext: + movb $CHECKEXTENSIONSPRESENT, %ah # Function 41 + movw $EDDMAGIC1, %bx # magic + int $0x13 # make the call + jc edd_done # no more BIOS devices + + cmpw $EDDMAGIC2, %bx # is magic right? + jne edd_next # nope, next... + + movb %dl, %ds:-8(%si) # store device number + movb %ah, %ds:-7(%si) # store version + movw %cx, %ds:-6(%si) # store extensions + incb (EDDNR) # note that we stored something + +edd_get_device_params: + movw $EDDPARMSIZE, %ds:(%si) # put size + movw $0x0, %ds:2(%si) # work around buggy BIOSes + movb $GETDEVICEPARAMETERS, %ah # Function 48 + int $0x13 # make the call + # Don't check for fail return + # it doesn't matter. +edd_get_legacy_chs: + xorw %ax, %ax + movw %ax, %ds:-4(%si) + movw %ax, %ds:-2(%si) + # Ralf Brown's Interrupt List says to set ES:DI to + # 0000h:0000h "to guard against BIOS bugs" + pushw %es + movw %ax, %es + movw %ax, %di + pushw %dx # legacy call clobbers %dl + movb $LEGACYGETDEVICEPARAMETERS, %ah # Function 08 + int $0x13 # make the call + jc edd_legacy_done # failed + movb %cl, %al # Low 6 bits are max + andb $0x3F, %al # sector number + movb %al, %ds:-1(%si) # Record max sect + movb %dh, %ds:-2(%si) # Record max head number + movb %ch, %al # Low 8 bits of max cyl + shr $6, %cl + movb %cl, %ah # High 2 bits of max cyl + movw %ax, %ds:-4(%si) + +edd_legacy_done: + popw %dx + popw %es + movw %si, %ax # increment si + addw $EDDPARMSIZE+EDDEXTSIZE, %ax + movw %ax, %si + +edd_next: + incb %dl # increment to next device + cmpb $EDDMAXNR, (EDDNR) # Out of space? 
+ jb edd_check_ext # keep looping + +edd_done: +#endif diff --git a/arch/i386/boot/install.sh b/arch/i386/boot/install.sh new file mode 100644 index 000000000000..90f2452b3b9e --- /dev/null +++ b/arch/i386/boot/install.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# +# arch/i386/boot/install.sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1995 by Linus Torvalds +# +# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin +# +# "make install" script for i386 architecture +# +# Arguments: +# $1 - kernel version +# $2 - kernel image file +# $3 - kernel map file +# $4 - default install path (blank if root directory) +# + +# User may have a custom install script + +if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi +if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi + +# Default install - same as make zlilo + +if [ -f $4/vmlinuz ]; then + mv $4/vmlinuz $4/vmlinuz.old +fi + +if [ -f $4/System.map ]; then + mv $4/System.map $4/System.old +fi + +cat $2 > $4/vmlinuz +cp $3 $4/System.map + +if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi diff --git a/arch/i386/boot/mtools.conf.in b/arch/i386/boot/mtools.conf.in new file mode 100644 index 000000000000..efd6d2490c1d --- /dev/null +++ b/arch/i386/boot/mtools.conf.in @@ -0,0 +1,17 @@ +# +# mtools configuration file for "make (b)zdisk" +# + +# Actual floppy drive +drive a: + file="/dev/fd0" + +# 1.44 MB floppy disk image +drive v: + file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter + +# 2.88 MB floppy disk image (mostly for virtual uses) +drive w: + file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter + + diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S new file mode 100644 index 000000000000..a934ab32bf8e --- /dev/null +++ b/arch/i386/boot/setup.S @@ -0,0 +1,1028 @@ +/* + * setup.S Copyright (C) 1991, 
1992 Linus Torvalds + * + * setup.s is responsible for getting the system data from the BIOS, + * and putting them into the appropriate places in system memory. + * Both setup.s and system have been loaded by the bootblock. + * + * This code asks the bios for memory/disk/other parameters, and + * puts them in a "safe" place: 0x90000-0x901FF, i.e. where the + * boot-block used to be. It is then up to the protected mode + * system to read them from there before the area is overwritten + * for buffer-blocks. + * + * Move PS/2 aux init code to psaux.c + * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92 + * + * some changes and additional features by Christoph Niemann, + * March 1993/June 1994 (Christoph.Niemann@linux.org) + * + * add APM BIOS checking by Stephen Rothwell, May 1994 + * (sfr@canb.auug.org.au) + * + * High load stuff, initrd support and position independency + * by Hans Lermen & Werner Almesberger, February 1996 + * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch> + * + * Video handling moved to video.S by Martin Mares, March 1996 + * <mj@k332.feld.cvut.cz> + * + * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david + * parsons) to avoid loadlin confusion, July 1997 + * + * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. + * <stiker@northlink.com> + * + * Fix to work around buggy BIOSes which don't use carry bit correctly + * and/or report extended memory in CX/DX for e801h memory size detection + * call. As a result the kernel got wrong figures. The int15/e801h docs + * from Ralf Brown's Interrupt List seem to indicate AX/BX should be used + * anyway. So to avoid breaking many machines (presumably there was a reason + * to originally use CX/DX instead of AX/BX), we do a kludge to see + * if CX/DX have been changed in the e801 call and, if not, use AX/BX. + * Michael Miller, April 2001 <michaelm@mjmm.org> + * + * New A20 code ported from SYSLINUX by H. Peter Anvin. 
AMD Elan bugfixes + * by Robert Schwebel, December 2001 <robert@schwebel.de> + */ + +#include <linux/config.h> +#include <asm/segment.h> +#include <linux/version.h> +#include <linux/compile.h> +#include <asm/boot.h> +#include <asm/e820.h> +#include <asm/page.h> + +/* Signature words to ensure LILO loaded us right */ +#define SIG1 0xAA55 +#define SIG2 0x5A5A + +INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way +SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536). +SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment + # ... and the former contents of CS + +DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020 + +.code16 +.globl begtext, begdata, begbss, endtext, enddata, endbss + +.text +begtext: +.data +begdata: +.bss +begbss: +.text + +start: + jmp trampoline + +# This is the setup header, and it must start at %cs:2 (old 0x9020:2) + + .ascii "HdrS" # header signature + .word 0x0203 # header version number (>= 0x0105) + # or else old loadlin-1.5 will fail) +realmode_swtch: .word 0, 0 # default_switch, SETUPSEG +start_sys_seg: .word SYSSEG + .word kernel_version # pointing to kernel version string + # above section of header is compatible + # with loadlin-1.5 (header v1.5). Don't + # change it. + +type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, + # Bootlin, SYSLX, bootsect...) + # See Documentation/i386/boot.txt for + # assigned ids + +# flags, unused bits must be zero (RFU) bit within loadflags +loadflags: +LOADED_HIGH = 1 # If set, the kernel is loaded high +CAN_USE_HEAP = 0x80 # If set, the loader also has set + # heap_end_ptr to tell how much + # space behind setup.S can be used for + # heap purposes. + # Only the loader knows what is free +#ifndef __BIG_KERNEL__ + .byte 0 +#else + .byte LOADED_HIGH +#endif + +setup_move_size: .word 0x8000 # size to move, when setup is not + # loaded at 0x90000. We will move setup + # to 0x90000 then just before jumping + # into the kernel. 
However, only the + # loader knows how much data behind + # us also needs to be loaded. + +code32_start: # here loaders can put a different + # start address for 32-bit code. +#ifndef __BIG_KERNEL__ + .long 0x1000 # 0x1000 = default for zImage +#else + .long 0x100000 # 0x100000 = default for big kernel +#endif + +ramdisk_image: .long 0 # address of loaded ramdisk image + # Here the loader puts the 32-bit + # address where it loaded the image. + # This only will be read by the kernel. + +ramdisk_size: .long 0 # its size in bytes + +bootsect_kludge: + .long 0 # obsolete + +heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later) + # space from here (exclusive) down to + # end of setup code can be used by setup + # for local heap purposes. + +pad1: .word 0 +cmd_line_ptr: .long 0 # (Header version 0x0202 or later) + # If nonzero, a 32-bit pointer + # to the kernel command line. + # The command line should be + # located between the start of + # setup and the end of low + # memory (0xa0000), or it may + # get overwritten before it + # gets read. If this field is + # used, there is no longer + # anything magical about the + # 0x90000 segment; the setup + # can be located anywhere in + # low memory 0x10000 or higher. + +ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff + # (Header version 0x0203 or later) + # The highest safe address for + # the contents of an initrd + +trampoline: call start_of_setup + .align 16 + # The offset at this point is 0x240 + .space (0x7ff-0x240+1) # E820 & EDD space (ending at 0x7ff) +# End of setup header ##################################################### + +start_of_setup: +# Bootlin depends on this being done early + movw $0x01500, %ax + movb $0x81, %dl + int $0x13 + +#ifdef SAFE_RESET_DISK_CONTROLLER +# Reset the disk controller. 
+ movw $0x0000, %ax + movb $0x80, %dl + int $0x13 +#endif + +# Set %ds = %cs, we know that SETUPSEG = %cs at this point + movw %cs, %ax # aka SETUPSEG + movw %ax, %ds +# Check signature at end of setup + cmpw $SIG1, setup_sig1 + jne bad_sig + + cmpw $SIG2, setup_sig2 + jne bad_sig + + jmp good_sig1 + +# Routine to print asciiz string at ds:si +prtstr: + lodsb + andb %al, %al + jz fin + + call prtchr + jmp prtstr + +fin: ret + +# Space printing +prtsp2: call prtspc # Print double space +prtspc: movb $0x20, %al # Print single space (note: fall-thru) + +# Part of above routine, this one just prints ascii al +prtchr: pushw %ax + pushw %cx + movw $7,%bx + movw $0x01, %cx + movb $0x0e, %ah + int $0x10 + popw %cx + popw %ax + ret + +beep: movb $0x07, %al + jmp prtchr + +no_sig_mess: .string "No setup signature found ..." + +good_sig1: + jmp good_sig + +# We now have to find the rest of the setup code/data +bad_sig: + movw %cs, %ax # SETUPSEG + subw $DELTA_INITSEG, %ax # INITSEG + movw %ax, %ds + xorb %bh, %bh + movb (497), %bl # get setup sect from bootsect + subw $4, %bx # LILO loads 4 sectors of setup + shlw $8, %bx # convert to words (1sect=2^8 words) + movw %bx, %cx + shrw $3, %bx # convert to segment + addw $SYSSEG, %bx + movw %bx, %cs:start_sys_seg +# Move rest of setup code/data to here + movw $2048, %di # four sectors loaded by LILO + subw %si, %si + pushw %cs + popw %es + movw $SYSSEG, %ax + movw %ax, %ds + rep + movsw + movw %cs, %ax # aka SETUPSEG + movw %ax, %ds + cmpw $SIG1, setup_sig1 + jne no_sig + + cmpw $SIG2, setup_sig2 + jne no_sig + + jmp good_sig + +no_sig: + lea no_sig_mess, %si + call prtstr + +no_sig_loop: + hlt + jmp no_sig_loop + +good_sig: + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %ds +# Check if an old loader tries to load a big-kernel + testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel? + jz loader_ok # No, no danger for old loaders. 
+ + cmpb $0, %cs:type_of_loader # Do we have a loader that + # can deal with us? + jnz loader_ok # Yes, continue. + + pushw %cs # No, we have an old loader, + popw %ds # die. + lea loader_panic_mess, %si + call prtstr + + jmp no_sig_loop + +loader_panic_mess: .string "Wrong loader, giving up..." + +loader_ok: +# Get memory size (extended mem, kB) + + xorl %eax, %eax + movl %eax, (0x1e0) +#ifndef STANDARD_MEMORY_BIOS_CALL + movb %al, (E820NR) +# Try three different memory detection schemes. First, try +# e820h, which lets us assemble a memory map, then try e801h, +# which returns a 32-bit memory size, and finally 88h, which +# returns 0-64m + +# method E820H: +# the memory map from hell. e820h returns memory classified into +# a whole bunch of different types, and allows memory holes and +# everything. We scan through this memory map and build a list +# of the first 32 memory areas, which we return at [E820MAP]. +# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification. + +#define SMAP 0x534d4150 + +meme820: + xorl %ebx, %ebx # continuation counter + movw $E820MAP, %di # point into the whitelist + # so we can have the bios + # directly write into it. + +jmpe820: + movl $0x0000e820, %eax # e820, upper word zeroed + movl $SMAP, %edx # ascii 'SMAP' + movl $20, %ecx # size of the e820rec + pushw %ds # data record. + popw %es + int $0x15 # make the call + jc bail820 # fall to e801 if it fails + + cmpl $SMAP, %eax # check the return is `SMAP' + jne bail820 # fall to e801 if it fails + +# cmpl $1, 16(%di) # is this usable memory? +# jne again820 + + # If this is usable memory, we save it by simply advancing %di by + # sizeof(e820rec). 
+ # +good820: + movb (E820NR), %al # up to 32 entries + cmpb $E820MAX, %al + jnl bail820 + + incb (E820NR) + movw %di, %ax + addw $20, %ax + movw %ax, %di +again820: + cmpl $0, %ebx # check to see if + jne jmpe820 # %ebx is set to EOF +bail820: + + +# method E801H: +# memory size is in 1k chunksizes, to avoid confusing loadlin. +# we store the 0xe801 memory size in a completely different place, +# because it will most likely be longer than 16 bits. +# (use 1e0 because that's what Larry Augustine uses in his +# alternative new memory detection scheme, and it's sensible +# to write everything into the same place.) + +meme801: + stc # fix to work around buggy + xorw %cx,%cx # BIOSes which dont clear/set + xorw %dx,%dx # carry on pass/error of + # e801h memory size call + # or merely pass cx,dx though + # without changing them. + movw $0xe801, %ax + int $0x15 + jc mem88 + + cmpw $0x0, %cx # Kludge to handle BIOSes + jne e801usecxdx # which report their extended + cmpw $0x0, %dx # memory in AX/BX rather than + jne e801usecxdx # CX/DX. The spec I have read + movw %ax, %cx # seems to indicate AX/BX + movw %bx, %dx # are more reasonable anyway... + +e801usecxdx: + andl $0xffff, %edx # clear sign extend + shll $6, %edx # and go from 64k to 1k chunks + movl %edx, (0x1e0) # store extended memory size + andl $0xffff, %ecx # clear sign extend + addl %ecx, (0x1e0) # and add lower memory into + # total size. + +# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or +# 64mb, depending on the bios) in ax. +mem88: + +#endif + movb $0x88, %ah + int $0x15 + movw %ax, (2) + +# Set the keyboard repeat rate to the max + movw $0x0305, %ax + xorw %bx, %bx + int $0x16 + +# Check for video adapter and its parameters and allow the +# user to browse video modes. + call video # NOTE: we need %ds pointing + # to bootsector + +# Get hd0 data... 
+ xorw %ax, %ax + movw %ax, %ds + ldsw (4 * 0x41), %si + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + pushw %ax + movw %ax, %es + movw $0x0080, %di + movw $0x10, %cx + pushw %cx + cld + rep + movsb +# Get hd1 data... + xorw %ax, %ax + movw %ax, %ds + ldsw (4 * 0x46), %si + popw %cx + popw %es + movw $0x0090, %di + rep + movsb +# Check that there IS a hd1 :-) + movw $0x01500, %ax + movb $0x81, %dl + int $0x13 + jc no_disk1 + + cmpb $3, %ah + je is_disk1 + +no_disk1: + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %es + movw $0x0090, %di + movw $0x10, %cx + xorw %ax, %ax + cld + rep + stosb +is_disk1: +# check for Micro Channel (MCA) bus + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %ds + xorw %ax, %ax + movw %ax, (0xa0) # set table length to 0 + movb $0xc0, %ah + stc + int $0x15 # moves feature table to es:bx + jc no_mca + + pushw %ds + movw %es, %ax + movw %ax, %ds + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %es + movw %bx, %si + movw $0xa0, %di + movw (%si), %cx + addw $2, %cx # table length is a short + cmpw $0x10, %cx + jc sysdesc_ok + + movw $0x10, %cx # we keep only first 16 bytes +sysdesc_ok: + rep + movsb + popw %ds +no_mca: +#ifdef CONFIG_X86_VOYAGER + movb $0xff, 0x40 # flag on config found + movb $0xc0, %al + mov $0xff, %ah + int $0x15 # put voyager config info at es:di + jc no_voyager + movw $0x40, %si # place voyager info in apm table + cld + movw $7, %cx +voyager_rep: + movb %es:(%di), %al + movb %al,(%si) + incw %di + incw %si + decw %cx + jnz voyager_rep +no_voyager: +#endif +# Check for PS/2 pointing device + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %ds + movw $0, (0x1ff) # default is no pointing device + int $0x11 # int 0x11: equipment list + testb $0x04, %al # check if mouse installed + jz no_psmouse + + movw $0xAA, (0x1ff) # device present +no_psmouse: + +#if 
defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) + movl $0x0000E980, %eax # IST Support + movl $0x47534943, %edx # Request value + int $0x15 + + movl %eax, (96) + movl %ebx, (100) + movl %ecx, (104) + movl %edx, (108) +#endif + +#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) +# Then check for an APM BIOS... + # %ds points to the bootsector + movw $0, 0x40 # version = 0 means no APM BIOS + movw $0x05300, %ax # APM BIOS installation check + xorw %bx, %bx + int $0x15 + jc done_apm_bios # Nope, no APM BIOS + + cmpw $0x0504d, %bx # Check for "PM" signature + jne done_apm_bios # No signature, no APM BIOS + + andw $0x02, %cx # Is 32 bit supported? + je done_apm_bios # No 32-bit, no (good) APM BIOS + + movw $0x05304, %ax # Disconnect first just in case + xorw %bx, %bx + int $0x15 # ignore return code + movw $0x05303, %ax # 32 bit connect + xorl %ebx, %ebx + xorw %cx, %cx # paranoia :-) + xorw %dx, %dx # ... + xorl %esi, %esi # ... + xorw %di, %di # ... + int $0x15 + jc no_32_apm_bios # Ack, error. 
+ + movw %ax, (66) # BIOS code segment + movl %ebx, (68) # BIOS entry point offset + movw %cx, (72) # BIOS 16 bit code segment + movw %dx, (74) # BIOS data segment + movl %esi, (78) # BIOS code segment lengths + movw %di, (82) # BIOS data segment length +# Redo the installation check as the 32 bit connect +# modifies the flags returned on some BIOSs + movw $0x05300, %ax # APM BIOS installation check + xorw %bx, %bx + xorw %cx, %cx # paranoia + int $0x15 + jc apm_disconnect # error -> shouldn't happen + + cmpw $0x0504d, %bx # check for "PM" signature + jne apm_disconnect # no sig -> shouldn't happen + + movw %ax, (64) # record the APM BIOS version + movw %cx, (76) # and flags + jmp done_apm_bios + +apm_disconnect: # Tidy up + movw $0x05304, %ax # Disconnect + xorw %bx, %bx + int $0x15 # ignore return code + + jmp done_apm_bios + +no_32_apm_bios: + andw $0xfffd, (76) # remove 32 bit support bit +done_apm_bios: +#endif + +#include "edd.S" + +# Now we want to move to protected mode ... + cmpw $0, %cs:realmode_swtch + jz rmodeswtch_normal + + lcall *%cs:realmode_swtch + + jmp rmodeswtch_end + +rmodeswtch_normal: + pushw %cs + call default_switch + +rmodeswtch_end: +# we get the code32 start address and modify the below 'jmpi' +# (loader may have changed it) + movl %cs:code32_start, %eax + movl %eax, %cs:code32 + +# Now we move the system to its rightful place ... but we check if we have a +# big-kernel. In that case we *must* not move it ... + testb $LOADED_HIGH, %cs:loadflags + jz do_move0 # .. then we have a normal low + # loaded zImage + # .. or else we have a high + # loaded bzImage + jmp end_move # ... 
and we skip moving + +do_move0: + movw $0x100, %ax # start of destination segment + movw %cs, %bp # aka SETUPSEG + subw $DELTA_INITSEG, %bp # aka INITSEG + movw %cs:start_sys_seg, %bx # start of source segment + cld +do_move: + movw %ax, %es # destination segment + incb %ah # instead of add ax,#0x100 + movw %bx, %ds # source segment + addw $0x100, %bx + subw %di, %di + subw %si, %si + movw $0x800, %cx + rep + movsw + cmpw %bp, %bx # assume start_sys_seg > 0x200, + # so we will perhaps read one + # page more than needed, but + # never overwrite INITSEG + # because destination is a + # minimum one page below source + jb do_move + +end_move: +# then we load the segment descriptors + movw %cs, %ax # aka SETUPSEG + movw %ax, %ds + +# Check whether we need to be downward compatible with version <=201 + cmpl $0, cmd_line_ptr + jne end_move_self # loader uses version >=202 features + cmpb $0x20, type_of_loader + je end_move_self # bootsect loader, we know of it + +# Boot loader doesnt support boot protocol version 2.02. +# If we have our code not at 0x90000, we need to move it there now. +# We also then need to move the params behind it (commandline) +# Because we would overwrite the code on the current IP, we move +# it in two steps, jumping high after the first one. + movw %cs, %ax + cmpw $SETUPSEG, %ax + je end_move_self + + cli # make sure we really have + # interrupts disabled ! 
+ # because after this the stack + # should not be used + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ss, %dx + cmpw %ax, %dx + jb move_self_1 + + addw $INITSEG, %dx + subw %ax, %dx # this will go into %ss after + # the move +move_self_1: + movw %ax, %ds + movw $INITSEG, %ax # real INITSEG + movw %ax, %es + movw %cs:setup_move_size, %cx + std # we have to move up, so we use + # direction down because the + # areas may overlap + movw %cx, %di + decw %di + movw %di, %si + subw $move_self_here+0x200, %cx + rep + movsb + ljmp $SETUPSEG, $move_self_here + +move_self_here: + movw $move_self_here+0x200, %cx + rep + movsb + movw $SETUPSEG, %ax + movw %ax, %ds + movw %dx, %ss +end_move_self: # now we are at the right place + +# +# Enable A20. This is at the very best an annoying procedure. +# A20 code ported from SYSLINUX 1.52-1.63 by H. Peter Anvin. +# AMD Elan bug fix by Robert Schwebel. +# + +#if defined(CONFIG_X86_ELAN) + movb $0x02, %al # alternate A20 gate + outb %al, $0x92 # this works on SC410/SC520 +a20_elan_wait: + call a20_test + jz a20_elan_wait + jmp a20_done +#endif + + +A20_TEST_LOOPS = 32 # Iterations per wait +A20_ENABLE_LOOPS = 255 # Total loops to try + + +#ifndef CONFIG_X86_VOYAGER +a20_try_loop: + + # First, see if we are on a system with no A20 gate. +a20_none: + call a20_test + jnz a20_done + + # Next, try the BIOS (INT 0x15, AX=0x2401) +a20_bios: + movw $0x2401, %ax + pushfl # Be paranoid about flags + int $0x15 + popfl + + call a20_test + jnz a20_done + + # Try enabling A20 through the keyboard controller +#endif /* CONFIG_X86_VOYAGER */ +a20_kbc: + call empty_8042 + +#ifndef CONFIG_X86_VOYAGER + call a20_test # Just in case the BIOS worked + jnz a20_done # but had a delayed reaction. 
+#endif + + movb $0xD1, %al # command write + outb %al, $0x64 + call empty_8042 + + movb $0xDF, %al # A20 on + outb %al, $0x60 + call empty_8042 + +#ifndef CONFIG_X86_VOYAGER + # Wait until a20 really *is* enabled; it can take a fair amount of + # time on certain systems; Toshiba Tecras are known to have this + # problem. +a20_kbc_wait: + xorw %cx, %cx +a20_kbc_wait_loop: + call a20_test + jnz a20_done + loop a20_kbc_wait_loop + + # Final attempt: use "configuration port A" +a20_fast: + inb $0x92, %al # Configuration Port A + orb $0x02, %al # "fast A20" version + andb $0xFE, %al # don't accidentally reset + outb %al, $0x92 + + # Wait for configuration port A to take effect +a20_fast_wait: + xorw %cx, %cx +a20_fast_wait_loop: + call a20_test + jnz a20_done + loop a20_fast_wait_loop + + # A20 is still not responding. Try frobbing it again. + # + decb (a20_tries) + jnz a20_try_loop + + movw $a20_err_msg, %si + call prtstr + +a20_die: + hlt + jmp a20_die + +a20_tries: + .byte A20_ENABLE_LOOPS + +a20_err_msg: + .ascii "linux: fatal error: A20 gate not responding!" + .byte 13, 10, 0 + + # If we get here, all is good +a20_done: + +#endif /* CONFIG_X86_VOYAGER */ +# set up gdt and idt + lidt idt_48 # load idt with 0,0 + xorl %eax, %eax # Compute gdt_base + movw %ds, %ax # (Convert %ds:gdt to a linear ptr) + shll $4, %eax + addl $gdt, %eax + movl %eax, (gdt_48+2) + lgdt gdt_48 # load gdt with whatever is + # appropriate + +# make sure any possible coprocessor is properly reset.. + xorw %ax, %ax + outb %al, $0xf0 + call delay + + outb %al, $0xf1 + call delay + +# well, that went ok, I hope. Now we mask all interrupts - the rest +# is done in init_IRQ(). + movb $0xFF, %al # mask all interrupts for now + outb %al, $0xA1 + call delay + + movb $0xFB, %al # mask all irq's but irq2 which + outb %al, $0x21 # is cascaded + +# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't +# need no steenking BIOS anyway (except for the initial loading :-). 
+# The BIOS-routine wants lots of unnecessary data, and it's less +# "interesting" anyway. This is how REAL programmers do it. +# +# Well, now's the time to actually move into protected mode. To make +# things as simple as possible, we do no register set-up or anything, +# we let the gnu-compiled 32-bit programs do that. We just jump to +# absolute address 0x1000 (or the loader supplied one), +# in 32-bit protected mode. +# +# Note that the short jump isn't strictly needed, although there are +# reasons why it might be a good idea. It won't hurt in any case. + movw $1, %ax # protected mode (PE) bit + lmsw %ax # This is it! + jmp flush_instr + +flush_instr: + xorw %bx, %bx # Flag to indicate a boot + xorl %esi, %esi # Pointer to real-mode code + movw %cs, %si + subw $DELTA_INITSEG, %si + shll $4, %esi # Convert to 32-bit pointer + +# jump to startup_32 in arch/i386/boot/compressed/head.S +# +# NOTE: For high loaded big kernels we need a +# jmpi 0x100000,__BOOT_CS +# +# but we yet haven't reloaded the CS register, so the default size +# of the target offset still is 16 bit. +# However, using an operand prefix (0x66), the CPU will properly +# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference +# Manual, Mixing 16-bit and 32-bit code, page 16-6) + + .byte 0x66, 0xea # prefix + jmpi-opcode +code32: .long 0x1000 # will be set to 0x100000 + # for big kernels + .word __BOOT_CS + +# Here's a bunch of information about your current kernel.. +kernel_version: .ascii UTS_RELEASE + .ascii " (" + .ascii LINUX_COMPILE_BY + .ascii "@" + .ascii LINUX_COMPILE_HOST + .ascii ") " + .ascii UTS_VERSION + .byte 0 + +# This is the default real mode switch routine. +# to be called just before protected mode transition +default_switch: + cli # no interrupts allowed ! + movb $0x80, %al # disable NMI for bootup + # sequence + outb %al, $0x70 + lret + + +#ifndef CONFIG_X86_VOYAGER +# This routine tests whether or not A20 is enabled. If so, it +# exits with zf = 0. 
+# +# The memory address used, 0x200, is the int $0x80 vector, which +# should be safe. + +A20_TEST_ADDR = 4*0x80 + +a20_test: + pushw %cx + pushw %ax + xorw %cx, %cx + movw %cx, %fs # Low memory + decw %cx + movw %cx, %gs # High memory area + movw $A20_TEST_LOOPS, %cx + movw %fs:(A20_TEST_ADDR), %ax + pushw %ax +a20_test_wait: + incw %ax + movw %ax, %fs:(A20_TEST_ADDR) + call delay # Serialize and make delay constant + cmpw %gs:(A20_TEST_ADDR+0x10), %ax + loope a20_test_wait + + popw %fs:(A20_TEST_ADDR) + popw %ax + popw %cx + ret + +#endif /* CONFIG_X86_VOYAGER */ + +# This routine checks that the keyboard command queue is empty +# (after emptying the output buffers) +# +# Some machines have delusions that the keyboard buffer is always full +# with no keyboard attached... +# +# If there is no keyboard controller, we will usually get 0xff +# to all the reads. With each IO taking a microsecond and +# a timeout of 100,000 iterations, this can take about half a +# second ("delay" == outb to port 0x80). That should be ok, +# and should also be plenty of time for a real keyboard controller +# to empty. +# + +empty_8042: + pushl %ecx + movl $100000, %ecx + +empty_8042_loop: + decl %ecx + jz empty_8042_end_loop + + call delay + + inb $0x64, %al # 8042 status port + testb $1, %al # output buffer? + jz no_output + + call delay + inb $0x60, %al # read it + jmp empty_8042_loop + +no_output: + testb $2, %al # is input buffer full? + jnz empty_8042_loop # yes - loop +empty_8042_end_loop: + popl %ecx + ret + +# Read the cmos clock. Return the seconds in al +gettime: + pushw %cx + movb $0x02, %ah + int $0x1a + movb %dh, %al # %dh contains the seconds + andb $0x0f, %al + movb %dh, %ah + movb $0x04, %cl + shrb %cl, %ah + aad + popw %cx + ret + +# Delay is needed after doing I/O +delay: + outb %al,$0x80 + ret + +# Descriptor tables +# +# NOTE: The intel manual says gdt should be sixteen bytes aligned for +# efficiency reasons. 
However, there are machines which are known not +# to boot with misaligned GDTs, so alter this at your peril! If you alter +# GDT_ENTRY_BOOT_CS (in asm/segment.h) remember to leave at least two +# empty GDT entries (one for NULL and one reserved). +# +# NOTE: On some CPUs, the GDT must be 8 byte aligned. This is +# true for the Voyager Quad CPU card which will not boot without +# this directive. 16 byte alignment is recommended by Intel. +# + .align 16 +gdt: + .fill GDT_ENTRY_BOOT_CS,8,0 + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9A00 # code read/exec + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9200 # data read/write + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) +gdt_end: + .align 4 + + .word 0 # alignment byte +idt_48: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L + + .word 0 # alignment byte +gdt_48: + .word gdt_end - gdt - 1 # gdt limit + .word 0, 0 # gdt base (filled in later) + +# Include video setup & detection code + +#include "video.S" + +# Setup signature -- must be last +setup_sig1: .word SIG1 +setup_sig2: .word SIG2 + +# After this point, there is some free space which is used by the video mode +# handling code to store the temporary mode table (not used by the kernel).
+ +modelist: + +.text +endtext: +.data +enddata: +.bss +endbss: diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c new file mode 100644 index 000000000000..26509b826aed --- /dev/null +++ b/arch/i386/boot/tools/build.c @@ -0,0 +1,184 @@ +/* + * $Id: build.c,v 1.5 1997/05/19 12:29:58 mj Exp $ + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1997 Martin Mares + */ + +/* + * This file builds a disk-image from three different files: + * + * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest + * - setup: 8086 machine code, sets up system parm + * - system: 80386 code for actual system + * + * It does some checking that all files are of the correct type, and + * just writes the result to stdout, removing headers and padding to + * the right amount. It also writes some system data to stderr. + */ + +/* + * Changes by tytso to allow root device specification + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + * Cross compiling fixes by Gertjan van Wingerde, July 1996 + * Rewritten by Martin Mares, April 1997 + */ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <unistd.h> +#include <fcntl.h> +#include <asm/boot.h> + +typedef unsigned char byte; +typedef unsigned short word; +typedef unsigned long u32; + +#define DEFAULT_MAJOR_ROOT 0 +#define DEFAULT_MINOR_ROOT 0 + +/* Minimal number of setup sectors (see also bootsect.S) */ +#define SETUP_SECTS 4 + +byte buf[1024]; +int fd; +int is_big_kernel; + +void die(const char * str, ...) 
+{ + va_list args; + va_start(args, str); + vfprintf(stderr, str, args); + fputc('\n', stderr); + exit(1); +} + +void file_open(const char *name) +{ + if ((fd = open(name, O_RDONLY, 0)) < 0) + die("Unable to open `%s': %m", name); +} + +void usage(void) +{ + die("Usage: build [-b] bootsect setup system [rootdev] [> image]"); +} + +int main(int argc, char ** argv) +{ + unsigned int i, c, sz, setup_sectors; + u32 sys_size; + byte major_root, minor_root; + struct stat sb; + + if (argc > 2 && !strcmp(argv[1], "-b")) + { + is_big_kernel = 1; + argc--, argv++; + } + if ((argc < 4) || (argc > 5)) + usage(); + if (argc > 4) { + if (!strcmp(argv[4], "CURRENT")) { + if (stat("/", &sb)) { + perror("/"); + die("Couldn't stat /"); + } + major_root = major(sb.st_dev); + minor_root = minor(sb.st_dev); + } else if (strcmp(argv[4], "FLOPPY")) { + if (stat(argv[4], &sb)) { + perror(argv[4]); + die("Couldn't stat root device."); + } + major_root = major(sb.st_rdev); + minor_root = minor(sb.st_rdev); + } else { + major_root = 0; + minor_root = 0; + } + } else { + major_root = DEFAULT_MAJOR_ROOT; + minor_root = DEFAULT_MINOR_ROOT; + } + fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root); + + file_open(argv[1]); + i = read(fd, buf, sizeof(buf)); + fprintf(stderr,"Boot sector %d bytes.\n",i); + if (i != 512) + die("Boot block must be exactly 512 bytes"); + if (buf[510] != 0x55 || buf[511] != 0xaa) + die("Boot block hasn't got boot flag (0xAA55)"); + buf[508] = minor_root; + buf[509] = major_root; + if (write(1, buf, 512) != 512) + die("Write call failed"); + close (fd); + + file_open(argv[2]); /* Copy the setup code */ + for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c ) + if (write(1, buf, c) != c) + die("Write call failed"); + if (c != 0) + die("read-error on `setup'"); + close (fd); + + setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */ + /* for compatibility with ancient versions of LILO. 
*/ + if (setup_sectors < SETUP_SECTS) + setup_sectors = SETUP_SECTS; + fprintf(stderr, "Setup is %d bytes.\n", i); + memset(buf, 0, sizeof(buf)); + while (i < setup_sectors * 512) { + c = setup_sectors * 512 - i; + if (c > sizeof(buf)) + c = sizeof(buf); + if (write(1, buf, c) != c) + die("Write call failed"); + i += c; + } + + file_open(argv[3]); + if (fstat (fd, &sb)) + die("Unable to stat `%s': %m", argv[3]); + sz = sb.st_size; + fprintf (stderr, "System is %d kB\n", sz/1024); + sys_size = (sz + 15) / 16; + if (!is_big_kernel && sys_size > DEF_SYSSIZE) + die("System is too big. Try using bzImage or modules."); + while (sz > 0) { + int l, n; + + l = (sz > sizeof(buf)) ? sizeof(buf) : sz; + if ((n=read(fd, buf, l)) != l) { + if (n < 0) + die("Error reading %s: %m", argv[3]); + else + die("%s: Unexpected EOF", argv[3]); + } + if (write(1, buf, l) != l) + die("Write failed"); + sz -= l; + } + close(fd); + + if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */ + die("Output: seek failed"); + buf[0] = setup_sectors; + if (write(1, buf, 1) != 1) + die("Write of setup sector count failed"); + if (lseek(1, 500, SEEK_SET) != 500) + die("Output: seek failed"); + buf[0] = (sys_size & 0xff); + buf[1] = ((sys_size >> 8) & 0xff); + if (write(1, buf, 2) != 2) + die("Write of image length failed"); + + return 0; /* Everything is OK */ +} diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S new file mode 100644 index 000000000000..925d3f5a3824 --- /dev/null +++ b/arch/i386/boot/video.S @@ -0,0 +1,2007 @@ +/* video.S + * + * Display adapter & video mode setup, version 2.13 (14-May-99) + * + * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz> + * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson + * + * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999 + * + * For further information, look at Documentation/svga.txt. 
+ * + */ + +#include <linux/config.h> /* for CONFIG_VIDEO_* */ + +/* Enable autodetection of SVGA adapters and modes. */ +#undef CONFIG_VIDEO_SVGA + +/* Enable autodetection of VESA modes */ +#define CONFIG_VIDEO_VESA + +/* Enable compacting of mode table */ +#define CONFIG_VIDEO_COMPACT + +/* Retain screen contents when switching modes */ +#define CONFIG_VIDEO_RETAIN + +/* Enable local mode list */ +#undef CONFIG_VIDEO_LOCAL + +/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ +#undef CONFIG_VIDEO_400_HACK + +/* Hack that lets you force specific BIOS mode ID and specific dimensions */ +#undef CONFIG_VIDEO_GFX_HACK +#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */ +#define VIDEO_GFX_BIOS_BX 0x0102 +#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */ + +/* This code uses an extended set of video mode numbers. These include: + * Aliases for standard modes + * NORMAL_VGA (-1) + * EXTENDED_VGA (-2) + * ASK_VGA (-3) + * Video modes numbered by menu position -- NOT RECOMMENDED because of lack + * of compatibility when extending the table. These are between 0x00 and 0xff. 
+ */ +#define VIDEO_FIRST_MENU 0x0000 + +/* Standard BIOS video modes (BIOS number + 0x0100) */ +#define VIDEO_FIRST_BIOS 0x0100 + +/* VESA BIOS video modes (VESA number + 0x0200) */ +#define VIDEO_FIRST_VESA 0x0200 + +/* Video7 special modes (BIOS number + 0x0900) */ +#define VIDEO_FIRST_V7 0x0900 + +/* Special video modes */ +#define VIDEO_FIRST_SPECIAL 0x0f00 +#define VIDEO_80x25 0x0f00 +#define VIDEO_8POINT 0x0f01 +#define VIDEO_80x43 0x0f02 +#define VIDEO_80x28 0x0f03 +#define VIDEO_CURRENT_MODE 0x0f04 +#define VIDEO_80x30 0x0f05 +#define VIDEO_80x34 0x0f06 +#define VIDEO_80x60 0x0f07 +#define VIDEO_GFX_HACK 0x0f08 +#define VIDEO_LAST_SPECIAL 0x0f09 + +/* Video modes given by resolution */ +#define VIDEO_FIRST_RESOLUTION 0x1000 + +/* The "recalculate timings" flag */ +#define VIDEO_RECALC 0x8000 + +/* Positions of various video parameters passed to the kernel */ +/* (see also include/linux/tty.h) */ +#define PARAM_CURSOR_POS 0x00 +#define PARAM_VIDEO_PAGE 0x04 +#define PARAM_VIDEO_MODE 0x06 +#define PARAM_VIDEO_COLS 0x07 +#define PARAM_VIDEO_EGA_BX 0x0a +#define PARAM_VIDEO_LINES 0x0e +#define PARAM_HAVE_VGA 0x0f +#define PARAM_FONT_POINTS 0x10 + +#define PARAM_LFB_WIDTH 0x12 +#define PARAM_LFB_HEIGHT 0x14 +#define PARAM_LFB_DEPTH 0x16 +#define PARAM_LFB_BASE 0x18 +#define PARAM_LFB_SIZE 0x1c +#define PARAM_LFB_LINELENGTH 0x24 +#define PARAM_LFB_COLORS 0x26 +#define PARAM_VESAPM_SEG 0x2e +#define PARAM_VESAPM_OFF 0x30 +#define PARAM_LFB_PAGES 0x32 +#define PARAM_VESA_ATTRIB 0x34 + +/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ +#ifdef CONFIG_VIDEO_RETAIN +#define DO_STORE call store_screen +#else +#define DO_STORE +#endif /* CONFIG_VIDEO_RETAIN */ + +# This is the main entry point called by setup.S +# %ds *must* be pointing to the bootsector +video: pushw %ds # We use different segments + pushw %ds # FS contains original DS + popw %fs + pushw %cs # DS is equal to CS + popw %ds + pushw %cs # ES is equal to CS + popw %es + xorw %ax, %ax + movw %ax, %gs 
# GS is zero + cld + call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA) +#ifdef CONFIG_VIDEO_SELECT + movw %fs:(0x01fa), %ax # User selected video mode + cmpw $ASK_VGA, %ax # Bring up the menu + jz vid2 + + call mode_set # Set the mode + jc vid1 + + leaw badmdt, %si # Invalid mode ID + call prtstr +vid2: call mode_menu +vid1: +#ifdef CONFIG_VIDEO_RETAIN + call restore_screen # Restore screen contents +#endif /* CONFIG_VIDEO_RETAIN */ + call store_edid +#endif /* CONFIG_VIDEO_SELECT */ + call mode_params # Store mode parameters + popw %ds # Restore original DS + ret + +# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel. +basic_detect: + movb $0, %fs:(PARAM_HAVE_VGA) + movb $0x12, %ah # Check EGA/VGA + movb $0x10, %bl + int $0x10 + movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel + cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card. + je basret + + incb adapter + movw $0x1a00, %ax # Check EGA or VGA? + int $0x10 + cmpb $0x1a, %al # 1a means VGA... + jne basret # anything else is EGA. + + incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA + incb adapter +basret: ret + +# Store the video mode parameters for later usage by the kernel. +# This is done by asking the BIOS except for the rows/columns +# parameters in the default 80x25 mode -- these are set directly, +# because some very obscure BIOSes supply insane values. +mode_params: +#ifdef CONFIG_VIDEO_SELECT + cmpb $0, graphic_mode + jnz mopar_gr +#endif + movb $0x03, %ah # Read cursor position + xorb %bh, %bh + int $0x10 + movw %dx, %fs:(PARAM_CURSOR_POS) + movb $0x0f, %ah # Read page/mode/width + int $0x10 + movw %bx, %fs:(PARAM_VIDEO_PAGE) + movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width + cmpb $0x7, %al # MDA/HGA => segment differs + jnz mopar0 + + movw $0xb000, video_segment +mopar0: movw %gs:(0x485), %ax # Font size + movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA) + movw force_size, %ax # Forced size? 
+ orw %ax, %ax + jz mopar1 + + movb %ah, %fs:(PARAM_VIDEO_COLS) + movb %al, %fs:(PARAM_VIDEO_LINES) + ret + +mopar1: movb $25, %al + cmpb $0, adapter # If we are on CGA/MDA/HGA, the + jz mopar2 # screen must have 25 lines. + + movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS + incb %al # location of max lines. +mopar2: movb %al, %fs:(PARAM_VIDEO_LINES) + ret + +#ifdef CONFIG_VIDEO_SELECT +# Fetching of VESA frame buffer parameters +mopar_gr: + leaw modelist+1024, %di + movb $0x23, %fs:(PARAM_HAVE_VGA) + movw 16(%di), %ax + movw %ax, %fs:(PARAM_LFB_LINELENGTH) + movw 18(%di), %ax + movw %ax, %fs:(PARAM_LFB_WIDTH) + movw 20(%di), %ax + movw %ax, %fs:(PARAM_LFB_HEIGHT) + movb 25(%di), %al + movb $0, %ah + movw %ax, %fs:(PARAM_LFB_DEPTH) + movb 29(%di), %al + movb $0, %ah + movw %ax, %fs:(PARAM_LFB_PAGES) + movl 40(%di), %eax + movl %eax, %fs:(PARAM_LFB_BASE) + movl 31(%di), %eax + movl %eax, %fs:(PARAM_LFB_COLORS) + movl 35(%di), %eax + movl %eax, %fs:(PARAM_LFB_COLORS+4) + movw 0(%di), %ax + movw %ax, %fs:(PARAM_VESA_ATTRIB) + +# get video mem size + leaw modelist+1024, %di + movw $0x4f00, %ax + int $0x10 + xorl %eax, %eax + movw 18(%di), %ax + movl %eax, %fs:(PARAM_LFB_SIZE) + +# switching the DAC to 8-bit is for <= 8 bpp only + movw %fs:(PARAM_LFB_DEPTH), %ax + cmpw $8, %ax + jg dac_done + +# get DAC switching capability + xorl %eax, %eax + movb 10(%di), %al + testb $1, %al + jz dac_set + +# attempt to switch DAC to 8-bit + movw $0x4f08, %ax + movw $0x0800, %bx + int $0x10 + cmpw $0x004f, %ax + jne dac_set + movb %bh, dac_size # store actual DAC size + +dac_set: +# set color size to DAC size + movb dac_size, %al + movb %al, %fs:(PARAM_LFB_COLORS+0) + movb %al, %fs:(PARAM_LFB_COLORS+2) + movb %al, %fs:(PARAM_LFB_COLORS+4) + movb %al, %fs:(PARAM_LFB_COLORS+6) + +# set color offsets to 0 + movb $0, %fs:(PARAM_LFB_COLORS+1) + movb $0, %fs:(PARAM_LFB_COLORS+3) + movb $0, %fs:(PARAM_LFB_COLORS+5) + movb $0, %fs:(PARAM_LFB_COLORS+7) + +dac_done: +# get protected mode 
interface information + movw $0x4f0a, %ax + xorw %bx, %bx + xorw %di, %di + int $0x10 + cmp $0x004f, %ax + jnz no_pm + + movw %es, %fs:(PARAM_VESAPM_SEG) + movw %di, %fs:(PARAM_VESAPM_OFF) +no_pm: ret + +# The video mode menu +mode_menu: + leaw keymsg, %si # "Return/Space/Timeout" message + call prtstr + call flush +nokey: call getkt + + cmpb $0x0d, %al # ENTER ? + je listm # yes - manual mode selection + + cmpb $0x20, %al # SPACE ? + je defmd1 # yes - keep default (else beep and retry) + + call beep + jmp nokey + +defmd1: ret # No mode chosen? Default 80x25 + +listm: call mode_table # List mode table +listm0: leaw name_bann, %si # Print adapter name + call prtstr + movw card_name, %si + orw %si, %si + jnz an2 + + movb adapter, %al + leaw old_name, %si + orb %al, %al + jz an1 + + leaw ega_name, %si + decb %al + jz an1 + + leaw vga_name, %si + jmp an1 + +an2: call prtstr + leaw svga_name, %si +an1: call prtstr + leaw listhdr, %si # Table header + call prtstr + movb $0x30, %dl # DL holds mode number + leaw modelist, %si +lm1: cmpw $ASK_VGA, (%si) # End? + jz lm2 + + movb %dl, %al # Menu selection number + call prtchr + call prtsp2 + lodsw + call prthw # Mode ID + call prtsp2 + movb 0x1(%si), %al + call prtdec # Columns (high byte of the size word) + movb $0x78, %al # the letter 'x' + call prtchr + lodsw + call prtdec # Rows (low byte of the size word) + movb $0x0d, %al # New line + call prtchr + movb $0x0a, %al + call prtchr + incb %dl # Next character + cmpb $0x3a, %dl + jnz lm1 + + movb $0x61, %dl + jmp lm1 + +lm2: leaw prompt, %si # Mode prompt + call prtstr + leaw edit_buf, %di # Editor buffer +lm3: call getkey + cmpb $0x0d, %al # Enter? + jz lment + + cmpb $0x08, %al # Backspace? + jz lmbs + + cmpb $0x20, %al # Printable? + jc lm3 + + cmpw $edit_buf+4, %di # Enough space?
+ jz lm3 + + stosb + call prtchr + jmp lm3 + +lmbs: cmpw $edit_buf, %di # Backspace + jz lm3 + + decw %di + movb $0x08, %al + call prtchr + call prtspc + movb $0x08, %al + call prtchr + jmp lm3 + +lment: movb $0, (%di) + leaw crlft, %si + call prtstr + leaw edit_buf, %si + cmpb $0, (%si) # Empty string = default mode + jz lmdef + + cmpb $0, 1(%si) # One character = menu selection + jz mnusel + + cmpw $0x6373, (%si) # "scan" => mode scanning + jnz lmhx + + cmpw $0x6e61, 2(%si) + jz lmscan + +lmhx: xorw %bx, %bx # Else => mode ID in hex +lmhex: lodsb + orb %al, %al + jz lmuse1 + + subb $0x30, %al + jc lmbad + + cmpb $10, %al + jc lmhx1 + + subb $7, %al + andb $0xdf, %al + cmpb $10, %al + jc lmbad + + cmpb $16, %al + jnc lmbad + +lmhx1: shlw $4, %bx + orb %al, %bl + jmp lmhex + +lmuse1: movw %bx, %ax + jmp lmuse + +mnusel: lodsb # Menu selection + xorb %ah, %ah + subb $0x30, %al + jc lmbad + + cmpb $10, %al + jc lmuse + + cmpb $0x61-0x30, %al + jc lmbad + + subb $0x61-0x30-10, %al + cmpb $36, %al + jnc lmbad + +lmuse: call mode_set + jc lmdef + +lmbad: leaw unknt, %si + call prtstr + jmp lm2 +lmscan: cmpb $0, adapter # Scanning only on EGA/VGA + jz lmbad + + movw $0, mt_end # Scanning of modes is + movb $1, scanning # done as new autodetection. + call mode_table + jmp listm0 +lmdef: ret + +# Additional parts of mode_set... (relative jumps, you know) +setv7: # Video7 extended modes + DO_STORE + subb $VIDEO_FIRST_V7>>8, %bh + movw $0x6f05, %ax + int $0x10 + stc + ret + +_setrec: jmp setrec # Ugly... +_set_80x25: jmp set_80x25 + +# Aliases for backward compatibility. +setalias: + movw $VIDEO_80x25, %ax + incw %bx + jz mode_set + + movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al + incw %bx + jnz setbad # Fall-through! 
+ +# Setting of user mode (AX=mode ID) => CF=success +mode_set: + movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S + movw %ax, %bx + cmpb $0xff, %ah + jz setalias + + testb $VIDEO_RECALC>>8, %ah + jnz _setrec + + cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah + jnc setres + + cmpb $VIDEO_FIRST_SPECIAL>>8, %ah + jz setspc + + cmpb $VIDEO_FIRST_V7>>8, %ah + jz setv7 + + cmpb $VIDEO_FIRST_VESA>>8, %ah + jnc check_vesa + + orb %ah, %ah + jz setmenu + + decb %ah + jz setbios + +setbad: clc + movb $0, do_restore # The screen needn't be restored + ret + +setvesa: + DO_STORE + subb $VIDEO_FIRST_VESA>>8, %bh + movw $0x4f02, %ax # VESA BIOS mode set call + int $0x10 + cmpw $0x004f, %ax # AL=4f if implemented + jnz setbad # AH=0 if OK + + stc + ret + +setbios: + DO_STORE + int $0x10 # Standard BIOS mode set call + pushw %bx + movb $0x0f, %ah # Check if really set + int $0x10 + popw %bx + cmpb %bl, %al + jnz setbad + + stc + ret + +setspc: xorb %bh, %bh # Set special mode + cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl + jnc setbad + + addw %bx, %bx + jmp *spec_inits(%bx) + +setmenu: + orb %al, %al # 80x25 is an exception + jz _set_80x25 + + pushw %bx # Set mode chosen from menu + call mode_table # Build the mode table + popw %ax + shlw $2, %ax + addw %ax, %si + cmpw %di, %si + jnc setbad + + movw (%si), %ax # Fetch mode ID +_m_s: jmp mode_set + +setres: pushw %bx # Set mode chosen by resolution + call mode_table + popw %bx + xchgb %bl, %bh +setr1: lodsw + cmpw $ASK_VGA, %ax # End of the list? + jz setbad + + lodsw + cmpw %bx, %ax + jnz setr1 + + movw -4(%si), %ax # Fetch mode ID + jmp _m_s + +check_vesa: + leaw modelist+1024, %di + subb $VIDEO_FIRST_VESA>>8, %bh + movw %bx, %cx # Get mode information structure + movw $0x4f01, %ax + int $0x10 + addb $VIDEO_FIRST_VESA>>8, %bh + cmpw $0x004f, %ax + jnz setbad + + movb (%di), %al # Check capabilities. + andb $0x19, %al + cmpb $0x09, %al + jz setvesa # This is a text mode + + movb (%di), %al # Check capabilities. 
+ andb $0x99, %al + cmpb $0x99, %al + jnz _setbad # Doh! No linear frame buffer. + + subb $VIDEO_FIRST_VESA>>8, %bh + orw $0x4000, %bx # Use linear frame buffer + movw $0x4f02, %ax # VESA BIOS mode set call + int $0x10 + cmpw $0x004f, %ax # AL=4f if implemented + jnz _setbad # AH=0 if OK + + movb $1, graphic_mode # flag graphic mode + movb $0, do_restore # no screen restore + stc + ret + +_setbad: jmp setbad # Ugly... + +# Recalculate vertical display end registers -- this fixes various +# inconsistencies of extended modes on many adapters. Called when +# the VIDEO_RECALC flag is set in the mode ID. + +setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode + call mode_set + jnc rct3 + + movw %gs:(0x485), %ax # Font size in pixels + movb %gs:(0x484), %bl # Number of rows + incb %bl + mulb %bl # Number of visible + decw %ax # scan lines - 1 + movw $0x3d4, %dx + movw %ax, %bx + movb $0x12, %al # Lower 8 bits + movb %bl, %ah + outw %ax, %dx + movb $0x07, %al # Bits 8 and 9 in the overflow register + call inidx + xchgb %al, %ah + andb $0xbd, %ah + shrb %bh + jnc rct1 + orb $0x02, %ah +rct1: shrb %bh + jnc rct2 + orb $0x40, %ah +rct2: movb $0x07, %al + outw %ax, %dx + stc +rct3: ret + +# Table of routines for setting of the special modes. +spec_inits: + .word set_80x25 + .word set_8pixel + .word set_80x43 + .word set_80x28 + .word set_current + .word set_80x30 + .word set_80x34 + .word set_80x60 + .word set_gfx + +# Set the 80x25 mode. If already set, do nothing. 
+set_80x25: + movw $0x5019, force_size # Override possibly broken BIOS +use_80x25: +#ifdef CONFIG_VIDEO_400_HACK + movw $0x1202, %ax # Force 400 scan lines + movb $0x30, %bl + int $0x10 +#else + movb $0x0f, %ah # Get current mode ID + int $0x10 + cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available + jz st80 # on CGA/MDA/HGA and is also available on EGAM + + cmpw $0x5003, %ax # Unknown mode, force 80x25 color + jnz force3 + +st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25 + jz set80 + + movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc. + orb %al, %al # Some buggy BIOS'es set 0 rows + jz set80 + + cmpb $24, %al # It's hopefully correct + jz set80 +#endif /* CONFIG_VIDEO_400_HACK */ +force3: DO_STORE + movw $0x0003, %ax # Forced set + int $0x10 +set80: stc + ret + +# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls. +set_8pixel: + DO_STORE + call use_80x25 # The base is 80x25 +set_8pt: + movw $0x1112, %ax # Use 8x8 font + xorb %bl, %bl + int $0x10 + movw $0x1200, %ax # Use alternate print screen + movb $0x20, %bl + int $0x10 + movw $0x1201, %ax # Turn off cursor emulation + movb $0x34, %bl + int $0x10 + movb $0x01, %ah # Define cursor scan lines 6-7 + movw $0x0607, %cx + int $0x10 +set_current: + stc + ret + +# Set the 80x28 mode. This mode works on all VGA's, because it's a standard +# 80x25 mode with 14-point fonts instead of 16-point. +set_80x28: + DO_STORE + call use_80x25 # The base is 80x25 +set14: movw $0x1111, %ax # Use 9x14 font + xorb %bl, %bl + int $0x10 + movb $0x01, %ah # Define cursor scan lines 11-12 + movw $0x0b0c, %cx + int $0x10 + stc + ret + +# Set the 80x43 mode. This mode is works on all VGA's. +# It's a 350-scanline mode with 8-pixel font. +set_80x43: + DO_STORE + movw $0x1201, %ax # Set 350 scans + movb $0x30, %bl + int $0x10 + movw $0x0003, %ax # Reset video mode + int $0x10 + jmp set_8pt # Use 8-pixel font + +# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font. 
+set_80x30: + call use_80x25 # Start with real 80x25 + DO_STORE + movw $0x3cc, %dx # Get CRTC port + inb %dx, %al + movb $0xd4, %dl + rorb %al # Mono or color? + jc set48a + + movb $0xb4, %dl +set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7) + call outidx + movw $0x0b06, %ax # Vertical total + call outidx + movw $0x3e07, %ax # (Vertical) overflow + call outidx + movw $0xea10, %ax # Vertical sync start + call outidx + movw $0xdf12, %ax # Vertical display end + call outidx + movw $0xe715, %ax # Vertical blank start + call outidx + movw $0x0416, %ax # Vertical blank end + call outidx + pushw %dx + movb $0xcc, %dl # Misc output register (read) + inb %dx, %al + movb $0xc2, %dl # (write) + andb $0x0d, %al # Preserve clock select bits and color bit + orb $0xe2, %al # Set correct sync polarity + outb %al, %dx + popw %dx + movw $0x501e, force_size + stc # That's all. + ret + +# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font. +set_80x34: + call set_80x30 # Set 480 scans + call set14 # And 14-pt font + movw $0xdb12, %ax # VGA vertical display end + movw $0x5022, force_size +setvde: call outidx + stc + ret + +# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font. +set_80x60: + call set_80x30 # Set 480 scans + call set_8pt # And 8-pt font + movw $0xdf12, %ax # VGA vertical display end + movw $0x503c, force_size + jmp setvde + +# Special hack for ThinkPad graphics +set_gfx: +#ifdef CONFIG_VIDEO_GFX_HACK + movw $VIDEO_GFX_BIOS_AX, %ax + movw $VIDEO_GFX_BIOS_BX, %bx + int $0x10 + movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size + stc +#endif + ret + +#ifdef CONFIG_VIDEO_RETAIN + +# Store screen contents to temporary buffer. +store_screen: + cmpb $0, do_restore # Already stored? + jnz stsr + + testb $CAN_USE_HEAP, loadflags # Have we space for storing? 
+ jz stsr + + pushw %ax + pushw %bx + pushw force_size # Don't force specific size + movw $0, force_size + call mode_params # Obtain params of current mode + popw force_size + movb %fs:(PARAM_VIDEO_LINES), %ah + movb %fs:(PARAM_VIDEO_COLS), %al + movw %ax, %bx # BX=dimensions + mulb %ah + movw %ax, %cx # CX=number of characters + addw %ax, %ax # Calculate image size + addw $modelist+1024+4, %ax + cmpw heap_end_ptr, %ax + jnc sts1 # Unfortunately, out of memory + + movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params + leaw modelist+1024, %di + stosw + movw %bx, %ax + stosw + pushw %ds # Store the screen + movw video_segment, %ds + xorw %si, %si + rep + movsw + popw %ds + incb do_restore # Screen will be restored later +sts1: popw %bx + popw %ax +stsr: ret + +# Restore screen contents from temporary buffer. +restore_screen: + cmpb $0, do_restore # Has the screen been stored? + jz res1 + + call mode_params # Get parameters of current mode + movb %fs:(PARAM_VIDEO_LINES), %cl + movb %fs:(PARAM_VIDEO_COLS), %ch + leaw modelist+1024, %si # Screen buffer + lodsw # Set cursor position + movw %ax, %dx + cmpb %cl, %dh + jc res2 + + movb %cl, %dh + decb %dh +res2: cmpb %ch, %dl + jc res3 + + movb %ch, %dl + decb %dl +res3: movb $0x02, %ah + movb $0x00, %bh + int $0x10 + lodsw # Display size + movb %ah, %dl # DL=number of lines + movb $0, %ah # BX=phys. length of orig. line + movw %ax, %bx + cmpb %cl, %dl # Too many? + jc res4 + + pushw %ax + movb %dl, %al + subb %cl, %al + mulb %bl + addw %ax, %si + addw %ax, %si + popw %ax + movb %cl, %dl +res4: cmpb %ch, %al # Too wide? + jc res5 + + movb %ch, %al # AX=width of src. line +res5: movb $0, %cl + xchgb %ch, %cl + movw %cx, %bp # BP=width of dest. 
line + pushw %es + movw video_segment, %es + xorw %di, %di # Move the data + addw %bx, %bx # Convert BX and BP to _bytes_ + addw %bp, %bp +res6: pushw %si + pushw %di + movw %ax, %cx + rep + movsw + popw %di + popw %si + addw %bp, %di + addw %bx, %si + decb %dl + jnz res6 + + popw %es # Done +res1: ret +#endif /* CONFIG_VIDEO_RETAIN */ + +# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port) +outidx: outb %al, %dx + pushw %ax + movb %ah, %al + incw %dx + outb %al, %dx + decw %dx + popw %ax + ret + +# Build the table of video modes (stored after the setup.S code at the +# `modelist' label. Each video mode record looks like: +# .word MODE-ID (our special mode ID (see above)) +# .byte rows (number of rows) +# .byte columns (number of columns) +# Returns address of the end of the table in DI, the end is marked +# with a ASK_VGA ID. +mode_table: + movw mt_end, %di # Already filled? + orw %di, %di + jnz mtab1x + + leaw modelist, %di # Store standard modes: + movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL) + stosl + movb adapter, %al # CGA/MDA/HGA -- no more modes + orb %al, %al + jz mtabe + + decb %al + jnz mtabv + + movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode + stosl + jmp mtabe + +mtab1x: jmp mtab1 + +mtabv: leaw vga_modes, %si # All modes for std VGA + movw $vga_modes_end-vga_modes, %cx + rep # I'm unable to use movsw as I don't know how to store a half + movsb # of the expression above to cx without using explicit shr. + + cmpb $0, scanning # Mode scan requested? 
+ jz mscan1 + + call mode_scan +mscan1: + +#ifdef CONFIG_VIDEO_LOCAL + call local_modes +#endif /* CONFIG_VIDEO_LOCAL */ + +#ifdef CONFIG_VIDEO_VESA + call vesa_modes # Detect VESA VGA modes +#endif /* CONFIG_VIDEO_VESA */ + +#ifdef CONFIG_VIDEO_SVGA + cmpb $0, scanning # Bypass when scanning + jnz mscan2 + + call svga_modes # Detect SVGA cards & modes +mscan2: +#endif /* CONFIG_VIDEO_SVGA */ + +mtabe: + +#ifdef CONFIG_VIDEO_COMPACT + leaw modelist, %si + movw %di, %dx + movw %si, %di +cmt1: cmpw %dx, %si # Scan all modes + jz cmt2 + + leaw modelist, %bx # Find in previous entries + movw 2(%si), %cx +cmt3: cmpw %bx, %si + jz cmt4 + + cmpw 2(%bx), %cx # Found => don't copy this entry + jz cmt5 + + addw $4, %bx + jmp cmt3 + +cmt4: movsl # Copy entry + jmp cmt1 + +cmt5: addw $4, %si # Skip entry + jmp cmt1 + +cmt2: +#endif /* CONFIG_VIDEO_COMPACT */ + + movw $ASK_VGA, (%di) # End marker + movw %di, mt_end +mtab1: leaw modelist, %si # SI=mode list, DI=list end +ret0: ret + +# Modes usable on all standard VGAs +vga_modes: + .word VIDEO_8POINT + .word 0x5032 # 80x50 + .word VIDEO_80x43 + .word 0x502b # 80x43 + .word VIDEO_80x28 + .word 0x501c # 80x28 + .word VIDEO_80x30 + .word 0x501e # 80x30 + .word VIDEO_80x34 + .word 0x5022 # 80x34 + .word VIDEO_80x60 + .word 0x503c # 80x60 +#ifdef CONFIG_VIDEO_GFX_HACK + .word VIDEO_GFX_HACK + .word VIDEO_GFX_DUMMY_RESOLUTION +#endif + +vga_modes_end: +# Detect VESA modes. + +#ifdef CONFIG_VIDEO_VESA +vesa_modes: + cmpb $2, adapter # VGA only + jnz ret0 + + movw %di, %bp # BP=original mode table end + addw $0x200, %di # Buffer space + movw $0x4f00, %ax # VESA Get card info call + int $0x10 + movw %bp, %di + cmpw $0x004f, %ax # Successful? 
+ jnz ret0 + + cmpw $0x4556, 0x200(%di) + jnz ret0 + + cmpw $0x4153, 0x202(%di) + jnz ret0 + + movw $vesa_name, card_name # Set name to "VESA VGA" + pushw %gs + lgsw 0x20e(%di), %si # GS:SI=mode list + movw $128, %cx # Iteration limit +vesa1: +# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst. +# XXX: lodsw %gs:(%si), %ax # Get next mode in the list + gs; lodsw + cmpw $0xffff, %ax # End of the table? + jz vesar + + cmpw $0x0080, %ax # Check validity of mode ID + jc vesa2 + + orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff + jz vesan # Certain BIOSes report 0x80-0xff! + + cmpw $0x0800, %ax + jnc vesae + +vesa2: pushw %cx + movw %ax, %cx # Get mode information structure + movw $0x4f01, %ax + int $0x10 + movw %cx, %bx # BX=mode number + addb $VIDEO_FIRST_VESA>>8, %bh + popw %cx + cmpw $0x004f, %ax + jnz vesan # Don't report errors (buggy BIOSES) + + movb (%di), %al # Check capabilities. We require + andb $0x19, %al # a color text mode. + cmpb $0x09, %al + jnz vesan + + cmpw $0xb800, 8(%di) # Standard video memory address required + jnz vesan + + testb $2, (%di) # Mode characteristics supplied? + movw %bx, (%di) # Store mode number + jz vesa3 + + xorw %dx, %dx + movw 0x12(%di), %bx # Width + orb %bh, %bh + jnz vesan + + movb %bl, 0x3(%di) + movw 0x14(%di), %ax # Height + orb %ah, %ah + jnz vesan + + movb %al, 2(%di) + mulb %bl + cmpw $8193, %ax # Small enough for Linux console driver? + jnc vesan + + jmp vesaok + +vesa3: subw $0x8108, %bx # This mode has no detailed info specified, + jc vesan # so it must be a standard VESA mode. + + cmpw $5, %bx + jnc vesan + + movw vesa_text_mode_table(%bx), %ax + movw %ax, 2(%di) +vesaok: addw $4, %di # The mode is valid. Store it. +vesan: loop vesa1 # Next mode. Limit exceeded => error +vesae: leaw vesaer, %si + call prtstr + movw %bp, %di # Discard already found modes. 
+vesar: popw %gs + ret + +# Dimensions of standard VESA text modes +vesa_text_mode_table: + .byte 60, 80 # 0108 + .byte 25, 132 # 0109 + .byte 43, 132 # 010A + .byte 50, 132 # 010B + .byte 60, 132 # 010C +#endif /* CONFIG_VIDEO_VESA */ + +# Scan for video modes. A bit dirty, but should work. +mode_scan: + movw $0x0100, %cx # Start with mode 0 +scm1: movb $0, %ah # Test the mode + movb %cl, %al + int $0x10 + movb $0x0f, %ah + int $0x10 + cmpb %cl, %al + jnz scm2 # Mode not set + + movw $0x3c0, %dx # Test if it's a text mode + movb $0x10, %al # Mode bits + call inidx + andb $0x03, %al + jnz scm2 + + movb $0xce, %dl # Another set of mode bits + movb $0x06, %al + call inidx + shrb %al + jc scm2 + + movb $0xd4, %dl # Cursor location + movb $0x0f, %al + call inidx + orb %al, %al + jnz scm2 + + movw %cx, %ax # Ok, store the mode + stosw + movb %gs:(0x484), %al # Number of rows + incb %al + stosb + movw %gs:(0x44a), %ax # Number of columns + stosb +scm2: incb %cl + jns scm1 + + movw $0x0003, %ax # Return back to mode 3 + int $0x10 + ret + +tstidx: outw %ax, %dx # OUT DX,AX and inidx +inidx: outb %al, %dx # Read from indexed VGA register + incw %dx # AL=index, DX=index reg port -> AL=data + inb %dx, %al + decw %dx + ret + +# Try to detect type of SVGA card and supply (usually approximate) video +# mode table for it. + +#ifdef CONFIG_VIDEO_SVGA +svga_modes: + leaw svga_table, %si # Test all known SVGA adapters +dosvga: lodsw + movw %ax, %bp # Default mode table + orw %ax, %ax + jz didsv1 + + lodsw # Pointer to test routine + pushw %si + pushw %di + pushw %es + movw $0xc000, %bx + movw %bx, %es + call *%ax # Call test routine + popw %es + popw %di + popw %si + orw %bp, %bp + jz dosvga + + movw %bp, %si # Found, copy the modes + movb svga_prefix, %ah +cpsvga: lodsb + orb %al, %al + jz didsv + + stosw + movsw + jmp cpsvga + +didsv: movw %si, card_name # Store pointer to card name +didsv1: ret + +# Table of all known SVGA cards. 
For each card, we store a pointer to +# a table of video modes supported by the card and a pointer to a routine +# used for testing of presence of the card. The video mode table is always +# followed by the name of the card or the chipset. +svga_table: + .word ati_md, ati_test + .word oak_md, oak_test + .word paradise_md, paradise_test + .word realtek_md, realtek_test + .word s3_md, s3_test + .word chips_md, chips_test + .word video7_md, video7_test + .word cirrus5_md, cirrus5_test + .word cirrus6_md, cirrus6_test + .word cirrus1_md, cirrus1_test + .word ahead_md, ahead_test + .word everex_md, everex_test + .word genoa_md, genoa_test + .word trident_md, trident_test + .word tseng_md, tseng_test + .word 0 + +# Test routines and mode tables: + +# S3 - The test algorithm was taken from the SuperProbe package +# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org +s3_test: + movw $0x0f35, %cx # we store some constants in cl/ch + movw $0x03d4, %dx + movb $0x38, %al + call inidx + movb %al, %bh # store current CRT-register 0x38 + movw $0x0038, %ax + call outidx # disable writing to special regs + movb %cl, %al # check whether we can write special reg 0x35 + call inidx + movb %al, %bl # save the current value of CRT reg 0x35 + andb $0xf0, %al # clear bits 0-3 + movb %al, %ah + movb %cl, %al # and write it to CRT reg 0x35 + call outidx + call inidx # now read it back + andb %ch, %al # clear the upper 4 bits + jz s3_2 # the first test failed. But we have a + + movb %bl, %ah # second chance + movb %cl, %al + call outidx + jmp s3_1 # do the other tests + +s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35 + orb %bl, %ah # set the upper 4 bits of ah with the orig value + call outidx # write ... + call inidx # ... 
and reread + andb %cl, %al # turn off the upper 4 bits + pushw %ax + movb %bl, %ah # restore old value in register 0x35 + movb %cl, %al + call outidx + popw %ax + cmpb %ch, %al # setting lower 4 bits was successful => bad + je no_s3 # writing is allowed => this is not an S3 + +s3_1: movw $0x4838, %ax # allow writing to special regs by putting + call outidx # magic number into CRT-register 0x38 + movb %cl, %al # check whether we can write special reg 0x35 + call inidx + movb %al, %bl + andb $0xf0, %al + movb %al, %ah + movb %cl, %al + call outidx + call inidx + andb %ch, %al + jnz no_s3 # no, we can't write => no S3 + + movw %cx, %ax + orb %bl, %ah + call outidx + call inidx + andb %ch, %al + pushw %ax + movb %bl, %ah # restore old value in register 0x35 + movb %cl, %al + call outidx + popw %ax + cmpb %ch, %al + jne no_s31 # writing not possible => no S3 + movb $0x30, %al + call inidx # now get the S3 id ... + leaw idS3, %di + movw $0x10, %cx + repne + scasb + je no_s31 + + movb %bh, %ah + movb $0x38, %al + jmp s3rest + +no_s3: movb $0x35, %al # restore CRT register 0x35 + movb %bl, %ah + call outidx +no_s31: xorw %bp, %bp # Detection failed +s3rest: movb %bh, %ah + movb $0x38, %al # restore old value of CRT register 0x38 + jmp outidx + +idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95 + .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0 + +s3_md: .byte 0x54, 0x2b, 0x84 + .byte 0x55, 0x19, 0x84 + .byte 0 + .ascii "S3" + .byte 0 + +# ATI cards. 
+ati_test: + leaw idati, %si + movw $0x31, %di + movw $0x09, %cx + repe + cmpsb + je atiok + + xorw %bp, %bp +atiok: ret + +idati: .ascii "761295520" + +ati_md: .byte 0x23, 0x19, 0x84 + .byte 0x33, 0x2c, 0x84 + .byte 0x22, 0x1e, 0x64 + .byte 0x21, 0x19, 0x64 + .byte 0x58, 0x21, 0x50 + .byte 0x5b, 0x1e, 0x50 + .byte 0 + .ascii "ATI" + .byte 0 + +# AHEAD +ahead_test: + movw $0x200f, %ax + movw $0x3ce, %dx + outw %ax, %dx + incw %dx + inb %dx, %al + cmpb $0x20, %al + je isahed + + cmpb $0x21, %al + je isahed + + xorw %bp, %bp +isahed: ret + +ahead_md: + .byte 0x22, 0x2c, 0x84 + .byte 0x23, 0x19, 0x84 + .byte 0x24, 0x1c, 0x84 + .byte 0x2f, 0x32, 0xa0 + .byte 0x32, 0x22, 0x50 + .byte 0x34, 0x42, 0x50 + .byte 0 + .ascii "Ahead" + .byte 0 + +# Chips & Tech. +chips_test: + movw $0x3c3, %dx + inb %dx, %al + orb $0x10, %al + outb %al, %dx + movw $0x104, %dx + inb %dx, %al + movb %al, %bl + movw $0x3c3, %dx + inb %dx, %al + andb $0xef, %al + outb %al, %dx + cmpb $0xa5, %bl + je cantok + + xorw %bp, %bp +cantok: ret + +chips_md: + .byte 0x60, 0x19, 0x84 + .byte 0x61, 0x32, 0x84 + .byte 0 + .ascii "Chips & Technologies" + .byte 0 + +# Cirrus Logic 5X0 +cirrus1_test: + movw $0x3d4, %dx + movb $0x0c, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bl + xorb %al, %al + outb %al, %dx + decw %dx + movb $0x1f, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bh + xorb %ah, %ah + shlb $4, %al + movw %ax, %cx + movb %bh, %al + shrb $4, %al + addw %ax, %cx + shlw $8, %cx + addw $6, %cx + movw %cx, %ax + movw $0x3c4, %dx + outw %ax, %dx + incw %dx + inb %dx, %al + andb %al, %al + jnz nocirr + + movb %bh, %al + outb %al, %dx + inb %dx, %al + cmpb $0x01, %al + je iscirr + +nocirr: xorw %bp, %bp +iscirr: movw $0x3d4, %dx + movb %bl, %al + xorb %ah, %ah + shlw $8, %ax + addw $0x0c, %ax + outw %ax, %dx + ret + +cirrus1_md: + .byte 0x1f, 0x19, 0x84 + .byte 0x20, 0x2c, 0x84 + .byte 0x22, 0x1e, 0x84 + .byte 0x31, 0x25, 0x64 + .byte 0 + .ascii "Cirrus Logic 5X0" + .byte 0 + 
+# Cirrus Logic 54XX +cirrus5_test: + movw $0x3c4, %dx + movb $6, %al + call inidx + movb %al, %bl # BL=backup + movw $6, %ax + call tstidx + cmpb $0x0f, %al + jne c5fail + + movw $0x1206, %ax + call tstidx + cmpb $0x12, %al + jne c5fail + + movb $0x1e, %al + call inidx + movb %al, %bh + movb %bh, %ah + andb $0xc0, %ah + movb $0x1e, %al + call tstidx + andb $0x3f, %al + jne c5xx + + movb $0x1e, %al + movb %bh, %ah + orb $0x3f, %ah + call tstidx + xorb $0x3f, %al + andb $0x3f, %al +c5xx: pushf + movb $0x1e, %al + movb %bh, %ah + outw %ax, %dx + popf + je c5done + +c5fail: xorw %bp, %bp +c5done: movb $6, %al + movb %bl, %ah + outw %ax, %dx + ret + +cirrus5_md: + .byte 0x14, 0x19, 0x84 + .byte 0x54, 0x2b, 0x84 + .byte 0 + .ascii "Cirrus Logic 54XX" + .byte 0 + +# Cirrus Logic 64XX -- no known extra modes, but must be identified, because +# it's misidentified by the Ahead test. +cirrus6_test: + movw $0x3ce, %dx + movb $0x0a, %al + call inidx + movb %al, %bl # BL=backup + movw $0xce0a, %ax + call tstidx + orb %al, %al + jne c2fail + + movw $0xec0a, %ax + call tstidx + cmpb $0x01, %al + jne c2fail + + movb $0xaa, %al + call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's. 
+ shrb $4, %al + subb $4, %al + jz c6done + + decb %al + jz c6done + + subb $2, %al + jz c6done + + decb %al + jz c6done + +c2fail: xorw %bp, %bp +c6done: movb $0x0a, %al + movb %bl, %ah + outw %ax, %dx + ret + +cirrus6_md: + .byte 0 + .ascii "Cirrus Logic 64XX" + .byte 0 + +# Everex / Trident +everex_test: + movw $0x7000, %ax + xorw %bx, %bx + int $0x10 + cmpb $0x70, %al + jne noevrx + + shrw $4, %dx + cmpw $0x678, %dx + je evtrid + + cmpw $0x236, %dx + jne evrxok + +evtrid: leaw trident_md, %bp +evrxok: ret + +noevrx: xorw %bp, %bp + ret + +everex_md: + .byte 0x03, 0x22, 0x50 + .byte 0x04, 0x3c, 0x50 + .byte 0x07, 0x2b, 0x64 + .byte 0x08, 0x4b, 0x64 + .byte 0x0a, 0x19, 0x84 + .byte 0x0b, 0x2c, 0x84 + .byte 0x16, 0x1e, 0x50 + .byte 0x18, 0x1b, 0x64 + .byte 0x21, 0x40, 0xa0 + .byte 0x40, 0x1e, 0x84 + .byte 0 + .ascii "Everex/Trident" + .byte 0 + +# Genoa. +genoa_test: + leaw idgenoa, %si # Check Genoa 'clues' + xorw %ax, %ax + movb %es:(0x37), %al + movw %ax, %di + movw $0x04, %cx + decw %si + decw %di +l1: incw %si + incw %di + movb (%si), %al + testb %al, %al + jz l2 + + cmpb %es:(%di), %al +l2: loope l1 + orw %cx, %cx + je isgen + + xorw %bp, %bp +isgen: ret + +idgenoa: .byte 0x77, 0x00, 0x99, 0x66 + +genoa_md: + .byte 0x58, 0x20, 0x50 + .byte 0x5a, 0x2a, 0x64 + .byte 0x60, 0x19, 0x84 + .byte 0x61, 0x1d, 0x84 + .byte 0x62, 0x20, 0x84 + .byte 0x63, 0x2c, 0x84 + .byte 0x64, 0x3c, 0x84 + .byte 0x6b, 0x4f, 0x64 + .byte 0x72, 0x3c, 0x50 + .byte 0x74, 0x42, 0x50 + .byte 0x78, 0x4b, 0x64 + .byte 0 + .ascii "Genoa" + .byte 0 + +# OAK +oak_test: + leaw idoakvga, %si + movw $0x08, %di + movw $0x08, %cx + repe + cmpsb + je isoak + + xorw %bp, %bp +isoak: ret + +idoakvga: .ascii "OAK VGA " + +oak_md: .byte 0x4e, 0x3c, 0x50 + .byte 0x4f, 0x3c, 0x84 + .byte 0x50, 0x19, 0x84 + .byte 0x51, 0x2b, 0x84 + .byte 0 + .ascii "OAK" + .byte 0 + +# WD Paradise. 
+paradise_test: + leaw idparadise, %si + movw $0x7d, %di + movw $0x04, %cx + repe + cmpsb + je ispara + + xorw %bp, %bp +ispara: ret + +idparadise: .ascii "VGA=" + +paradise_md: + .byte 0x41, 0x22, 0x50 + .byte 0x47, 0x1c, 0x84 + .byte 0x55, 0x19, 0x84 + .byte 0x54, 0x2c, 0x84 + .byte 0 + .ascii "Paradise" + .byte 0 + +# Trident. +trident_test: + movw $0x3c4, %dx + movb $0x0e, %al + outb %al, %dx + incw %dx + inb %dx, %al + xchgb %al, %ah + xorb %al, %al + outb %al, %dx + inb %dx, %al + xchgb %ah, %al + movb %al, %bl # Strange thing ... in the book this wasn't + andb $0x02, %bl # necessary but it worked on my card which + jz setb2 # is a trident. Without it the screen goes + # blurred ... + andb $0xfd, %al + jmp clrb2 + +setb2: orb $0x02, %al +clrb2: outb %al, %dx + andb $0x0f, %ah + cmpb $0x02, %ah + je istrid + + xorw %bp, %bp +istrid: ret + +trident_md: + .byte 0x50, 0x1e, 0x50 + .byte 0x51, 0x2b, 0x50 + .byte 0x52, 0x3c, 0x50 + .byte 0x57, 0x19, 0x84 + .byte 0x58, 0x1e, 0x84 + .byte 0x59, 0x2b, 0x84 + .byte 0x5a, 0x3c, 0x84 + .byte 0 + .ascii "Trident" + .byte 0 + +# Tseng. +tseng_test: + movw $0x3cd, %dx + inb %dx, %al # Could things be this simple ! :-) + movb %al, %bl + movb $0x55, %al + outb %al, %dx + inb %dx, %al + movb %al, %ah + movb %bl, %al + outb %al, %dx + cmpb $0x55, %ah + je istsen + +isnot: xorw %bp, %bp +istsen: ret + +tseng_md: + .byte 0x26, 0x3c, 0x50 + .byte 0x2a, 0x28, 0x64 + .byte 0x23, 0x19, 0x84 + .byte 0x24, 0x1c, 0x84 + .byte 0x22, 0x2c, 0x84 + .byte 0x21, 0x3c, 0x84 + .byte 0 + .ascii "Tseng" + .byte 0 + +# Video7. 
+video7_test: + movw $0x3cc, %dx + inb %dx, %al + movw $0x3b4, %dx + andb $0x01, %al + jz even7 + + movw $0x3d4, %dx +even7: movb $0x0c, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bl + movb $0x55, %al + outb %al, %dx + inb %dx, %al + decw %dx + movb $0x1f, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bh + decw %dx + movb $0x0c, %al + outb %al, %dx + incw %dx + movb %bl, %al + outb %al, %dx + movb $0x55, %al + xorb $0xea, %al + cmpb %bh, %al + jne isnot + + movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching + ret + +video7_md: + .byte 0x40, 0x2b, 0x50 + .byte 0x43, 0x3c, 0x50 + .byte 0x44, 0x3c, 0x64 + .byte 0x41, 0x19, 0x84 + .byte 0x42, 0x2c, 0x84 + .byte 0x45, 0x1c, 0x84 + .byte 0 + .ascii "Video 7" + .byte 0 + +# Realtek VGA +realtek_test: + leaw idrtvga, %si + movw $0x45, %di + movw $0x0b, %cx + repe + cmpsb + je isrt + + xorw %bp, %bp +isrt: ret + +idrtvga: .ascii "REALTEK VGA" + +realtek_md: + .byte 0x1a, 0x3c, 0x50 + .byte 0x1b, 0x19, 0x84 + .byte 0x1c, 0x1e, 0x84 + .byte 0x1d, 0x2b, 0x84 + .byte 0x1e, 0x3c, 0x84 + .byte 0 + .ascii "REALTEK" + .byte 0 + +#endif /* CONFIG_VIDEO_SVGA */ + +# User-defined local mode table (VGA only) +#ifdef CONFIG_VIDEO_LOCAL +local_modes: + leaw local_mode_table, %si +locm1: lodsw + orw %ax, %ax + jz locm2 + + stosw + movsw + jmp locm1 + +locm2: ret + +# This is the table of local video modes which can be supplied manually +# by the user. Each entry consists of mode ID (word) and dimensions +# (byte for column count and another byte for row count). These modes +# are placed before all SVGA and VESA modes and override them if table +# compacting is enabled. The table must end with a zero word followed +# by NUL-terminated video adapter name. 
+local_mode_table: + .word 0x0100 # Example: 40x25 + .byte 25,40 + .word 0 + .ascii "Local" + .byte 0 +#endif /* CONFIG_VIDEO_LOCAL */ + +# Read a key and return the ASCII code in al, scan code in ah +getkey: xorb %ah, %ah + int $0x16 + ret + +# Read a key with a timeout of 30 seconds. +# The hardware clock is used to get the time. +getkt: call gettime + addb $30, %al # Wait 30 seconds + cmpb $60, %al + jl lminute + + subb $60, %al +lminute: + movb %al, %cl +again: movb $0x01, %ah + int $0x16 + jnz getkey # key pressed, so get it + + call gettime + cmpb %cl, %al + jne again + + movb $0x20, %al # timeout, return `space' + ret + +# Flush the keyboard buffer +flush: movb $0x01, %ah + int $0x16 + jz empty + + xorb %ah, %ah + int $0x16 + jmp flush + +empty: ret + +# Print hexadecimal number. +prthw: pushw %ax + movb %ah, %al + call prthb + popw %ax +prthb: pushw %ax + shrb $4, %al + call prthn + popw %ax + andb $0x0f, %al +prthn: cmpb $0x0a, %al + jc prth1 + + addb $0x07, %al +prth1: addb $0x30, %al + jmp prtchr + +# Print decimal number in al +prtdec: pushw %ax + pushw %cx + xorb %ah, %ah + movb $0x0a, %cl + idivb %cl + cmpb $0x09, %al + jbe lt100 + + call prtdec + jmp skip10 + +lt100: addb $0x30, %al + call prtchr +skip10: movb %ah, %al + addb $0x30, %al + call prtchr + popw %cx + popw %ax + ret + +store_edid: + pushw %es # just save all registers + pushw %ax + pushw %bx + pushw %cx + pushw %dx + pushw %di + + pushw %fs + popw %es + + movl $0x13131313, %eax # memset block with 0x13 + movw $32, %cx + movw $0x140, %di + cld + rep + stosl + + movw $0x4f15, %ax # do VBE/DDC + movw $0x01, %bx + movw $0x00, %cx + movw $0x01, %dx + movw $0x140, %di + int $0x10 + + popw %di # restore all registers + popw %dx + popw %cx + popw %bx + popw %ax + popw %es + ret + +# VIDEO_SELECT-only variables +mt_end: .word 0 # End of video mode table if built +edit_buf: .space 6 # Line editor buffer +card_name: .word 0 # Pointer to adapter name +scanning: .byte 0 # Performing mode scan 
+do_restore: .byte 0 # Screen contents altered during mode change +svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes +graphic_mode: .byte 0 # Graphic mode with a linear frame buffer +dac_size: .byte 6 # DAC bit depth + +# Status messages +keymsg: .ascii "Press <RETURN> to see video modes available, " + .ascii "<SPACE> to continue or wait 30 secs" + .byte 0x0d, 0x0a, 0 + +listhdr: .byte 0x0d, 0x0a + .ascii "Mode: COLSxROWS:" + +crlft: .byte 0x0d, 0x0a, 0 + +prompt: .byte 0x0d, 0x0a + .asciz "Enter mode number or `scan': " + +unknt: .asciz "Unknown mode ID. Try again." + +badmdt: .ascii "You passed an undefined mode number." + .byte 0x0d, 0x0a, 0 + +vesaer: .ascii "Error: Scanning of VESA modes failed. Please " + .ascii "report to <mj@ucw.cz>." + .byte 0x0d, 0x0a, 0 + +old_name: .asciz "CGA/MDA/HGA" + +ega_name: .asciz "EGA" + +svga_name: .ascii " " + +vga_name: .asciz "VGA" + +vesa_name: .asciz "VESA" + +name_bann: .asciz "Video adapter: " +#endif /* CONFIG_VIDEO_SELECT */ + +# Other variables: +adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA +video_segment: .word 0xb800 # Video memory segment +force_size: .word 0 # Use this size instead of the one in BIOS vars diff --git a/arch/i386/crypto/Makefile b/arch/i386/crypto/Makefile new file mode 100644 index 000000000000..103c353d0a63 --- /dev/null +++ b/arch/i386/crypto/Makefile @@ -0,0 +1,9 @@ +# +# i386/crypto/Makefile +# +# Arch-specific CryptoAPI modules. +# + +obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o + +aes-i586-y := aes-i586-asm.o aes.o diff --git a/arch/i386/crypto/aes-i586-asm.S b/arch/i386/crypto/aes-i586-asm.S new file mode 100644 index 000000000000..7b73c67cb4e8 --- /dev/null +++ b/arch/i386/crypto/aes-i586-asm.S @@ -0,0 +1,376 @@ +// ------------------------------------------------------------------------- +// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. +// All rights reserved. 
+// +// LICENSE TERMS +// +// The free distribution and use of this software in both source and binary +// form is allowed (with or without changes) provided that: +// +// 1. distributions of this source code include the above copyright +// notice, this list of conditions and the following disclaimer// +// +// 2. distributions in binary form include the above copyright +// notice, this list of conditions and the following disclaimer +// in the documentation and/or other associated materials// +// +// 3. the copyright holder's name is not used to endorse products +// built using this software without specific written permission. +// +// +// ALTERNATIVELY, provided that this notice is retained in full, this product +// may be distributed under the terms of the GNU General Public License (GPL), +// in which case the provisions of the GPL apply INSTEAD OF those given above. +// +// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org> +// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> + +// DISCLAIMER +// +// This software is provided 'as is' with no explicit or implied warranties +// in respect of its properties including, but not limited to, correctness +// and fitness for purpose. 
+// ------------------------------------------------------------------------- +// Issue Date: 29/07/2002 + +.file "aes-i586-asm.S" +.text + +// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])// +// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])// + +#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) + +// offsets to parameters with one register pushed onto stack + +#define in_blk 8 // input byte array address parameter +#define out_blk 12 // output byte array address parameter +#define ctx 16 // AES context structure + +// offsets in context structure + +#define ekey 0 // encryption key schedule base address +#define nrnd 256 // number of rounds +#define dkey 260 // decryption key schedule base address + +// register mapping for encrypt and decrypt subroutines + +#define r0 eax +#define r1 ebx +#define r2 ecx +#define r3 edx +#define r4 esi +#define r5 edi + +#define eaxl al +#define eaxh ah +#define ebxl bl +#define ebxh bh +#define ecxl cl +#define ecxh ch +#define edxl dl +#define edxh dh + +#define _h(reg) reg##h +#define h(reg) _h(reg) + +#define _l(reg) reg##l +#define l(reg) _l(reg) + +// This macro takes a 32-bit word representing a column and uses +// each of its four bytes to index into four tables of 256 32-bit +// words to obtain values that are then xored into the appropriate +// output registers r0, r1, r4 or r5. 
+ +// Parameters: +// table table base address +// %1 out_state[0] +// %2 out_state[1] +// %3 out_state[2] +// %4 out_state[3] +// idx input register for the round (destroyed) +// tmp scratch register for the round +// sched key schedule + +#define do_col(table, a1,a2,a3,a4, idx, tmp) \ + movzx %l(idx),%tmp; \ + xor table(,%tmp,4),%a1; \ + movzx %h(idx),%tmp; \ + shr $16,%idx; \ + xor table+tlen(,%tmp,4),%a2; \ + movzx %l(idx),%tmp; \ + movzx %h(idx),%idx; \ + xor table+2*tlen(,%tmp,4),%a3; \ + xor table+3*tlen(,%idx,4),%a4; + +// initialise output registers from the key schedule +// NB1: original value of a3 is in idx on exit +// NB2: original values of a1,a2,a4 aren't used +#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \ + mov 0 sched,%a1; \ + movzx %l(idx),%tmp; \ + mov 12 sched,%a2; \ + xor table(,%tmp,4),%a1; \ + mov 4 sched,%a4; \ + movzx %h(idx),%tmp; \ + shr $16,%idx; \ + xor table+tlen(,%tmp,4),%a2; \ + movzx %l(idx),%tmp; \ + movzx %h(idx),%idx; \ + xor table+3*tlen(,%idx,4),%a4; \ + mov %a3,%idx; \ + mov 8 sched,%a3; \ + xor table+2*tlen(,%tmp,4),%a3; + +// initialise output registers from the key schedule +// NB1: original value of a3 is in idx on exit +// NB2: original values of a1,a2,a4 aren't used +#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \ + mov 0 sched,%a1; \ + movzx %l(idx),%tmp; \ + mov 4 sched,%a2; \ + xor table(,%tmp,4),%a1; \ + mov 12 sched,%a4; \ + movzx %h(idx),%tmp; \ + shr $16,%idx; \ + xor table+tlen(,%tmp,4),%a2; \ + movzx %l(idx),%tmp; \ + movzx %h(idx),%idx; \ + xor table+3*tlen(,%idx,4),%a4; \ + mov %a3,%idx; \ + mov 8 sched,%a3; \ + xor table+2*tlen(,%tmp,4),%a3; + + +// original Gladman had conditional saves to MMX regs. +#define save(a1, a2) \ + mov %a2,4*a1(%esp) + +#define restore(a1, a2) \ + mov 4*a2(%esp),%a1 + +// These macros perform a forward encryption cycle. 
They are entered with +// the first previous round column values in r0,r1,r4,r5 and +// exit with the final values in the same registers, using stack +// for temporary storage. + +// round column values +// on entry: r0,r1,r4,r5 +// on exit: r2,r1,r4,r5 +#define fwd_rnd1(arg, table) \ + save (0,r1); \ + save (1,r5); \ + \ + /* compute new column values */ \ + do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \ + do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \ + restore(r0,0); \ + do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \ + restore(r0,1); \ + do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */ + +// round column values +// on entry: r2,r1,r4,r5 +// on exit: r0,r1,r4,r5 +#define fwd_rnd2(arg, table) \ + save (0,r1); \ + save (1,r5); \ + \ + /* compute new column values */ \ + do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \ + do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \ + restore(r2,0); \ + do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \ + restore(r2,1); \ + do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */ + +// These macros performs an inverse encryption cycle. 
They are entered with +// the first previous round column values in r0,r1,r4,r5 and +// exit with the final values in the same registers, using stack +// for temporary storage + +// round column values +// on entry: r0,r1,r4,r5 +// on exit: r2,r1,r4,r5 +#define inv_rnd1(arg, table) \ + save (0,r1); \ + save (1,r5); \ + \ + /* compute new column values */ \ + do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \ + do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \ + restore(r0,0); \ + do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \ + restore(r0,1); \ + do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */ + +// round column values +// on entry: r2,r1,r4,r5 +// on exit: r0,r1,r4,r5 +#define inv_rnd2(arg, table) \ + save (0,r1); \ + save (1,r5); \ + \ + /* compute new column values */ \ + do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \ + do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \ + restore(r2,0); \ + do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \ + restore(r2,1); \ + do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */ + +// AES (Rijndael) Encryption Subroutine + +.global aes_enc_blk + +.extern ft_tab +.extern fl_tab + +.align 4 + +aes_enc_blk: + push %ebp + mov ctx(%esp),%ebp // pointer to context + +// CAUTION: the order and the values used in these assigns +// rely on the register mappings + +1: push %ebx + mov in_blk+4(%esp),%r2 + push %esi + mov nrnd(%ebp),%r3 // number of rounds + push %edi +#if ekey != 0 + lea ekey(%ebp),%ebp // key pointer +#endif + +// input four columns and xor in first round key + + mov (%r2),%r0 + mov 4(%r2),%r1 + mov 8(%r2),%r4 + mov 12(%r2),%r5 + xor (%ebp),%r0 + xor 4(%ebp),%r1 + xor 8(%ebp),%r4 + xor 12(%ebp),%r5 + + sub $8,%esp // space for register saves on stack + add $16,%ebp // increment to next round key + sub $10,%r3 + je 4f // 10 rounds for 128-bit key + add $32,%ebp + sub $2,%r3 + je 3f // 12 rounds for 128-bit key + add $32,%ebp + +2: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 128-bit key + fwd_rnd2( 
-48(%ebp) ,ft_tab) +3: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 128-bit key + fwd_rnd2( -16(%ebp) ,ft_tab) +4: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key + fwd_rnd2( +16(%ebp) ,ft_tab) + fwd_rnd1( +32(%ebp) ,ft_tab) + fwd_rnd2( +48(%ebp) ,ft_tab) + fwd_rnd1( +64(%ebp) ,ft_tab) + fwd_rnd2( +80(%ebp) ,ft_tab) + fwd_rnd1( +96(%ebp) ,ft_tab) + fwd_rnd2(+112(%ebp) ,ft_tab) + fwd_rnd1(+128(%ebp) ,ft_tab) + fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table + +// move final values to the output array. CAUTION: the +// order of these assigns rely on the register mappings + + add $8,%esp + mov out_blk+12(%esp),%ebp + mov %r5,12(%ebp) + pop %edi + mov %r4,8(%ebp) + pop %esi + mov %r1,4(%ebp) + pop %ebx + mov %r0,(%ebp) + pop %ebp + mov $1,%eax + ret + +// AES (Rijndael) Decryption Subroutine + +.global aes_dec_blk + +.extern it_tab +.extern il_tab + +.align 4 + +aes_dec_blk: + push %ebp + mov ctx(%esp),%ebp // pointer to context + +// CAUTION: the order and the values used in these assigns +// rely on the register mappings + +1: push %ebx + mov in_blk+4(%esp),%r2 + push %esi + mov nrnd(%ebp),%r3 // number of rounds + push %edi +#if dkey != 0 + lea dkey(%ebp),%ebp // key pointer +#endif + mov %r3,%r0 + shl $4,%r0 + add %r0,%ebp + +// input four columns and xor in first round key + + mov (%r2),%r0 + mov 4(%r2),%r1 + mov 8(%r2),%r4 + mov 12(%r2),%r5 + xor (%ebp),%r0 + xor 4(%ebp),%r1 + xor 8(%ebp),%r4 + xor 12(%ebp),%r5 + + sub $8,%esp // space for register saves on stack + sub $16,%ebp // increment to next round key + sub $10,%r3 + je 4f // 10 rounds for 128-bit key + sub $32,%ebp + sub $2,%r3 + je 3f // 12 rounds for 128-bit key + sub $32,%ebp + +2: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 128-bit key + inv_rnd2( +48(%ebp), it_tab) +3: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 128-bit key + inv_rnd2( +16(%ebp), it_tab) +4: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key + inv_rnd2( -16(%ebp), it_tab) + inv_rnd1( -32(%ebp), 
it_tab) + inv_rnd2( -48(%ebp), it_tab) + inv_rnd1( -64(%ebp), it_tab) + inv_rnd2( -80(%ebp), it_tab) + inv_rnd1( -96(%ebp), it_tab) + inv_rnd2(-112(%ebp), it_tab) + inv_rnd1(-128(%ebp), it_tab) + inv_rnd2(-144(%ebp), il_tab) // last round uses a different table + +// move final values to the output array. CAUTION: the +// order of these assigns rely on the register mappings + + add $8,%esp + mov out_blk+12(%esp),%ebp + mov %r5,12(%ebp) + pop %edi + mov %r4,8(%ebp) + pop %esi + mov %r1,4(%ebp) + pop %ebx + mov %r0,(%ebp) + pop %ebp + mov $1,%eax + ret + diff --git a/arch/i386/crypto/aes.c b/arch/i386/crypto/aes.c new file mode 100644 index 000000000000..1019430fc1f1 --- /dev/null +++ b/arch/i386/crypto/aes.c @@ -0,0 +1,520 @@ +/* + * + * Glue Code for optimized 586 assembler version of AES + * + * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK. + * All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software in both source and binary + * form is allowed (with or without changes) provided that: + * + * 1. distributions of this source code include the above copyright + * notice, this list of conditions and the following disclaimer; + * + * 2. distributions in binary form include the above copyright + * notice, this list of conditions and the following disclaimer + * in the documentation and/or other associated materials; + * + * 3. the copyright holder's name is not used to endorse products + * built using this software without specific written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this product + * may be distributed under the terms of the GNU General Public License (GPL), + * in which case the provisions of the GPL apply INSTEAD OF those given above. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. 
+ * + * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to + * 2.5 API). + * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org> + * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> + * + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/crypto.h> +#include <linux/linkage.h> + +/* single-block encrypt/decrypt entry points, declared here and defined outside this C file; argument order is (src, dst, ctx) */ +asmlinkage void aes_enc_blk(const u8 *src, u8 *dst, void *ctx); +asmlinkage void aes_dec_blk(const u8 *src, u8 *dst, void *ctx); + +#define AES_MIN_KEY_SIZE 16 +#define AES_MAX_KEY_SIZE 32 +#define AES_BLOCK_SIZE 16 +/* number of u32 entries in each key-schedule array; parenthesised so the macro expands safely inside larger expressions */ +#define AES_KS_LENGTH (4 * AES_BLOCK_SIZE) +#define RC_LENGTH 29 + +/* per-key context: encryption schedule, round count, decryption schedule. NOTE(review): presumably the assembler routines depend on this exact field order and size - confirm before changing */ +struct aes_ctx { + u32 ekey[AES_KS_LENGTH]; + u32 rounds; + u32 dkey[AES_KS_LENGTH]; +}; + +/* polynomial used as the modulus when the GF(2^8) tables are generated */ +#define WPOLY 0x011b +/* load a little-endian 32-bit word from a byte pointer */ +#define u32_in(x) le32_to_cpu(*(const u32 *)(x)) +/* pack four bytes into a u32, b0 in the least-significant byte */ +#define bytes2word(b0, b1, b2, b3) \ + (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0)) + +/* define the finite field multiplies required for Rijndael */ +/* these expand to lookups in arrays named pow[] and log[], which must be in scope at the point of use */ +#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) +#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) +#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) +#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) +#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) +#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) +#define fi(x) ((x) ? 
pow[255 - log[x]]: 0) + +static inline u32 upr(u32 x, int n) +{ + return (x << 8 * n) | (x >> (32 - 8 * n)); +} + +static inline u8 bval(u32 x, int n) +{ + return x >> 8 * n; +} + +/* The forward and inverse affine transformations used in the S-box */ +#define fwd_affine(x) \ + (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8))) + +#define inv_affine(x) \ + (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8))) + +static u32 rcon_tab[RC_LENGTH]; + +u32 ft_tab[4][256]; +u32 fl_tab[4][256]; +static u32 ls_tab[4][256]; +static u32 im_tab[4][256]; +u32 il_tab[4][256]; +u32 it_tab[4][256]; + +static void gen_tabs(void) +{ + u32 i, w; + u8 pow[512], log[256]; + + /* + * log and power tables for GF(2^8) finite field with + * WPOLY as modular polynomial - the simplest primitive + * root is 0x03, used here to generate the tables. + */ + i = 0; w = 1; + + do { + pow[i] = (u8)w; + pow[i + 255] = (u8)w; + log[w] = (u8)i++; + w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0); + } while (w != 1); + + for(i = 0, w = 1; i < RC_LENGTH; ++i) { + rcon_tab[i] = bytes2word(w, 0, 0, 0); + w = f2(w); + } + + for(i = 0; i < 256; ++i) { + u8 b; + + b = fwd_affine(fi((u8)i)); + w = bytes2word(f2(b), b, b, f3(b)); + + /* tables for a normal encryption round */ + ft_tab[0][i] = w; + ft_tab[1][i] = upr(w, 1); + ft_tab[2][i] = upr(w, 2); + ft_tab[3][i] = upr(w, 3); + w = bytes2word(b, 0, 0, 0); + + /* + * tables for last encryption round + * (may also be used in the key schedule) + */ + fl_tab[0][i] = w; + fl_tab[1][i] = upr(w, 1); + fl_tab[2][i] = upr(w, 2); + fl_tab[3][i] = upr(w, 3); + + /* + * table for key schedule if fl_tab above is + * not of the required form + */ + ls_tab[0][i] = w; + ls_tab[1][i] = upr(w, 1); + ls_tab[2][i] = upr(w, 2); + ls_tab[3][i] = upr(w, 3); + + b = fi(inv_affine((u8)i)); + w = bytes2word(fe(b), f9(b), fd(b), fb(b)); + + /* tables for the inverse mix column operation */ + im_tab[0][b] = w; + im_tab[1][b] = upr(w, 1); + im_tab[2][b] = upr(w, 2); + 
im_tab[3][b] = upr(w, 3); + + /* tables for a normal decryption round */ + it_tab[0][i] = w; + it_tab[1][i] = upr(w,1); + it_tab[2][i] = upr(w,2); + it_tab[3][i] = upr(w,3); + + w = bytes2word(b, 0, 0, 0); + + /* tables for last decryption round */ + il_tab[0][i] = w; + il_tab[1][i] = upr(w,1); + il_tab[2][i] = upr(w,2); + il_tab[3][i] = upr(w,3); + } +} + +#define four_tables(x,tab,vf,rf,c) \ +( tab[0][bval(vf(x,0,c),rf(0,c))] ^ \ + tab[1][bval(vf(x,1,c),rf(1,c))] ^ \ + tab[2][bval(vf(x,2,c),rf(2,c))] ^ \ + tab[3][bval(vf(x,3,c),rf(3,c))] \ +) + +#define vf1(x,r,c) (x) +#define rf1(r,c) (r) +#define rf2(r,c) ((r-c)&3) + +#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0) +#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c) + +#define ff(x) inv_mcol(x) + +#define ke4(k,i) \ +{ \ + k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ + k[4*(i)+5] = ss[1] ^= ss[0]; \ + k[4*(i)+6] = ss[2] ^= ss[1]; \ + k[4*(i)+7] = ss[3] ^= ss[2]; \ +} + +#define kel4(k,i) \ +{ \ + k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i]; \ + k[4*(i)+5] = ss[1] ^= ss[0]; \ + k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ +} + +#define ke6(k,i) \ +{ \ + k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 7] = ss[1] ^= ss[0]; \ + k[6*(i)+ 8] = ss[2] ^= ss[1]; \ + k[6*(i)+ 9] = ss[3] ^= ss[2]; \ + k[6*(i)+10] = ss[4] ^= ss[3]; \ + k[6*(i)+11] = ss[5] ^= ss[4]; \ +} + +#define kel6(k,i) \ +{ \ + k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 7] = ss[1] ^= ss[0]; \ + k[6*(i)+ 8] = ss[2] ^= ss[1]; \ + k[6*(i)+ 9] = ss[3] ^= ss[2]; \ +} + +#define ke8(k,i) \ +{ \ + k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ + k[8*(i)+ 9] = ss[1] ^= ss[0]; \ + k[8*(i)+10] = ss[2] ^= ss[1]; \ + k[8*(i)+11] = ss[3] ^= ss[2]; \ + k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); \ + k[8*(i)+13] = ss[5] ^= ss[4]; \ + k[8*(i)+14] = ss[6] ^= ss[5]; \ + k[8*(i)+15] = ss[7] ^= ss[6]; \ +} + +#define kel8(k,i) \ +{ \ + k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ 
rcon_tab[i]; \ + k[8*(i)+ 9] = ss[1] ^= ss[0]; \ + k[8*(i)+10] = ss[2] ^= ss[1]; \ + k[8*(i)+11] = ss[3] ^= ss[2]; \ +} + +#define kdf4(k,i) \ +{ \ + ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ + ss[1] = ss[1] ^ ss[3]; \ + ss[2] = ss[2] ^ ss[3]; \ + ss[3] = ss[3]; \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ + ss[i % 4] ^= ss[4]; \ + ss[4] ^= k[4*(i)]; \ + k[4*(i)+4] = ff(ss[4]); \ + ss[4] ^= k[4*(i)+1]; \ + k[4*(i)+5] = ff(ss[4]); \ + ss[4] ^= k[4*(i)+2]; \ + k[4*(i)+6] = ff(ss[4]); \ + ss[4] ^= k[4*(i)+3]; \ + k[4*(i)+7] = ff(ss[4]); \ +} + +#define kd4(k,i) \ +{ \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ + ss[i % 4] ^= ss[4]; \ + ss[4] = ff(ss[4]); \ + k[4*(i)+4] = ss[4] ^= k[4*(i)]; \ + k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ + k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; \ + k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ +} + +#define kdl4(k,i) \ +{ \ + ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i]; \ + ss[i % 4] ^= ss[4]; \ + k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ + k[4*(i)+5] = ss[1] ^ ss[3]; \ + k[4*(i)+6] = ss[0]; \ + k[4*(i)+7] = ss[1]; \ +} + +#define kdf6(k,i) \ +{ \ + ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 6] = ff(ss[0]); \ + ss[1] ^= ss[0]; \ + k[6*(i)+ 7] = ff(ss[1]); \ + ss[2] ^= ss[1]; \ + k[6*(i)+ 8] = ff(ss[2]); \ + ss[3] ^= ss[2]; \ + k[6*(i)+ 9] = ff(ss[3]); \ + ss[4] ^= ss[3]; \ + k[6*(i)+10] = ff(ss[4]); \ + ss[5] ^= ss[4]; \ + k[6*(i)+11] = ff(ss[5]); \ +} + +#define kd6(k,i) \ +{ \ + ss[6] = ls_box(ss[5],3) ^ rcon_tab[i]; \ + ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \ + k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ + ss[1] ^= ss[0]; \ + k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ + ss[2] ^= ss[1]; \ + k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ + ss[3] ^= ss[2]; \ + k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ + ss[4] ^= ss[3]; \ + k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ + ss[5] ^= ss[4]; \ + k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ +} + +#define kdl6(k,i) \ +{ \ + ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i]; \ + k[6*(i)+ 6] = ss[0]; \ + ss[1] ^= ss[0]; \ + k[6*(i)+ 7] 
= ss[1]; \ + ss[2] ^= ss[1]; \ + k[6*(i)+ 8] = ss[2]; \ + ss[3] ^= ss[2]; \ + k[6*(i)+ 9] = ss[3]; \ +} + +#define kdf8(k,i) \ +{ \ + ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ + k[8*(i)+ 8] = ff(ss[0]); \ + ss[1] ^= ss[0]; \ + k[8*(i)+ 9] = ff(ss[1]); \ + ss[2] ^= ss[1]; \ + k[8*(i)+10] = ff(ss[2]); \ + ss[3] ^= ss[2]; \ + k[8*(i)+11] = ff(ss[3]); \ + ss[4] ^= ls_box(ss[3],0); \ + k[8*(i)+12] = ff(ss[4]); \ + ss[5] ^= ss[4]; \ + k[8*(i)+13] = ff(ss[5]); \ + ss[6] ^= ss[5]; \ + k[8*(i)+14] = ff(ss[6]); \ + ss[7] ^= ss[6]; \ + k[8*(i)+15] = ff(ss[7]); \ +} + +#define kd8(k,i) \ +{ \ + u32 __g = ls_box(ss[7],3) ^ rcon_tab[i]; \ + ss[0] ^= __g; \ + __g = ff(__g); \ + k[8*(i)+ 8] = __g ^= k[8*(i)]; \ + ss[1] ^= ss[0]; \ + k[8*(i)+ 9] = __g ^= k[8*(i)+ 1]; \ + ss[2] ^= ss[1]; \ + k[8*(i)+10] = __g ^= k[8*(i)+ 2]; \ + ss[3] ^= ss[2]; \ + k[8*(i)+11] = __g ^= k[8*(i)+ 3]; \ + __g = ls_box(ss[3],0); \ + ss[4] ^= __g; \ + __g = ff(__g); \ + k[8*(i)+12] = __g ^= k[8*(i)+ 4]; \ + ss[5] ^= ss[4]; \ + k[8*(i)+13] = __g ^= k[8*(i)+ 5]; \ + ss[6] ^= ss[5]; \ + k[8*(i)+14] = __g ^= k[8*(i)+ 6]; \ + ss[7] ^= ss[6]; \ + k[8*(i)+15] = __g ^= k[8*(i)+ 7]; \ +} + +#define kdl8(k,i) \ +{ \ + ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i]; \ + k[8*(i)+ 8] = ss[0]; \ + ss[1] ^= ss[0]; \ + k[8*(i)+ 9] = ss[1]; \ + ss[2] ^= ss[1]; \ + k[8*(i)+10] = ss[2]; \ + ss[3] ^= ss[2]; \ + k[8*(i)+11] = ss[3]; \ +} + +static int +aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags) +{ + int i; + u32 ss[8]; + struct aes_ctx *ctx = ctx_arg; + + /* encryption schedule */ + + ctx->ekey[0] = ss[0] = u32_in(in_key); + ctx->ekey[1] = ss[1] = u32_in(in_key + 4); + ctx->ekey[2] = ss[2] = u32_in(in_key + 8); + ctx->ekey[3] = ss[3] = u32_in(in_key + 12); + + switch(key_len) { + case 16: + for (i = 0; i < 9; i++) + ke4(ctx->ekey, i); + kel4(ctx->ekey, 9); + ctx->rounds = 10; + break; + + case 24: + ctx->ekey[4] = ss[4] = u32_in(in_key + 16); + ctx->ekey[5] = ss[5] = u32_in(in_key + 20); + for (i 
= 0; i < 7; i++) + ke6(ctx->ekey, i); + kel6(ctx->ekey, 7); + ctx->rounds = 12; + break; + + case 32: + ctx->ekey[4] = ss[4] = u32_in(in_key + 16); + ctx->ekey[5] = ss[5] = u32_in(in_key + 20); + ctx->ekey[6] = ss[6] = u32_in(in_key + 24); + ctx->ekey[7] = ss[7] = u32_in(in_key + 28); + for (i = 0; i < 6; i++) + ke8(ctx->ekey, i); + kel8(ctx->ekey, 6); + ctx->rounds = 14; + break; + + default: + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* decryption schedule */ + + ctx->dkey[0] = ss[0] = u32_in(in_key); + ctx->dkey[1] = ss[1] = u32_in(in_key + 4); + ctx->dkey[2] = ss[2] = u32_in(in_key + 8); + ctx->dkey[3] = ss[3] = u32_in(in_key + 12); + + switch (key_len) { + case 16: + kdf4(ctx->dkey, 0); + for (i = 1; i < 9; i++) + kd4(ctx->dkey, i); + kdl4(ctx->dkey, 9); + break; + + case 24: + ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16)); + ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20)); + kdf6(ctx->dkey, 0); + for (i = 1; i < 7; i++) + kd6(ctx->dkey, i); + kdl6(ctx->dkey, 7); + break; + + case 32: + ctx->dkey[4] = ff(ss[4] = u32_in(in_key + 16)); + ctx->dkey[5] = ff(ss[5] = u32_in(in_key + 20)); + ctx->dkey[6] = ff(ss[6] = u32_in(in_key + 24)); + ctx->dkey[7] = ff(ss[7] = u32_in(in_key + 28)); + kdf8(ctx->dkey, 0); + for (i = 1; i < 6; i++) + kd8(ctx->dkey, i); + kdl8(ctx->dkey, 6); + break; + } + return 0; +} + +static inline void aes_encrypt(void *ctx, u8 *dst, const u8 *src) +{ + aes_enc_blk(src, dst, ctx); +} +static inline void aes_decrypt(void *ctx, u8 *dst, const u8 *src) +{ + aes_dec_blk(src, dst, ctx); +} + + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aes_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(aes_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = 
aes_decrypt + } + } +}; + +static int __init aes_init(void) +{ + gen_tabs(); + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_fini(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_init(aes_init); +module_exit(aes_fini); + +MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter"); +MODULE_ALIAS("aes"); diff --git a/arch/i386/defconfig b/arch/i386/defconfig new file mode 100644 index 000000000000..28e620383799 --- /dev/null +++ b/arch/i386/defconfig @@ -0,0 +1,1247 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_X86=y +CONFIG_MMU=y +CONFIG_UID16=y +CONFIG_GENERIC_ISA_DMA=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y +CONFIG_CLEAN_COMPILE=y +CONFIG_STANDALONE=y + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +# CONFIG_BSD_PROCESS_ACCT is not set +CONFIG_SYSCTL=y +CONFIG_AUDIT=y +CONFIG_AUDITSYSCALL=y +CONFIG_LOG_BUF_SHIFT=15 +CONFIG_HOTPLUG=y +# CONFIG_IKCONFIG is not set +# CONFIG_EMBEDDED is not set +CONFIG_KALLSYMS=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set + +# +# Loadable module support +# +CONFIG_MODULES=y +# CONFIG_MODULE_UNLOAD is not set +CONFIG_OBSOLETE_MODPARM=y +# CONFIG_MODVERSIONS is not set +CONFIG_KMOD=y + +# +# Processor type and features +# +CONFIG_X86_PC=y +# CONFIG_X86_ELAN is not set +# CONFIG_X86_VOYAGER is not set +# CONFIG_X86_NUMAQ is not set +# CONFIG_X86_SUMMIT is not set +# CONFIG_X86_BIGSMP is not set +# CONFIG_X86_VISWS is not set +# CONFIG_X86_GENERICARCH is not set +# CONFIG_X86_ES7000 is not set +# CONFIG_M386 is not set +# CONFIG_M486 is not set +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +# CONFIG_M686 is not set +# CONFIG_MPENTIUMII is not set 
+# CONFIG_MPENTIUMIII is not set +# CONFIG_MPENTIUMM is not set +CONFIG_MPENTIUM4=y +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set +# CONFIG_MK8 is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MEFFICEON is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MCYRIXIII is not set +# CONFIG_MVIAC3_2 is not set +# CONFIG_X86_GENERIC is not set +CONFIG_X86_CMPXCHG=y +CONFIG_X86_XADD=y +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_X86_WP_WORKS_OK=y +CONFIG_X86_INVLPG=y +CONFIG_X86_BSWAP=y +CONFIG_X86_POPAD_OK=y +CONFIG_X86_GOOD_APIC=y +CONFIG_X86_INTEL_USERCOPY=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +# CONFIG_HPET_TIMER is not set +# CONFIG_HPET_EMULATE_RTC is not set +CONFIG_SMP=y +CONFIG_NR_CPUS=8 +CONFIG_SCHED_SMT=y +CONFIG_PREEMPT=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_TSC=y +CONFIG_X86_MCE=y +CONFIG_X86_MCE_NONFATAL=y +CONFIG_X86_MCE_P4THERMAL=y +# CONFIG_TOSHIBA is not set +# CONFIG_I8K is not set +# CONFIG_MICROCODE is not set +# CONFIG_X86_MSR is not set +# CONFIG_X86_CPUID is not set + +# +# Firmware Drivers +# +# CONFIG_EDD is not set +CONFIG_NOHIGHMEM=y +# CONFIG_HIGHMEM4G is not set +# CONFIG_HIGHMEM64G is not set +# CONFIG_MATH_EMULATION is not set +CONFIG_MTRR=y +# CONFIG_EFI is not set +CONFIG_IRQBALANCE=y +CONFIG_HAVE_DEC_LOCK=y +# CONFIG_REGPARM is not set + +# +# Power management options (ACPI, APM) +# +CONFIG_PM=y +CONFIG_SOFTWARE_SUSPEND=y +# CONFIG_PM_DISK is not set + +# +# ACPI (Advanced Configuration and Power Interface) Support +# +CONFIG_ACPI=y +CONFIG_ACPI_BOOT=y +CONFIG_ACPI_INTERPRETER=y +CONFIG_ACPI_SLEEP=y +CONFIG_ACPI_SLEEP_PROC_FS=y +CONFIG_ACPI_AC=y +CONFIG_ACPI_BATTERY=y +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_THERMAL=y +# CONFIG_ACPI_ASUS is not set +# CONFIG_ACPI_TOSHIBA is not set +# CONFIG_ACPI_DEBUG is not set +CONFIG_ACPI_BUS=y +CONFIG_ACPI_EC=y +CONFIG_ACPI_POWER=y +CONFIG_ACPI_PCI=y 
+CONFIG_ACPI_SYSTEM=y +# CONFIG_X86_PM_TIMER is not set + +# +# APM (Advanced Power Management) BIOS Support +# +# CONFIG_APM is not set + +# +# CPU Frequency scaling +# +# CONFIG_CPU_FREQ is not set + +# +# Bus options (PCI, PCMCIA, EISA, MCA, ISA) +# +CONFIG_PCI=y +# CONFIG_PCI_GOBIOS is not set +# CONFIG_PCI_GOMMCONFIG is not set +# CONFIG_PCI_GODIRECT is not set +CONFIG_PCI_GOANY=y +CONFIG_PCI_BIOS=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +# CONFIG_PCI_USE_VECTOR is not set +CONFIG_PCI_LEGACY_PROC=y +CONFIG_PCI_NAMES=y +CONFIG_ISA=y +# CONFIG_EISA is not set +# CONFIG_MCA is not set +# CONFIG_SCx200 is not set + +# +# PCMCIA/CardBus support +# +# CONFIG_PCMCIA is not set +CONFIG_PCMCIA_PROBE=y + +# +# PCI Hotplug Support +# +# CONFIG_HOTPLUG_PCI is not set + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_AOUT=y +CONFIG_BINFMT_MISC=y + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_FW_LOADER=m + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Parallel port support +# +CONFIG_PARPORT=y +CONFIG_PARPORT_PC=y +CONFIG_PARPORT_PC_CML1=y +# CONFIG_PARPORT_SERIAL is not set +# CONFIG_PARPORT_PC_FIFO is not set +# CONFIG_PARPORT_PC_SUPERIO is not set +# CONFIG_PARPORT_OTHER is not set +# CONFIG_PARPORT_1284 is not set + +# +# Plug and Play support +# +CONFIG_PNP=y +# CONFIG_PNP_DEBUG is not set + +# +# Protocols +# +# CONFIG_ISAPNP is not set +# CONFIG_PNPBIOS is not set + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +# CONFIG_BLK_DEV_XD is not set +# CONFIG_PARIDE is not set +# CONFIG_BLK_CPQ_DA is not set +# CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set +# CONFIG_BLK_DEV_LOOP is not set +# CONFIG_BLK_DEV_NBD is not set +# CONFIG_BLK_DEV_CARMEL is not set +# CONFIG_BLK_DEV_RAM is not set +CONFIG_LBD=y + +# +# ATA/ATAPI/MFM/RLL support +# +CONFIG_IDE=y +CONFIG_BLK_DEV_IDE=y + +# +# Please see Documentation/ide.txt for help/info on IDE drives +# 
+# CONFIG_BLK_DEV_HD_IDE is not set +CONFIG_BLK_DEV_IDEDISK=y +CONFIG_IDEDISK_MULTI_MODE=y +CONFIG_BLK_DEV_IDECD=y +# CONFIG_BLK_DEV_IDETAPE is not set +# CONFIG_BLK_DEV_IDEFLOPPY is not set +# CONFIG_BLK_DEV_IDESCSI is not set +# CONFIG_IDE_TASK_IOCTL is not set +CONFIG_IDE_TASKFILE_IO=y + +# +# IDE chipset support/bugfixes +# +CONFIG_IDE_GENERIC=y +CONFIG_BLK_DEV_CMD640=y +# CONFIG_BLK_DEV_CMD640_ENHANCED is not set +# CONFIG_BLK_DEV_IDEPNP is not set +CONFIG_BLK_DEV_IDEPCI=y +CONFIG_IDEPCI_SHARE_IRQ=y +# CONFIG_BLK_DEV_OFFBOARD is not set +CONFIG_BLK_DEV_GENERIC=y +# CONFIG_BLK_DEV_OPTI621 is not set +CONFIG_BLK_DEV_RZ1000=y +CONFIG_BLK_DEV_IDEDMA_PCI=y +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set +CONFIG_IDEDMA_PCI_AUTO=y +# CONFIG_IDEDMA_ONLYDISK is not set +CONFIG_BLK_DEV_ADMA=y +# CONFIG_BLK_DEV_AEC62XX is not set +# CONFIG_BLK_DEV_ALI15X3 is not set +# CONFIG_BLK_DEV_AMD74XX is not set +# CONFIG_BLK_DEV_ATIIXP is not set +# CONFIG_BLK_DEV_CMD64X is not set +# CONFIG_BLK_DEV_TRIFLEX is not set +# CONFIG_BLK_DEV_CY82C693 is not set +# CONFIG_BLK_DEV_CS5520 is not set +# CONFIG_BLK_DEV_CS5530 is not set +# CONFIG_BLK_DEV_HPT34X is not set +# CONFIG_BLK_DEV_HPT366 is not set +# CONFIG_BLK_DEV_SC1200 is not set +CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_NS87415 is not set +# CONFIG_BLK_DEV_PDC202XX_OLD is not set +# CONFIG_BLK_DEV_PDC202XX_NEW is not set +# CONFIG_BLK_DEV_SVWKS is not set +# CONFIG_BLK_DEV_SIIMAGE is not set +# CONFIG_BLK_DEV_SIS5513 is not set +# CONFIG_BLK_DEV_SLC90E66 is not set +# CONFIG_BLK_DEV_TRM290 is not set +# CONFIG_BLK_DEV_VIA82CXXX is not set +# CONFIG_IDE_ARM is not set +# CONFIG_IDE_CHIPSETS is not set +CONFIG_BLK_DEV_IDEDMA=y +# CONFIG_IDEDMA_IVB is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_BLK_DEV_HD is not set + +# +# SCSI device support +# +CONFIG_SCSI=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +# CONFIG_CHR_DEV_ST is not set +# CONFIG_CHR_DEV_OSST is not set +# CONFIG_BLK_DEV_SR 
is not set +CONFIG_CHR_DEV_SG=y + +# +# Some SCSI devices (e.g. CD jukebox) support multiple LUNs +# +# CONFIG_SCSI_MULTI_LUN is not set +# CONFIG_SCSI_CONSTANTS is not set +# CONFIG_SCSI_LOGGING is not set + +# +# SCSI Transport Attributes +# +# CONFIG_SCSI_SPI_ATTRS is not set +# CONFIG_SCSI_FC_ATTRS is not set + +# +# SCSI low-level drivers +# +# CONFIG_BLK_DEV_3W_XXXX_RAID is not set +# CONFIG_SCSI_7000FASST is not set +# CONFIG_SCSI_ACARD is not set +# CONFIG_SCSI_AHA152X is not set +# CONFIG_SCSI_AHA1542 is not set +# CONFIG_SCSI_AACRAID is not set +# CONFIG_SCSI_AIC7XXX is not set +# CONFIG_SCSI_AIC7XXX_OLD is not set +# CONFIG_SCSI_AIC79XX is not set +CONFIG_SCSI_DPT_I2O=m +# CONFIG_SCSI_ADVANSYS is not set +# CONFIG_SCSI_IN2000 is not set +# CONFIG_SCSI_MEGARAID is not set +CONFIG_SCSI_SATA=y +# CONFIG_SCSI_SATA_SVW is not set +CONFIG_SCSI_ATA_PIIX=y +# CONFIG_SCSI_SATA_PROMISE is not set +CONFIG_SCSI_SATA_SX4=m +# CONFIG_SCSI_SATA_SIL is not set +CONFIG_SCSI_SATA_SIS=m +# CONFIG_SCSI_SATA_VIA is not set +# CONFIG_SCSI_SATA_VITESSE is not set +# CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_SCSI_CPQFCTS is not set +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_DTC3280 is not set +# CONFIG_SCSI_EATA is not set +# CONFIG_SCSI_EATA_PIO is not set +# CONFIG_SCSI_FUTURE_DOMAIN is not set +# CONFIG_SCSI_GDTH is not set +# CONFIG_SCSI_GENERIC_NCR5380 is not set +# CONFIG_SCSI_GENERIC_NCR5380_MMIO is not set +# CONFIG_SCSI_IPS is not set +# CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_PPA is not set +# CONFIG_SCSI_IMM is not set +# CONFIG_SCSI_NCR53C406A is not set +# CONFIG_SCSI_SYM53C8XX_2 is not set +CONFIG_SCSI_IPR=m +# CONFIG_SCSI_IPR_TRACE is not set +# CONFIG_SCSI_IPR_DUMP is not set +# CONFIG_SCSI_PAS16 is not set +# CONFIG_SCSI_PSI240I is not set +# CONFIG_SCSI_QLOGIC_FAS is not set +# CONFIG_SCSI_QLOGIC_ISP is not set +# CONFIG_SCSI_QLOGIC_FC is not set +# CONFIG_SCSI_QLOGIC_1280 is not set +CONFIG_SCSI_QLA2XXX=y +# CONFIG_SCSI_QLA21XX is not set +# 
CONFIG_SCSI_QLA22XX is not set +# CONFIG_SCSI_QLA2300 is not set +# CONFIG_SCSI_QLA2322 is not set +# CONFIG_SCSI_QLA6312 is not set +# CONFIG_SCSI_QLA6322 is not set +# CONFIG_SCSI_SYM53C416 is not set +# CONFIG_SCSI_DC395x is not set +# CONFIG_SCSI_DC390T is not set +# CONFIG_SCSI_T128 is not set +# CONFIG_SCSI_U14_34F is not set +# CONFIG_SCSI_ULTRASTOR is not set +# CONFIG_SCSI_NSP32 is not set +# CONFIG_SCSI_DEBUG is not set + +# +# Old CD-ROM drivers (not SCSI, not IDE) +# +# CONFIG_CD_NO_IDESCSI is not set + +# +# Multi-device support (RAID and LVM) +# +# CONFIG_MD is not set + +# +# Fusion MPT device support +# +# CONFIG_FUSION is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_IEEE1394=y + +# +# Subsystem Options +# +# CONFIG_IEEE1394_VERBOSEDEBUG is not set +# CONFIG_IEEE1394_OUI_DB is not set +# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set + +# +# Device Drivers +# + +# +# Texas Instruments PCILynx requires I2C +# +CONFIG_IEEE1394_OHCI1394=y + +# +# Protocol Drivers +# +# CONFIG_IEEE1394_VIDEO1394 is not set +# CONFIG_IEEE1394_SBP2 is not set +# CONFIG_IEEE1394_ETH1394 is not set +# CONFIG_IEEE1394_DV1394 is not set +CONFIG_IEEE1394_RAWIO=y +# CONFIG_IEEE1394_CMP is not set + +# +# I2O device support +# +# CONFIG_I2O is not set + +# +# Networking support +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +# CONFIG_PACKET_MMAP is not set +# CONFIG_NETLINK_DEV is not set +CONFIG_UNIX=y +# CONFIG_NET_KEY is not set +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +# CONFIG_IP_ADVANCED_ROUTER is not set +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_IP_MROUTE is not set +# CONFIG_ARPD is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set + +# +# IP: Virtual Server Configuration +# +# CONFIG_IP_VS is not set +# CONFIG_IPV6 is not set +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set + +# +# IP: Netfilter 
Configuration +# +CONFIG_IP_NF_CONNTRACK=y +# CONFIG_IP_NF_FTP is not set +# CONFIG_IP_NF_IRC is not set +# CONFIG_IP_NF_TFTP is not set +# CONFIG_IP_NF_AMANDA is not set +CONFIG_IP_NF_QUEUE=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MATCH_LIMIT=y +CONFIG_IP_NF_MATCH_IPRANGE=y +CONFIG_IP_NF_MATCH_MAC=y +CONFIG_IP_NF_MATCH_PKTTYPE=y +CONFIG_IP_NF_MATCH_MARK=y +CONFIG_IP_NF_MATCH_MULTIPORT=y +CONFIG_IP_NF_MATCH_TOS=y +CONFIG_IP_NF_MATCH_RECENT=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_DSCP=y +CONFIG_IP_NF_MATCH_AH_ESP=y +CONFIG_IP_NF_MATCH_LENGTH=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_MATCH_TCPMSS=y +CONFIG_IP_NF_MATCH_HELPER=y +CONFIG_IP_NF_MATCH_STATE=y +CONFIG_IP_NF_MATCH_CONNTRACK=y +CONFIG_IP_NF_MATCH_OWNER=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_REDIRECT=y +CONFIG_IP_NF_TARGET_NETMAP=y +CONFIG_IP_NF_TARGET_SAME=y +# CONFIG_IP_NF_NAT_SNMP_BASIC is not set +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_TARGET_TOS=y +CONFIG_IP_NF_TARGET_ECN=y +CONFIG_IP_NF_TARGET_DSCP=y +CONFIG_IP_NF_TARGET_MARK=y +CONFIG_IP_NF_TARGET_CLASSIFY=y +CONFIG_IP_NF_TARGET_LOG=y +CONFIG_IP_NF_TARGET_ULOG=y +CONFIG_IP_NF_TARGET_TCPMSS=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_IP_NF_TARGET_NOTRACK=m +CONFIG_IP_NF_RAW=m + +# +# SCTP Configuration (EXPERIMENTAL) +# +# CONFIG_IP_SCTP is not set +# CONFIG_ATM is not set +# CONFIG_BRIDGE is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_DECNET is not set +# CONFIG_LLC2 is not set +# CONFIG_IPX is not set +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +# CONFIG_NET_SCHED is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +# 
CONFIG_NETPOLL is not set +# CONFIG_NET_POLL_CONTROLLER is not set +# CONFIG_HAMRADIO is not set +# CONFIG_IRDA is not set +# CONFIG_BT is not set +CONFIG_NETDEVICES=y +CONFIG_DUMMY=m +# CONFIG_BONDING is not set +# CONFIG_EQUALIZER is not set +# CONFIG_TUN is not set +# CONFIG_NET_SB1000 is not set + +# +# ARCnet devices +# +# CONFIG_ARCNET is not set + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +CONFIG_MII=y +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_LANCE is not set +# CONFIG_NET_VENDOR_SMC is not set +# CONFIG_NET_VENDOR_RACAL is not set + +# +# Tulip family network device support +# +# CONFIG_NET_TULIP is not set +# CONFIG_AT1700 is not set +# CONFIG_DEPCA is not set +# CONFIG_HP100 is not set +# CONFIG_NET_ISA is not set +CONFIG_NET_PCI=y +# CONFIG_PCNET32 is not set +# CONFIG_AMD8111_ETH is not set +# CONFIG_ADAPTEC_STARFIRE is not set +# CONFIG_AC3200 is not set +# CONFIG_APRICOT is not set +# CONFIG_B44 is not set +# CONFIG_FORCEDETH is not set +# CONFIG_CS89x0 is not set +# CONFIG_DGRS is not set +# CONFIG_EEPRO100 is not set +# CONFIG_E100 is not set +# CONFIG_FEALNX is not set +# CONFIG_NATSEMI is not set +# CONFIG_NE2K_PCI is not set +# CONFIG_8139CP is not set +CONFIG_8139TOO=y +CONFIG_8139TOO_PIO=y +# CONFIG_8139TOO_TUNE_TWISTER is not set +# CONFIG_8139TOO_8129 is not set +# CONFIG_8139_OLD_RX_RESET is not set +# CONFIG_SIS900 is not set +# CONFIG_EPIC100 is not set +# CONFIG_SUNDANCE is not set +# CONFIG_TLAN is not set +# CONFIG_VIA_RHINE is not set +# CONFIG_NET_POCKET is not set + +# +# Ethernet (1000 Mbit) +# +# CONFIG_ACENIC is not set +# CONFIG_DL2K is not set +# CONFIG_E1000 is not set +# CONFIG_NS83820 is not set +# CONFIG_HAMACHI is not set +# CONFIG_YELLOWFIN is not set +# CONFIG_R8169 is not set +# CONFIG_SK98LIN is not set +# CONFIG_TIGON3 is not set + +# +# Ethernet (10000 Mbit) +# +# CONFIG_IXGB is not set +CONFIG_S2IO=m +# CONFIG_S2IO_NAPI is not set + +# +# 
Token Ring devices +# +# CONFIG_TR is not set + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# +# Wan interfaces +# +# CONFIG_WAN is not set +# CONFIG_FDDI is not set +# CONFIG_HIPPI is not set +# CONFIG_PLIP is not set +# CONFIG_PPP is not set +# CONFIG_SLIP is not set +# CONFIG_NET_FC is not set +# CONFIG_SHAPER is not set +# CONFIG_NETCONSOLE is not set + +# +# ISDN subsystem +# +# CONFIG_ISDN is not set + +# +# Telephony Support +# +# CONFIG_PHONE is not set + +# +# Input device support +# +CONFIG_INPUT=y + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +# CONFIG_INPUT_JOYDEV is not set +# CONFIG_INPUT_TSDEV is not set +# CONFIG_INPUT_EVDEV is not set +# CONFIG_INPUT_EVBUG is not set + +# +# Input I/O drivers +# +# CONFIG_GAMEPORT is not set +CONFIG_SOUND_GAMEPORT=y +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PARKBD is not set +# CONFIG_SERIO_PCIPS2 is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ATKBD=y +# CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_LKKBD is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_NEWTON is not set +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=y +# CONFIG_MOUSE_SERIAL is not set +# CONFIG_MOUSE_INPORT is not set +# CONFIG_MOUSE_LOGIBM is not set +# CONFIG_MOUSE_PC110PAD is not set +# CONFIG_MOUSE_VSXXXAA is not set +# CONFIG_INPUT_JOYSTICK is not set +# CONFIG_INPUT_TOUCHSCREEN is not set +# CONFIG_INPUT_MISC is not set + +# +# Character devices +# +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y +# CONFIG_SERIAL_NONSTANDARD is not set + +# +# Serial drivers +# +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_CONSOLE is not set +# CONFIG_SERIAL_8250_ACPI is not set +CONFIG_SERIAL_8250_NR_UARTS=4 +# CONFIG_SERIAL_8250_EXTENDED is not set + +# +# Non-8250 serial port 
support +# +CONFIG_SERIAL_CORE=y +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +CONFIG_PRINTER=y +# CONFIG_LP_CONSOLE is not set +# CONFIG_PPDEV is not set +# CONFIG_TIPAR is not set +# CONFIG_QIC02_TAPE is not set + +# +# IPMI +# +# CONFIG_IPMI_HANDLER is not set + +# +# Watchdog Cards +# +# CONFIG_WATCHDOG is not set +# CONFIG_HW_RANDOM is not set +# CONFIG_NVRAM is not set +# CONFIG_RTC is not set +# CONFIG_GEN_RTC is not set +# CONFIG_DTLK is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set +# CONFIG_SONYPI is not set + +# +# Ftape, the floppy tape device driver +# +CONFIG_AGP=y +# CONFIG_AGP_ALI is not set +# CONFIG_AGP_ATI is not set +# CONFIG_AGP_AMD is not set +# CONFIG_AGP_AMD64 is not set +CONFIG_AGP_INTEL=y +# CONFIG_AGP_NVIDIA is not set +# CONFIG_AGP_SIS is not set +# CONFIG_AGP_SWORKS is not set +# CONFIG_AGP_VIA is not set +# CONFIG_AGP_EFFICEON is not set +CONFIG_DRM=y +# CONFIG_DRM_TDFX is not set +# CONFIG_DRM_GAMMA is not set +# CONFIG_DRM_R128 is not set +# CONFIG_DRM_RADEON is not set +# CONFIG_DRM_I810 is not set +CONFIG_DRM_I830=y +# CONFIG_DRM_MGA is not set +# CONFIG_DRM_SIS is not set +# CONFIG_MWAVE is not set +# CONFIG_RAW_DRIVER is not set +# CONFIG_HANGCHECK_TIMER is not set + +# +# I2C support +# +# CONFIG_I2C is not set + +# +# Misc devices +# +# CONFIG_IBM_ASM is not set + +# +# Multimedia devices +# +# CONFIG_VIDEO_DEV is not set + +# +# Digital Video Broadcasting Devices +# +# CONFIG_DVB is not set + +# +# Graphics support +# +# CONFIG_FB is not set +# CONFIG_VIDEO_SELECT is not set + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +# CONFIG_MDA_CONSOLE is not set +CONFIG_DUMMY_CONSOLE=y + +# +# Sound +# +CONFIG_SOUND=y + +# +# Advanced Linux Sound Architecture +# +CONFIG_SND=y +CONFIG_SND_TIMER=y +CONFIG_SND_PCM=y +CONFIG_SND_RAWMIDI=y +CONFIG_SND_SEQUENCER=y +# CONFIG_SND_SEQ_DUMMY is not set +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=y +CONFIG_SND_PCM_OSS=y 
+CONFIG_SND_SEQUENCER_OSS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set + +# +# Generic devices +# +CONFIG_SND_MPU401_UART=y +# CONFIG_SND_DUMMY is not set +# CONFIG_SND_VIRMIDI is not set +# CONFIG_SND_MTPAV is not set +# CONFIG_SND_SERIAL_U16550 is not set +# CONFIG_SND_MPU401 is not set + +# +# ISA devices +# +# CONFIG_SND_AD1848 is not set +# CONFIG_SND_CS4231 is not set +# CONFIG_SND_CS4232 is not set +# CONFIG_SND_CS4236 is not set +# CONFIG_SND_ES1688 is not set +# CONFIG_SND_ES18XX is not set +# CONFIG_SND_GUSCLASSIC is not set +# CONFIG_SND_GUSEXTREME is not set +# CONFIG_SND_GUSMAX is not set +# CONFIG_SND_INTERWAVE is not set +# CONFIG_SND_INTERWAVE_STB is not set +# CONFIG_SND_OPTI92X_AD1848 is not set +# CONFIG_SND_OPTI92X_CS4231 is not set +# CONFIG_SND_OPTI93X is not set +# CONFIG_SND_SB8 is not set +# CONFIG_SND_SB16 is not set +# CONFIG_SND_SBAWE is not set +# CONFIG_SND_WAVEFRONT is not set +# CONFIG_SND_CMI8330 is not set +# CONFIG_SND_OPL3SA2 is not set +# CONFIG_SND_SGALAXY is not set +# CONFIG_SND_SSCAPE is not set + +# +# PCI devices +# +CONFIG_SND_AC97_CODEC=y +# CONFIG_SND_ALI5451 is not set +# CONFIG_SND_ATIIXP is not set +# CONFIG_SND_AU8810 is not set +# CONFIG_SND_AU8820 is not set +# CONFIG_SND_AU8830 is not set +# CONFIG_SND_AZT3328 is not set +# CONFIG_SND_BT87X is not set +# CONFIG_SND_CS46XX is not set +# CONFIG_SND_CS4281 is not set +# CONFIG_SND_EMU10K1 is not set +# CONFIG_SND_KORG1212 is not set +# CONFIG_SND_MIXART is not set +# CONFIG_SND_NM256 is not set +# CONFIG_SND_RME32 is not set +# CONFIG_SND_RME96 is not set +# CONFIG_SND_RME9652 is not set +# CONFIG_SND_HDSP is not set +# CONFIG_SND_TRIDENT is not set +# CONFIG_SND_YMFPCI is not set +# CONFIG_SND_ALS4000 is not set +# CONFIG_SND_CMIPCI is not set +# CONFIG_SND_ENS1370 is not set +# CONFIG_SND_ENS1371 is not set +# CONFIG_SND_ES1938 is not set +# CONFIG_SND_ES1968 is not set +# CONFIG_SND_MAESTRO3 is not set +# CONFIG_SND_FM801 is not set +# 
CONFIG_SND_ICE1712 is not set +# CONFIG_SND_ICE1724 is not set +CONFIG_SND_INTEL8X0=y +# CONFIG_SND_INTEL8X0M is not set +# CONFIG_SND_SONICVIBES is not set +# CONFIG_SND_VIA82XX is not set +# CONFIG_SND_VX222 is not set + +# +# ALSA USB devices +# +# CONFIG_SND_USB_AUDIO is not set + +# +# Open Sound System +# +# CONFIG_SOUND_PRIME is not set + +# +# USB support +# +CONFIG_USB=y +# CONFIG_USB_DEBUG is not set + +# +# Miscellaneous USB options +# +CONFIG_USB_DEVICEFS=y +# CONFIG_USB_BANDWIDTH is not set +# CONFIG_USB_DYNAMIC_MINORS is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_SPLIT_ISO is not set +# CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_OHCI_HCD is not set +CONFIG_USB_UHCI_HCD=y + +# +# USB Device Class drivers +# +# CONFIG_USB_AUDIO is not set +# CONFIG_USB_BLUETOOTH_TTY is not set +# CONFIG_USB_MIDI is not set +# CONFIG_USB_ACM is not set +CONFIG_USB_PRINTER=y +CONFIG_USB_STORAGE=y +# CONFIG_USB_STORAGE_DEBUG is not set +# CONFIG_USB_STORAGE_DATAFAB is not set +# CONFIG_USB_STORAGE_FREECOM is not set +# CONFIG_USB_STORAGE_ISD200 is not set +# CONFIG_USB_STORAGE_DPCM is not set +# CONFIG_USB_STORAGE_HP8200e is not set +# CONFIG_USB_STORAGE_SDDR09 is not set +# CONFIG_USB_STORAGE_SDDR55 is not set +# CONFIG_USB_STORAGE_JUMPSHOT is not set + +# +# USB Human Interface Devices (HID) +# +CONFIG_USB_HID=y +CONFIG_USB_HIDINPUT=y +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set +# CONFIG_USB_AIPTEK is not set +# CONFIG_USB_WACOM is not set +# CONFIG_USB_KBTAB is not set +# CONFIG_USB_POWERMATE is not set +# CONFIG_USB_MTOUCH is not set +CONFIG_USB_EGALAX=m +# CONFIG_USB_XPAD is not set +# CONFIG_USB_ATI_REMOTE is not set + +# +# USB Imaging devices +# +# CONFIG_USB_MDC800 is not set +# CONFIG_USB_MICROTEK is not set +# CONFIG_USB_HPUSBSCSI is not set + +# +# USB Multimedia devices +# +# CONFIG_USB_DABUSB is not set + +# +# Video4Linux support is needed for USB Multimedia device support +# + +# +# USB 
Network adaptors +# +# CONFIG_USB_CATC is not set +# CONFIG_USB_KAWETH is not set +# CONFIG_USB_PEGASUS is not set +# CONFIG_USB_RTL8150 is not set +# CONFIG_USB_USBNET is not set + +# +# USB port drivers +# +# CONFIG_USB_USS720 is not set + +# +# USB Serial Converter support +# +# CONFIG_USB_SERIAL is not set + +# +# USB Miscellaneous drivers +# +# CONFIG_USB_EMI62 is not set +# CONFIG_USB_EMI26 is not set +# CONFIG_USB_TIGL is not set +# CONFIG_USB_AUERSWALD is not set +# CONFIG_USB_RIO500 is not set +# CONFIG_USB_LEGOTOWER is not set +# CONFIG_USB_LCD is not set +# CONFIG_USB_LED is not set +CONFIG_USB_CYTHERM=m +CONFIG_USB_PHIDGETSERVO=m +# CONFIG_USB_TEST is not set + +# +# USB Gadget Support +# +# CONFIG_USB_GADGET is not set + +# +# File systems +# +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +# CONFIG_EXT3_FS_POSIX_ACL is not set +# CONFIG_EXT3_FS_SECURITY is not set +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +# CONFIG_REISERFS_FS is not set +# CONFIG_JFS_FS is not set +# CONFIG_XFS_FS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_ROMFS_FS is not set +# CONFIG_QUOTA is not set +# CONFIG_AUTOFS_FS is not set +CONFIG_AUTOFS4_FS=y + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +# CONFIG_ZISOFS is not set +CONFIG_UDF_FS=y + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_SYSFS=y +# CONFIG_DEVFS_FS is not set +# CONFIG_DEVPTS_FS_XATTR is not set +CONFIG_TMPFS=y +# CONFIG_HUGETLBFS is not set +# CONFIG_HUGETLB_PAGE is not set +CONFIG_RAMFS=y + +# +# Miscellaneous filesystems +# +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_HFSPLUS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +# CONFIG_CRAMFS is not set +# 
CONFIG_VXFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_SYSV_FS is not set +# CONFIG_UFS_FS is not set + +# +# Network File Systems +# +CONFIG_NFS_FS=y +# CONFIG_NFS_V3 is not set +# CONFIG_NFS_V4 is not set +# CONFIG_NFS_DIRECTIO is not set +CONFIG_NFSD=y +# CONFIG_NFSD_V3 is not set +CONFIG_NFSD_TCP=y +CONFIG_LOCKD=y +CONFIG_EXPORTFS=y +CONFIG_SUNRPC=y +# CONFIG_RPCSEC_GSS_KRB5 is not set +# CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y + +# +# Native Language Support +# +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=y +# CONFIG_NLS_CODEPAGE_737 is not set +# CONFIG_NLS_CODEPAGE_775 is not set +# CONFIG_NLS_CODEPAGE_850 is not set +# CONFIG_NLS_CODEPAGE_852 is not set +# CONFIG_NLS_CODEPAGE_855 is not set +# CONFIG_NLS_CODEPAGE_857 is not set +# CONFIG_NLS_CODEPAGE_860 is not set +# CONFIG_NLS_CODEPAGE_861 is not set +# CONFIG_NLS_CODEPAGE_862 is not set +# CONFIG_NLS_CODEPAGE_863 is not set +# CONFIG_NLS_CODEPAGE_864 is not set +# CONFIG_NLS_CODEPAGE_865 is not set +# CONFIG_NLS_CODEPAGE_866 is not set +# CONFIG_NLS_CODEPAGE_869 is not set +# CONFIG_NLS_CODEPAGE_936 is not set +# CONFIG_NLS_CODEPAGE_950 is not set +# CONFIG_NLS_CODEPAGE_932 is not set +# CONFIG_NLS_CODEPAGE_949 is not set +# CONFIG_NLS_CODEPAGE_874 is not set +# CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set +# CONFIG_NLS_CODEPAGE_1251 is not set +CONFIG_NLS_ISO8859_1=y +# CONFIG_NLS_ISO8859_2 is not set +# CONFIG_NLS_ISO8859_3 is not set +# CONFIG_NLS_ISO8859_4 is not set +# CONFIG_NLS_ISO8859_5 is not set +# CONFIG_NLS_ISO8859_6 is not set +# CONFIG_NLS_ISO8859_7 is not set +# CONFIG_NLS_ISO8859_9 is not set +# CONFIG_NLS_ISO8859_13 is not set +# CONFIG_NLS_ISO8859_14 is not set +# CONFIG_NLS_ISO8859_15 is not set +# 
CONFIG_NLS_KOI8_R is not set +# CONFIG_NLS_KOI8_U is not set +# CONFIG_NLS_UTF8 is not set + +# +# Profiling support +# +CONFIG_PROFILING=y +CONFIG_OPROFILE=y + +# +# Kernel hacking +# +# CONFIG_DEBUG_KERNEL is not set +CONFIG_EARLY_PRINTK=y +CONFIG_DEBUG_SPINLOCK_SLEEP=y +# CONFIG_FRAME_POINTER is not set +CONFIG_4KSTACKS=y +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y + +# +# Security options +# +# CONFIG_SECURITY is not set + +# +# Cryptographic options +# +# CONFIG_CRYPTO is not set + +# +# Library routines +# +CONFIG_CRC32=y +CONFIG_LIBCRC32C=m +CONFIG_X86_SMP=y +CONFIG_X86_HT=y +CONFIG_X86_BIOS_REBOOT=y +CONFIG_X86_TRAMPOLINE=y +CONFIG_X86_STD_RESOURCES=y +CONFIG_PC=y diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile new file mode 100644 index 000000000000..933787a46b4c --- /dev/null +++ b/arch/i386/kernel/Makefile @@ -0,0 +1,71 @@ +# +# Makefile for the linux kernel. +# + +extra-y := head.o init_task.o vmlinux.lds + +obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ + ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ + pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ + doublefault.o quirks.o + +obj-y += cpu/ +obj-y += timers/ +obj-$(CONFIG_ACPI_BOOT) += acpi/ +obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o +obj-$(CONFIG_MCA) += mca.o +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_APM) += apm.o +obj-$(CONFIG_X86_SMP) += smp.o smpboot.o +obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_X86_MPPARSE) += mpparse.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o +obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_X86_NUMAQ) += numaq.o +obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_MODULES) += module.o +obj-y += sysenter.o vsyscall.o +obj-$(CONFIG_ACPI_SRAT) += srat.o +obj-$(CONFIG_HPET_TIMER) += time_hpet.o +obj-$(CONFIG_EFI) += efi.o efi_stub.o +obj-$(CONFIG_EARLY_PRINTK) += 
early_printk.o + +EXTRA_AFLAGS := -traditional + +obj-$(CONFIG_SCx200) += scx200.o + +# vsyscall.o contains the vsyscall DSO images as __initdata. +# We must build both images before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so +targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so) +targets += vsyscall.lds + +# The DSO images are built using a special linker script. +quiet_cmd_syscall = SYSCALL $@ + cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \ + -Wl,-T,$(filter-out FORCE,$^) -o $@ + +export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH) + +vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 +SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags) +SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags) + +$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ +$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE + $(call if_changed,syscall) + +# We also create a special relocatable object that should mirror the symbol +# table and layout of the linked DSO. With ld -R we can then refer to +# these symbols in the kernel code rather than hand-coded addresses. 
+extra-y += vsyscall-syms.o +$(obj)/built-in.o: $(obj)/vsyscall-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o + +SYSCFLAGS_vsyscall-syms.o = -r +$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds $(obj)/vsyscall-sysenter.o FORCE + $(call if_changed,syscall) diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile new file mode 100644 index 000000000000..ee75cb286cfe --- /dev/null +++ b/arch/i386/kernel/acpi/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_ACPI_BOOT) := boot.o +obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o + diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c new file mode 100644 index 000000000000..9ba0b957d11f --- /dev/null +++ b/arch/i386/kernel/acpi/boot.c @@ -0,0 +1,908 @@ +/* + * boot.c - Architecture-Specific Low-Level ACPI Boot Support + * + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/init.h> +#include <linux/config.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/pgtable.h> +#include <asm/io_apic.h> +#include <asm/apic.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/mpspec.h> + +#ifdef CONFIG_X86_64 + +static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) { } +extern void __init clustered_apic_check(void); +static inline int ioapic_setup_disabled(void) { return 0; } +#include <asm/proto.h> + +#else /* X86 */ + +#ifdef CONFIG_X86_LOCAL_APIC +#include <mach_apic.h> +#include <mach_mpparse.h> +#endif /* CONFIG_X86_LOCAL_APIC */ + +#endif /* X86 */ + +#define BAD_MADT_ENTRY(entry, end) ( \ + (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ + ((acpi_table_entry_header *)entry)->length != sizeof(*entry)) + +#define PREFIX "ACPI: " + +#ifdef CONFIG_ACPI_PCI +int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ +int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ +#else +int acpi_noirq __initdata = 1; +int acpi_pci_disabled __initdata = 1; +#endif +int acpi_ht __initdata = 1; /* enable HT */ + +int acpi_lapic; +int acpi_ioapic; +int acpi_strict; +EXPORT_SYMBOL(acpi_strict); + +acpi_interrupt_flags acpi_sci_flags __initdata; +int acpi_sci_override_gsi __initdata; +int acpi_skip_timer_override __initdata; + +#ifdef CONFIG_X86_LOCAL_APIC +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +#endif + +#ifndef __HAVE_ARCH_CMPXCHG +#warning ACPI uses CMPXCHG, i486 and later hardware +#endif + +#define MAX_MADT_ENTRIES 256 +u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] = + { [0 ... 
MAX_MADT_ENTRIES-1] = 0xff }; +EXPORT_SYMBOL(x86_acpiid_to_apicid); + +/* -------------------------------------------------------------------------- + Boot-time Configuration + -------------------------------------------------------------------------- */ + +/* + * The default interrupt routing model is PIC (8259). This gets + * overridden if IOAPICs are enumerated (below). + */ +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; + +#ifdef CONFIG_X86_64 + +/* rely on all ACPI tables being in the direct mapping */ +char *__acpi_map_table(unsigned long phys_addr, unsigned long size) +{ + if (!phys_addr || !size) + return NULL; + + if (phys_addr < (end_pfn_map << PAGE_SHIFT)) + return __va(phys_addr); + + return NULL; +} + +#else + +/* + * Temporarily use the virtual area starting from FIX_ACPI_END, + * to map the target physical address. The problem is that set_fixmap() + * provides a single page, and it is possible that the page is not + * sufficient. + * By using this area, we can map up to FIX_ACPI_END - FIX_ACPI_BEGIN + 1 + * pages temporarily, i.e. until the next __acpi_map_table() call, which + * starts over at FIX_ACPI_END. + * + * Tables that end below 8MB are returned via __va() without using the + * fixmap at all. + * + * Important Safety Note: The fixed page numbers are *subtracted* + * from the fixed base. That's why we start at FIX_ACPI_END and + * count idx down while incrementing the phys address. + */ +char *__acpi_map_table(unsigned long phys, unsigned long size) +{ + unsigned long base, offset, mapped_size; + int idx; + + if (phys + size < 8*1024*1024) + return __va(phys); + + offset = phys & (PAGE_SIZE - 1); + mapped_size = PAGE_SIZE - offset; + set_fixmap(FIX_ACPI_END, phys); + base = fix_to_virt(FIX_ACPI_END); + + /* + * Most cases can be covered by the below. 
+ */ + idx = FIX_ACPI_END; + while (mapped_size < size) { + if (--idx < FIX_ACPI_BEGIN) + return NULL; /* cannot handle this */ + phys += PAGE_SIZE; + set_fixmap(idx, phys); + mapped_size += PAGE_SIZE; + } + + return ((unsigned char *) base + offset); +} +#endif + +#ifdef CONFIG_PCI_MMCONFIG +static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_mcfg *mcfg; + + if (!phys_addr || !size) + return -EINVAL; + + mcfg = (struct acpi_table_mcfg *) __acpi_map_table(phys_addr, size); + if (!mcfg) { + printk(KERN_WARNING PREFIX "Unable to map MCFG\n"); + return -ENODEV; + } + + if (mcfg->base_reserved) { + printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); + return -ENODEV; + } + + pci_mmcfg_base_addr = mcfg->base_address; + + return 0; +} +#else +#define acpi_parse_mcfg NULL +#endif /* !CONFIG_PCI_MMCONFIG */ + +#ifdef CONFIG_X86_LOCAL_APIC +static int __init +acpi_parse_madt ( + unsigned long phys_addr, + unsigned long size) +{ + struct acpi_table_madt *madt = NULL; + + if (!phys_addr || !size) + return -EINVAL; + + madt = (struct acpi_table_madt *) __acpi_map_table(phys_addr, size); + if (!madt) { + printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + return -ENODEV; + } + + if (madt->lapic_address) { + acpi_lapic_addr = (u64) madt->lapic_address; + + printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", + madt->lapic_address); + } + + acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); + + return 0; +} + + +static int __init +acpi_parse_lapic ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_lapic *processor = NULL; + + processor = (struct acpi_table_lapic*) header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + /* no utility in registering a disabled processor */ + if (processor->flags.enabled == 0) + return 0; + + x86_acpiid_to_apicid[processor->acpi_id] = processor->id; + + mp_register_lapic ( + 
processor->id, /* APIC ID */ + processor->flags.enabled); /* Enabled? */ + + return 0; +} + +static int __init +acpi_parse_lapic_addr_ovr ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL; + + lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr*) header; + + if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) + return -EINVAL; + + acpi_lapic_addr = lapic_addr_ovr->address; + + return 0; +} + +static int __init +acpi_parse_lapic_nmi ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_lapic_nmi *lapic_nmi = NULL; + + lapic_nmi = (struct acpi_table_lapic_nmi*) header; + + if (BAD_MADT_ENTRY(lapic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (lapic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + + +#endif /*CONFIG_X86_LOCAL_APIC*/ + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER) + +static int __init +acpi_parse_ioapic ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_ioapic *ioapic = NULL; + + ioapic = (struct acpi_table_ioapic*) header; + + if (BAD_MADT_ENTRY(ioapic, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + mp_register_ioapic ( + ioapic->id, + ioapic->address, + ioapic->global_irq_base); + + return 0; +} + +/* + * Parse Interrupt Source Override for the ACPI SCI + * + * gsi: global system interrupt the SCI is routed to + * polarity, trigger: flag values from the override entry; 0 is replaced + * by 3 (level trigger / low polarity), and a non-zero acpi_sci_flags + * field (acpi_sci= on the command line) replaces either value. + * + * Marked __init because it reads __initdata (acpi_sci_flags, + * acpi_sci_override_gsi) and is only called from __init MADT parsers. + */ +static void __init +acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) +{ + if (trigger == 0) /* compatible SCI trigger is level */ + trigger = 3; + + if (polarity == 0) /* compatible SCI polarity is low */ + polarity = 3; + + /* Command-line over-ride via acpi_sci= */ + if (acpi_sci_flags.trigger) + trigger = acpi_sci_flags.trigger; + + if (acpi_sci_flags.polarity) + polarity = acpi_sci_flags.polarity; + + /* + * mp_config_acpi_legacy_irqs() already setup IRQs < 16 + * If GSI is < 16, this will update its flags, + * else it will create a new 
mp_irqs[] entry. + */ + mp_override_legacy_irq(gsi, polarity, trigger, gsi); + + /* + * stash over-ride to indicate we've been here + * and for later update of acpi_fadt + */ + acpi_sci_override_gsi = gsi; + return; +} + +/* + * Parse one MADT Interrupt Source Override entry. An override for the + * SCI goes through acpi_sci_ioapic_setup(); the IRQ0 -> pin 2 override + * is dropped when acpi_skip_timer_override is set; everything else is + * passed to mp_override_legacy_irq(). + */ +static int __init +acpi_parse_int_src_ovr ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_int_src_ovr *intsrc = NULL; + + intsrc = (struct acpi_table_int_src_ovr*) header; + + if (BAD_MADT_ENTRY(intsrc, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (intsrc->bus_irq == acpi_fadt.sci_int) { + acpi_sci_ioapic_setup(intsrc->global_irq, + intsrc->flags.polarity, intsrc->flags.trigger); + return 0; + } + + if (acpi_skip_timer_override && + intsrc->bus_irq == 0 && intsrc->global_irq == 2) { + printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + return 0; + } + + mp_override_legacy_irq ( + intsrc->bus_irq, + intsrc->flags.polarity, + intsrc->flags.trigger, + intsrc->global_irq); + + return 0; +} + + +static int __init +acpi_parse_nmi_src ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_nmi_src *nmi_src = NULL; + + nmi_src = (struct acpi_table_nmi_src*) header; + + if (BAD_MADT_ENTRY(nmi_src, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + /* TBD: Support NMI_SRC entries? (currently only validated and printed) */ + + return 0; +} + +#endif /* CONFIG_X86_IO_APIC && CONFIG_ACPI_INTERPRETER */ + +#ifdef CONFIG_ACPI_BUS + +/* + * acpi_pic_sci_set_trigger() + * + * use ELCR to set PIC-mode trigger type for SCI + * + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's + * it may require Edge Trigger -- use "acpi_sci=edge" + * + * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers + * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. 
* ELCR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0) + * ELCR2 is IRQ's 8-15 (IRQ 8, 13 must be 0) + * + * trigger: 1 clears the SCI's bit (edge), 3 sets it (level); any other + * value leaves the bit as computed from the old mask. + */ + +void __init +acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) +{ + unsigned int mask = 1 << irq; + unsigned int old, new; + + /* Real old ELCR mask */ + old = inb(0x4d0) | (inb(0x4d1) << 8); + + /* + * If we use ACPI to set PCI irq's, then we should clear ELCR + * since we will set it correctly as we enable the PCI irq + * routing. + */ + new = acpi_noirq ? old : 0; + + /* + * Update SCI information in the ELCR, it isn't in the PCI + * routing tables.. + */ + switch (trigger) { + case 1: /* Edge - clear */ + new &= ~mask; + break; + case 3: /* Level - set */ + new |= mask; + break; + } + + if (old == new) + return; + + printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); + outb(new, 0x4d0); + outb(new >> 8, 0x4d1); +} + + +#endif /* CONFIG_ACPI_BUS */ + +/* + * Translate a GSI to the IRQ number used by the kernel: the IO-APIC + * vector when PCI vectors are in use and the GSI is not a legacy IRQ, + * otherwise the GSI itself. Always returns 0. + */ +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) +{ +#ifdef CONFIG_X86_IO_APIC + if (use_pci_vector() && !platform_legacy_irq(gsi)) + *irq = IO_APIC_VECTOR(gsi); + else +#endif + *irq = gsi; + return 0; +} + +unsigned int acpi_register_gsi(u32 gsi, int edge_level, int active_high_low) +{ + unsigned int irq; + unsigned int plat_gsi = gsi; + +#ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. 
+ */ + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + extern void eisa_set_level_irq(unsigned int irq); + + if (edge_level == ACPI_LEVEL_SENSITIVE) + eisa_set_level_irq(gsi); + } +#endif + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { + plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low); + } +#endif + acpi_gsi_to_irq(plat_gsi, &irq); + return irq; +} +EXPORT_SYMBOL(acpi_register_gsi); + +/* + * ACPI based hotplug support for CPU + */ +#ifdef CONFIG_ACPI_HOTPLUG_CPU +int +acpi_map_lsapic(acpi_handle handle, int *pcpu) +{ + /* TBD */ + return -EINVAL; +} +EXPORT_SYMBOL(acpi_map_lsapic); + + +int +acpi_unmap_lsapic(int cpu) +{ + /* TBD */ + return -EINVAL; +} +EXPORT_SYMBOL(acpi_unmap_lsapic); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + +static unsigned long __init +acpi_scan_rsdp ( + unsigned long start, + unsigned long length) +{ + unsigned long offset = 0; + unsigned long sig_len = sizeof("RSD PTR ") - 1; + + /* + * Scan all 16-byte boundaries of the physical memory region for the + * RSDP signature. 
+ */ + for (offset = 0; offset < length; offset += 16) { + if (strncmp((char *) (start + offset), "RSD PTR ", sig_len)) + continue; + return (start + offset); + } + + return 0; +} + +static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_sbf *sb; + + if (!phys_addr || !size) + return -EINVAL; + + sb = (struct acpi_table_sbf *) __acpi_map_table(phys_addr, size); + if (!sb) { + printk(KERN_WARNING PREFIX "Unable to map SBF\n"); + return -ENODEV; + } + + sbf_port = sb->sbf_cmos; /* Save CMOS port */ + + return 0; +} + + +#ifdef CONFIG_HPET_TIMER + +static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) +{ + struct acpi_table_hpet *hpet_tbl; + + if (!phys || !size) + return -EINVAL; + + hpet_tbl = (struct acpi_table_hpet *) __acpi_map_table(phys, size); + if (!hpet_tbl) { + printk(KERN_WARNING PREFIX "Unable to map HPET\n"); + return -ENODEV; + } + + if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) { + printk(KERN_WARNING PREFIX "HPET timers must be located in " + "memory.\n"); + return -1; + } + +#ifdef CONFIG_X86_64 + vxtime.hpet_address = hpet_tbl->addr.addrl | + ((long) hpet_tbl->addr.addrh << 32); + + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, vxtime.hpet_address); +#else /* X86 */ + { + extern unsigned long hpet_address; + + hpet_address = hpet_tbl->addr.addrl; + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); + } +#endif /* X86 */ + + return 0; +} +#else +#define acpi_parse_hpet NULL +#endif + +#ifdef CONFIG_X86_PM_TIMER +extern u32 pmtmr_ioport; +#endif + +static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) +{ + struct fadt_descriptor_rev2 *fadt = NULL; + + fadt = (struct fadt_descriptor_rev2*) __acpi_map_table(phys,size); + if(!fadt) { + printk(KERN_WARNING PREFIX "Unable to map FADT\n"); + return 0; + } + +#ifdef CONFIG_ACPI_INTERPRETER + /* initialize sci_int early for INT_SRC_OVR MADT parsing */ + 
acpi_fadt.sci_int = fadt->sci_int; +#endif + +#ifdef CONFIG_X86_PM_TIMER + /* detect the location of the ACPI PM Timer */ + if (fadt->revision >= FADT2_REVISION_ID) { + /* FADT rev. 2 */ + if (fadt->xpm_tmr_blk.address_space_id != ACPI_ADR_SPACE_SYSTEM_IO) + return 0; + + pmtmr_ioport = fadt->xpm_tmr_blk.address; + } else { + /* FADT rev. 1 */ + pmtmr_ioport = fadt->V1_pm_tmr_blk; + } + if (pmtmr_ioport) + printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", pmtmr_ioport); +#endif + return 0; +} + + +unsigned long __init +acpi_find_rsdp (void) +{ + unsigned long rsdp_phys = 0; + + if (efi_enabled) { + if (efi.acpi20) + return __pa(efi.acpi20); + else if (efi.acpi) + return __pa(efi.acpi); + } + /* + * Scan memory looking for the RSDP signature. First search EBDA (low + * memory) paragraphs and then search upper memory (E0000-FFFFF). + */ + rsdp_phys = acpi_scan_rsdp (0, 0x400); + if (!rsdp_phys) + rsdp_phys = acpi_scan_rsdp (0xE0000, 0xFFFFF); + + return rsdp_phys; +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Parse LAPIC entries in MADT + * returns 0 on success, < 0 on error + */ +static int __init +acpi_parse_madt_lapic_entries(void) +{ + int count; + + /* + * Note that the LAPIC address is obtained from the MADT (32-bit value) + * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). 
+ */ + + count = acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); + return count; + } + + mp_register_lapic_address(acpi_lapic_addr); + + count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic, + MAX_APICS); + if (!count) { + printk(KERN_ERR PREFIX "No LAPIC entries present\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return -ENODEV; + } + else if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + count = acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER) +/* + * Parse IOAPIC related entries in MADT + * returns 0 on success, < 0 on error + */ +static int __init +acpi_parse_madt_ioapic_entries(void) +{ + int count; + + /* + * ACPI interpreter is required to complete interrupt setup, + * so if it is off, don't enumerate the io-apics with ACPI. 
+ * If MPS is present, it will handle them, + * otherwise the system will stay in PIC mode + */ + if (acpi_disabled || acpi_noirq) { + return -ENODEV; + } + + /* + * if "noapic" boot option, don't look for IO-APICs + */ + if (skip_ioapic_setup) { + printk(KERN_INFO PREFIX "Skipping IOAPIC probe " + "due to 'noapic' option.\n"); + return -ENODEV; + } + + count = acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic, MAX_IO_APICS); + if (!count) { + printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); + return -ENODEV; + } + else if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); + return count; + } + + count = acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, NR_IRQ_VECTORS); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + /* + * If BIOS did not supply an INT_SRC_OVR for the SCI + * pretend we got one so we can set the SCI flags. 
*/ + if (!acpi_sci_override_gsi) + acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0); + + /* Fill in identity legacy mappings where no override */ + mp_config_acpi_legacy_irqs(); + + count = acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, NR_IRQ_VECTORS); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + return 0; +} +#else +/* Stub used when IO-APIC enumeration via ACPI is compiled out: always fails. */ +static inline int acpi_parse_madt_ioapic_entries(void) +{ + return -1; +} +#endif /* !(CONFIG_X86_IO_APIC && CONFIG_ACPI_INTERPRETER) */ + + +/* + * Locate the MADT and enumerate local APICs, then IO-APICs. + * IO-APIC mode (and smp_found_config) is only selected when both steps + * succeed; an -EINVAL from either step disables ACPI altogether. + * Does nothing without CONFIG_X86_LOCAL_APIC. + */ +static void __init +acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int count, error; + + count = acpi_table_parse(ACPI_APIC, acpi_parse_madt); + if (count >= 1) { + + /* + * Parse MADT LAPIC entries + */ + error = acpi_parse_madt_lapic_entries(); + if (!error) { + acpi_lapic = 1; + + /* + * Parse MADT IO-APIC entries + */ + error = acpi_parse_madt_ioapic_entries(); + if (!error) { + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + acpi_irq_balance_set(NULL); + acpi_ioapic = 1; + + smp_found_config = 1; + clustered_apic_check(); + } + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + printk(KERN_ERR PREFIX "Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } +#endif + return; +} + +/* + * acpi_boot_table_init() and acpi_boot_init() + * called from setup_arch(), always. + * 1. checksums all tables + * 2. enumerates lapics + * 3. enumerates io-apics + * + * acpi_table_init() is separate to allow reading SRAT without + * other side effects. + * + * side effects of acpi_boot_init: + * acpi_lapic = 1 if LAPIC found + * acpi_ioapic = 1 if IOAPIC found + * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; + * if acpi_blacklisted() acpi_disabled = 1; + * acpi_irq_model=... + * ... 
+ * + * return value: (currently ignored) + * 0: success + * !0: failure + */ + +int __init +acpi_boot_table_init(void) +{ + int error; + + /* + * If acpi_disabled, bail out + * One exception: acpi=ht continues far enough to enumerate LAPICs + */ + if (acpi_disabled && !acpi_ht) + return 1; + + /* + * Initialize the ACPI boot-time table parser. + */ + error = acpi_table_init(); + if (error) { + disable_acpi(); + return error; + } + +#ifdef __i386__ + check_acpi_pci(); +#endif + + acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); + + /* + * blacklist may disable ACPI entirely + */ + error = acpi_blacklisted(); + if (error) { + extern int acpi_force; + + if (acpi_force) { + printk(KERN_WARNING PREFIX "acpi=force override\n"); + } else { + printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); + disable_acpi(); + return error; + } + } + + return 0; +} + + +int __init acpi_boot_init(void) +{ + /* + * If acpi_disabled, bail out + * One exception: acpi=ht continues far enough to enumerate LAPICs + */ + if (acpi_disabled && !acpi_ht) + return 1; + + acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); + + /* + * set sci_int and PM timer address + */ + acpi_table_parse(ACPI_FADT, acpi_parse_fadt); + + /* + * Process the Multiple APIC Description Table (MADT), if present + */ + acpi_process_madt(); + + acpi_table_parse(ACPI_HPET, acpi_parse_hpet); + acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); + + return 0; +} + diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c new file mode 100644 index 000000000000..726a5ca4b165 --- /dev/null +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -0,0 +1,51 @@ +/* + * Do early PCI probing for bug detection when the main PCI subsystem is + * not up yet. + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <asm/pci-direct.h> +#include <asm/acpi.h> + +static int __init check_bridge(int vendor, int device) +{ + /* According to Nvidia all timer overrides are bogus. Just ignore + them all. 
*/ + if (vendor == PCI_VENDOR_ID_NVIDIA) { + acpi_skip_timer_override = 1; + } + return 0; +} + +void __init check_acpi_pci(void) +{ + int num,slot,func; + + /* Assume the machine supports type 1. If not it will + always read ffffffff and should not have any side effect. */ + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + + if (check_bridge(vendor&0xffff, vendor >> 16)) + return; + } + + } + } +} diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c new file mode 100644 index 000000000000..28bb0514bb6e --- /dev/null +++ b/arch/i386/kernel/acpi/sleep.c @@ -0,0 +1,93 @@ +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + */ + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <asm/smp.h> +#include <asm/tlbflush.h> + +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_video_flags; +extern char wakeup_start, wakeup_end; + +extern void zap_low_mappings(void); + +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); + +static void init_low_mapping(pgd_t *pgd, int pgd_limit) +{ + int pgd_ofs = 0; + + while ((pgd_ofs < pgd_limit) && (pgd_ofs + USER_PTRS_PER_PGD < PTRS_PER_PGD)) { + set_pgd(pgd, *(pgd+USER_PTRS_PER_PGD)); + pgd_ofs++, pgd++; + } + flush_tlb_all(); +} + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. 
+ */ +int acpi_save_state_mem (void) +{ + if (!acpi_wakeup_address) + return 1; + init_low_mapping(swapper_pg_dir, USER_PTRS_PER_PGD); + memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); + + return 0; +} + +/* + * acpi_restore_state - undo effects of acpi_save_state_mem + */ +void acpi_restore_state_mem (void) +{ + zap_low_mappings(); +} + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page from the first 1MB of memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16MB pages, but not + * <1MB pages. + */ +void __init acpi_reserve_bootmem(void) +{ + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { + printk(KERN_ERR "ACPI: Wakeup code way too big, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); + if (!acpi_wakeup_address) + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); +} + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_video_flags = 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_video_flags |= 2; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + + +__setup("acpi_sleep=", acpi_sleep_setup); diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S new file mode 100644 index 000000000000..39d32484f6f5 --- /dev/null +++ b/arch/i386/kernel/acpi/wakeup.S @@ -0,0 +1,318 @@ +.text +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page.h> + +# +# wakeup_code runs in real mode, and at unknown address (determined at run-time). +# Therefore it must only use relative jumps/calls. +# +# Do we need to deal with A20? 
It is okay: ACPI specs says A20 must be enabled +# +# If physical address of wakeup_code is 0x12345, BIOS should call us with +# cs = 0x1234, eip = 0x05 +# + +ALIGN + .align 4096 +ENTRY(wakeup_start) +wakeup_code: + wakeup_code_start = . + .code16 + + movw $0xb800, %ax + movw %ax,%fs + movw $0x0e00 + 'L', %fs:(0x10) + + cli + cld + + # setup data segment + movw %cs, %ax + movw %ax, %ds # Make ds:0 point to wakeup_start + movw %ax, %ss + mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board + movw $0x0e00 + 'S', %fs:(0x12) + + pushl $0 # Kill any dangerous flags + popfl + + movl real_magic - wakeup_code, %eax + cmpl $0x12345678, %eax + jne bogus_real_magic + + testl $1, video_flags - wakeup_code + jz 1f + lcall $0xc000,$3 + movw %cs, %ax + movw %ax, %ds # Bios might have played with that + movw %ax, %ss +1: + + testl $2, video_flags - wakeup_code + jz 1f + mov video_mode - wakeup_code, %ax + call mode_set +1: + + # set up page table + movl $swapper_pg_dir-__PAGE_OFFSET, %eax + movl %eax, %cr3 + + testl $1, real_efer_save_restore - wakeup_code + jz 4f + # restore efer setting + movl real_save_efer_edx - wakeup_code, %edx + movl real_save_efer_eax - wakeup_code, %eax + mov $0xc0000080, %ecx + wrmsr +4: + # make sure %cr4 is set correctly (features, etc) + movl real_save_cr4 - wakeup_code, %eax + movl %eax, %cr4 + movw $0xb800, %ax + movw %ax,%fs + movw $0x0e00 + 'i', %fs:(0x12) + + # need a gdt + lgdt real_save_gdt - wakeup_code + + movl real_save_cr0 - wakeup_code, %eax + movl %eax, %cr0 + jmp 1f +1: + movw $0x0e00 + 'n', %fs:(0x14) + + movl real_magic - wakeup_code, %eax + cmpl $0x12345678, %eax + jne bogus_real_magic + + ljmpl $__KERNEL_CS,$wakeup_pmode_return + +real_save_gdt: .word 0 + .long 0 +real_save_cr0: .long 0 +real_save_cr3: .long 0 +real_save_cr4: .long 0 +real_magic: .long 0 +video_mode: .long 0 +video_flags: .long 0 +real_efer_save_restore: .long 0 +real_save_efer_edx: .long 0 +real_save_efer_eax: .long 0 + +bogus_real_magic: 
+ movw $0x0e00 + 'B', %fs:(0x12) + jmp bogus_real_magic + +/* This code uses an extended set of video mode numbers. These include: + * Aliases for standard modes + * NORMAL_VGA (-1) + * EXTENDED_VGA (-2) + * ASK_VGA (-3) + * Video modes numbered by menu position -- NOT RECOMMENDED because of lack + * of compatibility when extending the table. These are between 0x00 and 0xff. + */ +#define VIDEO_FIRST_MENU 0x0000 + +/* Standard BIOS video modes (BIOS number + 0x0100) */ +#define VIDEO_FIRST_BIOS 0x0100 + +/* VESA BIOS video modes (VESA number + 0x0200) */ +#define VIDEO_FIRST_VESA 0x0200 + +/* Video7 special modes (BIOS number + 0x0900) */ +#define VIDEO_FIRST_V7 0x0900 + +# Setting of user mode (AX=mode ID) => CF=success +mode_set: + movw %ax, %bx +#if 0 + cmpb $0xff, %ah + jz setalias + + testb $VIDEO_RECALC>>8, %ah + jnz _setrec + + cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah + jnc setres + + cmpb $VIDEO_FIRST_SPECIAL>>8, %ah + jz setspc + + cmpb $VIDEO_FIRST_V7>>8, %ah + jz setv7 +#endif + + cmpb $VIDEO_FIRST_VESA>>8, %ah + jnc check_vesa +#if 0 + orb %ah, %ah + jz setmenu +#endif + + decb %ah +# jz setbios Add bios modes later + +setbad: clc + ret + +check_vesa: + subb $VIDEO_FIRST_VESA>>8, %bh + orw $0x4000, %bx # Use linear frame buffer + movw $0x4f02, %ax # VESA BIOS mode set call + int $0x10 + cmpw $0x004f, %ax # AL=4f if implemented + jnz _setbad # AH=0 if OK + + stc + ret + +_setbad: jmp setbad + + .code32 + ALIGN + +.org 0x800 +wakeup_stack_begin: # Stack grows down + +.org 0xff0 # Just below end of page +wakeup_stack: +ENTRY(wakeup_end) + +.org 0x1000 + +wakeup_pmode_return: + movw $__KERNEL_DS, %ax + movw %ax, %ss + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + movw $0x0e00 + 'u', 0xb8016 + + # reload the gdt, as we need the full 32 bit address + lgdt saved_gdt + lidt saved_idt + lldt saved_ldt + ljmp $(__KERNEL_CS),$1f +1: + movl %cr3, %eax + movl %eax, %cr3 + wbinvd + + # and restore the stack ... 
but you need gdt for this to work + movl saved_context_esp, %esp + + movl %cs:saved_magic, %eax + cmpl $0x12345678, %eax + jne bogus_magic + + # jump to place where we left off + movl saved_eip,%eax + jmp *%eax + +bogus_magic: + movw $0x0e00 + 'B', 0xb8018 + jmp bogus_magic + + +## +# acpi_copy_wakeup_routine +# +# Copy the above routine to low memory. +# +# Parameters: +# %eax: place to copy wakeup routine to +# +# Returned address is location of code in low memory (past data and stack) +# +ENTRY(acpi_copy_wakeup_routine) + + sgdt saved_gdt + sidt saved_idt + sldt saved_ldt + str saved_tss + + movl nx_enabled, %edx + movl %edx, real_efer_save_restore - wakeup_start (%eax) + testl $1, real_efer_save_restore - wakeup_start (%eax) + jz 2f + # save efer setting + pushl %eax + movl %eax, %ebx + mov $0xc0000080, %ecx + rdmsr + movl %edx, real_save_efer_edx - wakeup_start (%ebx) + movl %eax, real_save_efer_eax - wakeup_start (%ebx) + popl %eax +2: + + movl %cr3, %edx + movl %edx, real_save_cr3 - wakeup_start (%eax) + movl %cr4, %edx + movl %edx, real_save_cr4 - wakeup_start (%eax) + movl %cr0, %edx + movl %edx, real_save_cr0 - wakeup_start (%eax) + sgdt real_save_gdt - wakeup_start (%eax) + + movl saved_videomode, %edx + movl %edx, video_mode - wakeup_start (%eax) + movl acpi_video_flags, %edx + movl %edx, video_flags - wakeup_start (%eax) + movl $0x12345678, real_magic - wakeup_start (%eax) + movl $0x12345678, saved_magic + ret + +.data +ALIGN +ENTRY(saved_magic) .long 0 +ENTRY(saved_eip) .long 0 + +save_registers: + leal 4(%esp), %eax + movl %eax, saved_context_esp + movl %ebx, saved_context_ebx + movl %ebp, saved_context_ebp + movl %esi, saved_context_esi + movl %edi, saved_context_edi + pushfl ; popl saved_context_eflags + + movl $ret_point, saved_eip + ret + + +restore_registers: + movl saved_context_ebp, %ebp + movl saved_context_ebx, %ebx + movl saved_context_esi, %esi + movl saved_context_edi, %edi + pushl saved_context_eflags ; popfl + ret + 
+ENTRY(do_suspend_lowlevel) + call save_processor_state + call save_registers + pushl $3 + call acpi_enter_sleep_state + addl $4, %esp + ret + .p2align 4,,7 +ret_point: + call restore_registers + call restore_processor_state + ret + +ENTRY(do_suspend_lowlevel_s4bios) + call save_processor_state + call save_registers + call acpi_enter_sleep_state_s4bios + ret + +ALIGN +# saved registers +saved_gdt: .long 0,0 +saved_idt: .long 0,0 +saved_ldt: .long 0 +saved_tss: .long 0 + diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c new file mode 100644 index 000000000000..35c1751ea0b0 --- /dev/null +++ b/arch/i386/kernel/apic.c @@ -0,0 +1,1278 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include <linux/config.h> +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> + +#include <asm/atomic.h> +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/desc.h> +#include <asm/arch_hooks.h> +#include <asm/hpet.h> + +#include <mach_apic.h> + +#include "io_ports.h" + +/* + * Debug level + */ +int apic_verbosity; + + +static void apic_pm_activate(void); + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. 
+ */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + */ + ack_APIC_irq(); +} + +void __init apic_intr_init(void) +{ +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* thermal monitor LVT interrupt */ +#ifdef CONFIG_X86_MCE_P4THERMAL + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +} + +/* Using APIC to generate smp_local_timer_interrupt? */ +int using_apic_timer = 0; + +static DEFINE_PER_CPU(int, prof_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_counter) = 1; + +static int enabled_via_apicbase; + +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v, ver; + + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + v = APIC_DM_NMI; /* unmask and set to NMI */ + if (!APIC_INTEGRATED(ver)) /* 82489DX */ + v |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT0, v); +} + +int get_physical_broadcast(void) +{ + unsigned int lvr, version; + lvr = apic_read(APIC_LVR); + version = GET_APIC_VERSION(lvr); + if (!APIC_INTEGRATED(version) || version >= 0x14) + return 0xff; + else + return 0xf; +} + +int get_maxlvt(void) +{ + unsigned int v, ver, maxlvt; + + v = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(v); + /* 82489DXs do not report # of LVT entries. */ + maxlvt = APIC_INTEGRATED(ver) ? 
GET_APIC_MAXLVT(v) : 2; + return maxlvt; +} + +void clear_local_APIC(void) +{ + int maxlvt; + unsigned long v; + + maxlvt = get_maxlvt(); + + /* + * Masking an LVT entry on a P6 can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + } + +/* lets not touch this if we didn't frob it */ +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif + /* + * Clean APIC state for other OSs: + */ + apic_write_around(APIC_LVTT, APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); +#endif + v = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (APIC_INTEGRATED(v)) { /* !82489DX */ + if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. + * connect BSP's local APIC to INT and NMI lines. 
+ */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } + enable_apic_mode(); +} + +void disconnect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect + * only on certain older boards). Note that APIC + * interrupts, including IPIs, won't work beyond + * this point! The only exception are INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } +} + +void disable_local_APIC(void) +{ + unsigned long value; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write_around(APIC_SPIV, value); + + if (enabled_via_apicbase) { + unsigned int l, h; + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. 
+ */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +void __init sync_Arb_IDs(void) +{ + /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (ver >= 0x14) /* P4 or higher */ + return; + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG + | APIC_DM_INIT); +} + +extern void __error_in_apic_c (void); + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned long value, ver; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + value = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(value); + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. 
+ */ + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!APIC_INTEGRATED(ver)) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); +} + +void __init setup_local_APIC (void) +{ + unsigned long oldvalue, value, ver, maxlvt; + + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } + + value = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(value); + + if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) + __error_in_apic_c(); + + /* + * Double-check whether this APIC is really registered. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write_around(APIC_TASKPRI, value); + + /* + * Now that we are all set up, enable the APIC + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + /* + * Enable APIC + */ + value |= APIC_SPIV_APIC_ENABLED; + + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. 
] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + * See also the comment in end_level_ioapic_irq(). --macro + */ +#if 1 + /* Enable focus processor (bit==0) */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#else + /* Disable focus processor (bit==1) */ + value |= APIC_SPIV_FOCUS_DISABLED; +#endif + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the BP's LINT0. This is not + * strictly necessery in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + /* + * TODO: set up through-local-APIC from through-I/O-APIC? --macro + */ + value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; + if (!smp_processor_id() && (pic_mode || !value)) { + value = APIC_DM_EXTINT; + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", + smp_processor_id()); + } else { + value = APIC_DM_EXTINT | APIC_LVT_MASKED; + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", + smp_processor_id()); + } + apic_write_around(APIC_LVT0, value); + + /* + * only the BP should see the LINT1 NMI signal, obviously. + */ + if (!smp_processor_id()) + value = APIC_DM_NMI; + else + value = APIC_DM_NMI | APIC_LVT_MASKED; + if (!APIC_INTEGRATED(ver)) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); + + if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ + maxlvt = get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + value = ERROR_APIC_VECTOR; // enables sending errors + apic_write_around(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. 
+ */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08lx after: 0x%08lx\n", + oldvalue, value); + } else { + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk("Leaving ESR disabled.\n"); + else + printk("No ESR for 82489DX.\n"); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + setup_apic_nmi_watchdog(); + apic_pm_activate(); +} + +/* + * If Linux enabled the LAPIC against the BIOS default + * disable it down before re-entering the BIOS on shutdown. + * Otherwise the BIOS may get confused and not power-off. + */ +void lapic_shutdown(void) +{ + if (!cpu_has_apic || !enabled_via_apicbase) + return; + + local_irq_disable(); + disable_local_APIC(); + local_irq_enable(); +} + +#ifdef CONFIG_PM + +static struct { + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, u32 state) +{ + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + 
apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); + return 0; +} + +static int lapic_resume(struct sys_device *dev) +{ + unsigned int l, h; + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + local_irq_save(flags); + + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; +} + +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. 
+ */ + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __init apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + */ + +/* + * Knob to control our willingness to enable the local APIC. + */ +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +static int __init lapic_disable(char *str) +{ + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; +} +__setup("nolapic", lapic_disable); + +static int __init lapic_enable(char *str) +{ + enable_local_apic = 1; + return 0; +} +__setup("lapic", lapic_enable); + +static int __init apic_set_verbosity(char *str) +{ + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + else + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug", str); + + return 0; +} + +__setup("apic=", apic_set_verbosity); + +static int __init detect_init_APIC (void) +{ + u32 h, l, features; + extern void get_cpu_vendor(struct cpuinfo_x86*); + + /* Disabled by kernel option? */ + if (enable_local_apic < 0) + return -1; + + /* Workaround for us being called before identify_cpu(). 
*/ + get_cpu_vendor(&boot_cpu_data); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || + (boot_cpu_data.x86 == 15)) + break; + goto no_apic; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && cpu_has_apic)) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!cpu_has_apic) { + /* + * Over-ride BIOS and try to enable the local + * APIC only if "lapic" specified. + */ + if (enable_local_apic <= 0) { + printk("Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + /* + * Some BIOSes disable the local APIC in the + * APIC_BASE MSR. This can only be done in + * software for Intel P6 or later and AMD K7 + * (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + printk("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + printk("Could not enable APIC!\n"); + return -1; + } + set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + if (nmi_watchdog != NMI_NONE) + nmi_watchdog = NMI_LOCAL_APIC; + + printk("Found and enabled local APIC!\n"); + + apic_pm_activate(); + + return 0; + +no_apic: + printk("No local APIC present or hardware disabled\n"); + return -1; +} + +void __init init_apic_mappings(void) +{ + unsigned long apic_phys; + + /* + * If no local APIC can be found then set up a fake all + * zeroes page to 
simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, + apic_phys); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + +#ifdef CONFIG_X86_IO_APIC + { + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } + } else { +fake_ioapic_page: + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + } + } +#endif +} + +/* + * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts + * per second. We assume that the caller has already set up the local + * APIC. + * + * The APIC timer is not exactly sync with the external timer chip, it + * closely follows bus clocks. + */ + +/* + * The timer chip is already set up at HZ interrupts per second here, + * but we do not accept timer interrupts yet. We only allow the BP + * to calibrate. 
+ */ +static unsigned int __init get_8254_timer_count(void) /* returns the value assembled from two PIT_CH0 reads */ +{ + extern spinlock_t i8253_lock; + unsigned long flags; + + unsigned int count; + + spin_lock_irqsave(&i8253_lock, flags); /* the port accesses below run under i8253_lock with local irqs saved and disabled */ + + outb_p(0x00, PIT_MODE); /* NOTE(review): presumably the 8254 counter-latch command for channel 0 -- confirm against the datasheet */ + count = inb_p(PIT_CH0); /* first read fills bits 0-7 of count */ + count |= inb_p(PIT_CH0) << 8; /* second read fills bits 8-15 */ + + spin_unlock_irqrestore(&i8253_lock, flags); + + return count; +} + +/* next tick in 8254 can be caught by catching timer wraparound: keep sampling the counter until a reading is strictly greater than the one before it */ +static void __init wait_8254_wraparound(void) +{ + unsigned int curr_count, prev_count; + + curr_count = get_8254_timer_count(); + do { + prev_count = curr_count; + curr_count = get_8254_timer_count(); + + /* workaround for broken Mercury/Neptune: when the new sample is 0x100 or more below the previous one, read the counter once more before the loop test */ + if (prev_count >= curr_count + 0x100) + curr_count = get_8254_timer_count(); + + } while (prev_count >= curr_count); +} + +/* + * Default initialization for 8254 timers. If we use other timers like HPET, + * we override this later + */ +void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound; + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug.
+ */ + +#define APIC_DIVISOR 16 + +static void __setup_APIC_LVTT(unsigned int clocks) +{ + unsigned int lvtt_value, tmp_value, ver; + + ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + if (!APIC_INTEGRATED(ver)) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + apic_write_around(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); /* load TMICT with clocks scaled down by APIC_DIVISOR (16, the same factor as the TDCR divider selected above) */ +} + +static void __init setup_APIC_timer(unsigned int clocks) +{ + unsigned long flags; + + local_irq_save(flags); + + /* + * Wait for IRQ0's slice: + */ + wait_timer_tick(); + + __setup_APIC_LVTT(clocks); + + local_irq_restore(flags); +} + +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs synchronous. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +static int __init calibrate_APIC_clock(void) +{ + unsigned long long t1 = 0, t2 = 0; + long tt1, tt2; + long result; + int i; + const int LOOPS = HZ/10; + + apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n"); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + */ + __setup_APIC_LVTT(1000000000); + + /* + * The timer chip counts down to zero. Let's wait + * for a wraparound to start exact measurement: + * (the current tick might have been already half done) + */ + + wait_timer_tick(); + + /* + * We wrapped around just now.
Let's start: + */ + if (cpu_has_tsc) + rdtscll(t1); + tt1 = apic_read(APIC_TMCCT); + + /* + * Let's wait LOOPS wraparounds: + */ + for (i = 0; i < LOOPS; i++) + wait_timer_tick(); + + tt2 = apic_read(APIC_TMCCT); + if (cpu_has_tsc) + rdtscll(t2); + + /* + * The APIC bus clock counter is 32 bits only, it + * might have overflown, but note that we use signed + * longs, thus no extra care needed. + * + * underflown to be exact, as the timer counts down ;) + */ + + result = (tt1-tt2)*APIC_DIVISOR/LOOPS; /* counter delta over the LOOPS waits, scaled back up by the divisor and averaged per wait_timer_tick() */ + + if (cpu_has_tsc) + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + ((long)(t2-t1)/LOOPS)/(1000000/HZ), + ((long)(t2-t1)/LOOPS)%(1000000/HZ)); + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%ld.%04ld MHz.\n", + result/(1000000/HZ), + result%(1000000/HZ)); + + return result; +} + +static unsigned int calibration_result; + +void __init setup_boot_APIC_clock(void) +{ + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); + using_apic_timer = 1; + + local_irq_disable(); + + calibration_result = calibrate_APIC_clock(); + /* + * Now set up the timer for real. + */ + setup_APIC_timer(calibration_result); + + local_irq_enable(); +} + +void __init setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(calibration_result); +} + +void __init disable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + } +} + +void enable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); + } +} + +/* + * the frequency of the profiling timer can be changed + * by writing a multiplier value into /proc/profile. + */ +int setup_profiling_timer(unsigned int multiplier) +{ + int i; + + /* + * Sanity check.
[at least 500 APIC cycles should be + * between APIC interrupts as a rule of thumb, to avoid + * irqs flooding us] + */ + if ( (!multiplier) || (calibration_result/multiplier < 500)) + return -EINVAL; + + /* + * Set the new multiplier for each CPU. CPUs don't start using the + * new values until the next timer interrupt in which they do process + * accounting. At that time they also adjust their APIC timers + * accordingly. + */ + for (i = 0; i < NR_CPUS; ++i) + per_cpu(prof_multiplier, i) = multiplier; + + return 0; +} + +#undef APIC_DIVISOR + +/* + * Local timer interrupt handler. It does both profiling and + * process statistics/rescheduling. + * + * We do profiling in every local tick, statistics/rescheduling + * happen only every 'profiling multiplier' ticks. The default + * multiplier is 1 and it can be changed by writing the new multiplier + * value into /proc/profile. + */ + +inline void smp_local_timer_interrupt(struct pt_regs * regs) +{ + int cpu = smp_processor_id(); + + profile_tick(CPU_PROFILING, regs); + if (--per_cpu(prof_counter, cpu) <= 0) { + /* + * The multiplier may have changed since the last time we got + * to this point as a result of the user writing to + * /proc/profile. In this case we need to adjust the APIC + * timer accordingly. + * + * Interrupts are already masked off at this point. + */ + per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); + if (per_cpu(prof_counter, cpu) != + per_cpu(prof_old_multiplier, cpu)) { + __setup_APIC_LVTT( + calibration_result/ + per_cpu(prof_counter, cpu)); + per_cpu(prof_old_multiplier, cpu) = + per_cpu(prof_counter, cpu); + } + +#ifdef CONFIG_SMP + update_process_times(user_mode(regs)); +#endif + } + + /* + * We take the 'long' return path, and there every subsystem + * grabs the apropriate locks (kernel lock/ irq lock). + * + * we might want to decouple profiling from the 'long path', + * and do the profiling totally in assembly. 
+ * + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ + +fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + /* + * the NMI deadlock-detector uses this. + */ + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); +} + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +fastcall void smp_spurious_interrupt(struct pt_regs *regs) +{ + unsigned long v; + + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", + smp_processor_id()); + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ + +fastcall void smp_error_interrupt(struct pt_regs *regs) +{ + unsigned long v, v1; + + irq_enter(); + /* First tickle the hardware, only then report what went on. 
-- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor (void) +{ + if (enable_local_apic < 0) + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + return -1; + } + + verify_local_APIC(); + + connect_bsp_APIC(); + + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + + setup_local_APIC(); + + if (nmi_watchdog == NMI_LOCAL_APIC) + check_nmi_watchdog(); +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + setup_boot_APIC_clock(); + + return 0; +} diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c new file mode 100644 index 000000000000..45641a872550 --- /dev/null +++ b/arch/i386/kernel/apm.c @@ -0,0 +1,2428 @@ +/* -*- linux-c -*- + * APM BIOS driver for Linux + * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au) + * + * Initial development of this driver was funded by NEC Australia P/L + * and NEC Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at 
your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * October 1995, Rik Faith (faith@cs.unc.edu): + * Minor enhancements and updates (to the patch set) for 1.3.x + * Documentation + * January 1996, Rik Faith (faith@cs.unc.edu): + * Make /proc/apm easy to format (bump driver version) + * March 1996, Rik Faith (faith@cs.unc.edu): + * Prohibit APM BIOS calls unless apm_enabled. + * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>) + * April 1996, Stephen Rothwell (sfr@canb.auug.org.au) + * Version 1.0 and 1.1 + * May 1996, Version 1.2 + * Feb 1998, Version 1.3 + * Feb 1998, Version 1.4 + * Aug 1998, Version 1.5 + * Sep 1998, Version 1.6 + * Nov 1998, Version 1.7 + * Jan 1999, Version 1.8 + * Jan 1999, Version 1.9 + * Oct 1999, Version 1.10 + * Nov 1999, Version 1.11 + * Jan 2000, Version 1.12 + * Feb 2000, Version 1.13 + * Nov 2000, Version 1.14 + * Oct 2001, Version 1.15 + * Jan 2002, Version 1.16 + * Oct 2002, Version 1.16ac + * + * History: + * 0.6b: first version in official kernel, Linux 1.3.46 + * 0.7: changed /proc/apm format, Linux 1.3.58 + * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59 + * 0.9: only call bios if bios is present, Linux 1.3.72 + * 1.0: use fixed device number, consolidate /proc/apm into this file, + * Linux 1.3.85 + * 1.1: support user-space standby and suspend, power off after system + * halted, Linux 1.3.98 + * 1.2: When resetting RTC after resume, take care so that the time + * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth + * <jtoth@princeton.edu>); improve interaction between + * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 + * 1.2a:Simple change to stop mysterious bug reports with SMP also added + * levels to the printk calls. 
APM is not defined for SMP machines. + * The new replacement for it is, but Linux doesn't yet support this. + * Alan Cox Linux 2.1.55 + * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's + * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by + * Dean Gaudet <dgaudet@arctic.org>. + * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87 + * 1.5: Fix segment register reloading (in case of bad segments saved + * across BIOS call). + * Stephen Rothwell + * 1.6: Cope with compiler/assembler differences. + * Only try to turn off the first display device. + * Fix OOPS at power off with no APM BIOS by Jan Echternach + * <echter@informatik.uni-rostock.de> + * Stephen Rothwell + * 1.7: Modify driver's cached copy of the disabled/disengaged flags + * to reflect current state of APM BIOS. + * Chris Rankin <rankinc@bellsouth.net> + * Reset interrupt 0 timer to 100Hz after suspend + * Chad Miller <cmiller@surfsouth.com> + * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE + * Richard Gooch <rgooch@atnf.csiro.au> + * Allow boot time disabling of APM + * Make boot messages far less verbose by default + * Make asm safer + * Stephen Rothwell + * 1.8: Add CONFIG_APM_RTC_IS_GMT + * Richard Gooch <rgooch@atnf.csiro.au> + * change APM_NOINTS to CONFIG_APM_ALLOW_INTS + * remove dependency on CONFIG_PROC_FS + * Stephen Rothwell + * 1.9: Fix small typo. <laslo@wodip.opole.pl> + * Try to cope with BIOS's that need to have all display + * devices blanked and not just the first one. + * Ross Paterson <ross@soi.city.ac.uk> + * Fix segment limit setting it has always been wrong as + * the segments needed to have byte granularity. + * Mark a few things __init. + * Add hack to allow power off of SMP systems by popular request. + * Use CONFIG_SMP instead of __SMP__ + * Ignore BOUNCES for three seconds. + * Stephen Rothwell + * 1.10: Fix for Thinkpad return code. + * Merge 2.2 and 2.3 drivers.
+ * Remove APM dependencies in arch/i386/kernel/process.c + * Remove APM dependencies in drivers/char/sysrq.c + * Reset time across standby. + * Allow more inititialisation on SMP. + * Remove CONFIG_APM_POWER_OFF and make it boot time + * configurable (default on). + * Make debug only a boot time parameter (remove APM_DEBUG). + * Try to blank all devices on any error. + * 1.11: Remove APM dependencies in drivers/char/console.c + * Check nr_running to detect if we are idle (from + * Borislav Deianov <borislav@lix.polytechnique.fr>) + * Fix for bioses that don't zero the top part of the + * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>) + * (reported by Panos Katsaloulis <teras@writeme.com>). + * Real mode power off patch (Walter Hofmann + * <Walter.Hofmann@physik.stud.uni-erlangen.de>). + * 1.12: Remove CONFIG_SMP as the compiler will optimize + * the code away anyway (smp_num_cpus == 1 in UP) + * noted by Artur Skawina <skawina@geocities.com>. + * Make power off under SMP work again. + * Fix thinko with initial engaging of BIOS. + * Make sure power off only happens on CPU 0 + * (Paul "Rusty" Russell <rusty@rustcorp.com.au>). + * Do error notification to user mode if BIOS calls fail. + * Move entrypoint offset fix to ...boot/setup.S + * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>). + * Remove smp-power-off. SMP users must now specify + * "apm=power-off" on the kernel command line. Suggested + * by Jim Avera <jima@hal.com>, modified by Alan Cox + * <alan@lxorguk.ukuu.org.uk>. + * Register the /proc/apm entry even on SMP so that + * scripts that check for it before doing power off + * work (Jim Avera <jima@hal.com>). + * 1.13: Changes for new pm_ interfaces (Andy Henroid + * <andy_henroid@yahoo.com>). + * Modularize the code. + * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS + * is now the way life works). + * Fix thinko in suspend() (wrong return). + * Notify drivers on critical suspend. 
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> + * modified by sfr). + * Disable interrupts while we are suspended (Andy Henroid + * <andy_henroid@yahoo.com> fixed by sfr). + * Make power off work on SMP again (Tony Hoyle + * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr. + * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore + * interval is now configurable. + * 1.14: Make connection version persist across module unload/load. + * Enable and engage power management earlier. + * Disengage power management on module unload. + * Changed to use the sysrq-register hack for registering the + * power off function called by magic sysrq based upon discussions + * in irc://irc.openprojects.net/#kernelnewbies + * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>). + * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable. + * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr. + * Work around byte swap bug in one of the Vaio's BIOS's + * (Marc Boucher <marc@mbsi.ca>). + * Exposed the disable flag to dmi so that we can handle known + * broken APM (Alan Cox <alan@redhat.com>). + * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin + * calling it - instead idle. (Alan Cox <alan@redhat.com>) + * If an APM idle fails log it and idle sensibly + * 1.15: Don't queue events to clients who open the device O_WRONLY. + * Don't expect replies from clients who open the device O_RDONLY. + * (Idea from Thomas Hood) + * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>) + * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.) + * Notify listeners of standby or suspend events before notifying + * drivers. Return EBUSY to ioctl() if suspend is rejected. + * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood) + * Ignore first resume after we generate our own resume event + * after a suspend (Thomas Hood) + * Daemonize now gets rid of our controlling terminal (sfr). 
+ * CONFIG_APM_CPU_IDLE now just affects the default value of + * idle_threshold (sfr). + * Change name of kernel apm daemon (as it no longer idles) (sfr). + * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we + * make _all_ APM calls on the CPU#0. Fix unsafe sign bug. + * TODO: determine if its "boot CPU" or "CPU0" we want to lock to. + * + * APM 1.1 Reference: + * + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.1, September 1993. + * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. + * + * [This document is available free from Intel by calling 800.628.8686 (fax + * 916.356.6100) or 800.548.4725; or via anonymous ftp from + * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also + * available from Microsoft by calling 206.882.8080.] + * + * APM 1.2 Reference: + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.2, February 1996. 
+ * + * [This document is available from Microsoft at: + * http://www.microsoft.com/hwdev/busbios/amp_12.htm] + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/poll.h> +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/timer.h> +#include <linux/fcntl.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/miscdevice.h> +#include <linux/apm_bios.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/sched.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/dmi.h> +#include <linux/suspend.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/desc.h> + +#include "io_ports.h" + +extern spinlock_t i8253_lock; +extern unsigned long get_cmos_time(void); +extern void machine_real_restart(unsigned char *, int); + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) +extern int (*console_blank_hook)(int); +#endif + +/* + * The apm_bios device is one of the misc char devices. + * This is its minor number. + */ +#define APM_MINOR_DEV 134 + +/* + * See Documentation/Config.help for the configuration options. + * + * Various options can be changed at boot time as follows: + * (We allow underscores for compatibility with the modules code) + * apm=on/off enable/disable APM + * [no-]allow[-_]ints allow interrupts during BIOS calls + * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call + * [no-]realmode[-_]power[-_]off switch to real mode before + * powering off + * [no-]debug log some debugging messages + * [no-]power[-_]off power off on shutdown + * [no-]smp Use apm even on an SMP box + * bounce[-_]interval=<n> number of ticks to ignore suspend + * bounces + * idle[-_]threshold=<n> System idle percentage above which to + * make APM BIOS idle calls. Set it to + * 100 to disable. 
+ * idle[-_]period=<n> Period (in 1/100s of a second) over + * which the idle percentage is + * calculated. + */ + +/* KNOWN PROBLEM MACHINES: + * + * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant + * [Confirmed by TI representative] + * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification + * [Confirmed by BIOS disassembly] + * [This may work now ...] + * P: Toshiba 1950S: battery life information only gets updated after resume + * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking + * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>] + * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP + * Neale Banks <neale@lowendale.com.au> December 2000 + * + * Legend: U = unusable with APM patches + * P = partially usable with APM patches + */ + +/* + * Define as 1 to make the driver always call the APM BIOS busy + * routine even if the clock was not reported as slowed by the + * idle routine. Otherwise, define as 0. + */ +#define ALWAYS_CALL_BUSY 1 + +/* + * Define to make the APM BIOS calls zero all data segment registers (so + * that an incorrect BIOS implementation will cause a kernel panic if it + * tries to write to arbitrary memory). + */ +#define APM_ZERO_SEGS + +#include "apm.h" + +/* + * Define to make all _set_limit calls use 64k limits. The APM 1.1 BIOS is + * supposed to provide limit information that it recognizes. Many machines + * do this correctly, but many others do not restrict themselves to their + * claimed limit. When this happens, they will cause a segmentation + * violation in the kernel at boot time. Most BIOS's, however, will + * respect a 64k limit, so we use that. If you want to be pedantic and + * hold your BIOS to its claims, then undefine this. + */ +#define APM_RELAX_SEGMENTS + +/* + * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. 
+ * This patched by Chad Miller <cmiller@surfsouth.com>, original code by + * David Chen <chen@ctpa04.mit.edu> + */ +#undef INIT_TIMER_AFTER_SUSPEND + +#ifdef INIT_TIMER_AFTER_SUSPEND +#include <linux/timex.h> +#include <asm/io.h> +#include <linux/delay.h> +#endif + +/* + * Need to poll the APM BIOS every second + */ +#define APM_CHECK_TIMEOUT (HZ) + +/* + * Ignore suspend events for this amount of time after a resume + */ +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) + +/* + * Maximum number of events stored + */ +#define APM_MAX_EVENTS 20 + +/* + * The per-file APM data + * + * The one-bit flags are declared unsigned: a signed one-bit field can + * only hold 0 and -1, so storing 1 would read back as -1. + */ +struct apm_user { + int magic; + struct apm_user * next; + unsigned int suser: 1; + unsigned int writer: 1; + unsigned int reader: 1; + unsigned int suspend_wait: 1; + int suspend_result; + int suspends_pending; + int standbys_pending; + int suspends_read; + int standbys_read; + int event_head; + int event_tail; + apm_event_t events[APM_MAX_EVENTS]; +}; + +/* + * The magic number in apm_user + */ +#define APM_BIOS_MAGIC 0x4101 + +/* + * idle percentage above which bios idle calls are done + */ +#ifdef CONFIG_APM_CPU_IDLE +#define DEFAULT_IDLE_THRESHOLD 95 +#else +#define DEFAULT_IDLE_THRESHOLD 100 +#endif +#define DEFAULT_IDLE_PERIOD (100 / 3) + +/* + * Local variables + */ +static struct { + unsigned long offset; + unsigned short segment; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold = DEFAULT_IDLE_THRESHOLD; +static int idle_period = DEFAULT_IDLE_PERIOD; +static int set_pm_idle; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval = DEFAULT_BOUNCE_INTERVAL; + +#ifdef CONFIG_APM_RTC_IS_GMT +# define clock_cmos_diff 0 +# define got_clock_diff 1 +#else +static long clock_cmos_diff; +static int got_clock_diff; +#endif +static int debug; +static int smp; +static int apm_disabled = -1; +#ifdef CONFIG_SMP +static int power_off; +#else +static int power_off = 1; +#endif +#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
+static int realmode_power_off = 1; +#else +static int realmode_power_off; +#endif +static int exit_kapmd; +static int kapmd_running; +#ifdef CONFIG_APM_ALLOW_INTS +static int allow_ints = 1; +#else +static int allow_ints; +#endif +static int broken_psr; + +static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); +static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); +static struct apm_user * user_list; +static DEFINE_SPINLOCK(user_list_lock); +static struct desc_struct bad_bios_desc = { 0, 0x00409200 }; + +static char driver_version[] = "1.16ac"; /* no spaces */ + +/* + * APM event names taken from the APM 1.2 specification. These are + * the message codes that the BIOS uses to tell us about events + */ +static char * apm_event_name[] = { + "system standby", + "system suspend", + "normal resume", + "critical resume", + "low battery", + "power status change", + "update time", + "critical suspend", + "user standby", + "user suspend", + "system standby resume", + "capabilities change" +}; +#define NR_APM_EVENT_NAME \ + (sizeof(apm_event_name) / sizeof(apm_event_name[0])) + +typedef struct lookup_t { + int key; + char * msg; +} lookup_t; + +/* + * The BIOS returns a set of standard error codes in AX when the + * carry flag is set. 
+ */ + +static const lookup_t error_table[] = { +/* N/A { APM_SUCCESS, "Operation succeeded" }, */ + { APM_DISABLED, "Power management disabled" }, + { APM_CONNECTED, "Real mode interface already connected" }, + { APM_NOT_CONNECTED, "Interface not connected" }, + { APM_16_CONNECTED, "16 bit interface already connected" }, +/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */ + { APM_32_CONNECTED, "32 bit interface already connected" }, + { APM_32_UNSUPPORTED, "32 bit interface not supported" }, + { APM_BAD_DEVICE, "Unrecognized device ID" }, + { APM_BAD_PARAM, "Parameter out of range" }, + { APM_NOT_ENGAGED, "Interface not engaged" }, + { APM_BAD_FUNCTION, "Function not supported" }, + { APM_RESUME_DISABLED, "Resume timer disabled" }, + { APM_BAD_STATE, "Unable to enter requested state" }, +/* N/A { APM_NO_EVENTS, "No events pending" }, */ + { APM_NO_ERROR, "BIOS did not set a return code" }, + { APM_NOT_PRESENT, "No APM present" } +}; +#define ERROR_COUNT (sizeof(error_table)/sizeof(lookup_t)) + +/** + * apm_error - display an APM error + * @str: information string + * @err: APM BIOS return code + * + * Write a meaningful log entry to the kernel log in the event of + * an APM error. 
+ */ + +static void apm_error(char *str, int err) +{ + int i; + + for (i = 0; i < ERROR_COUNT; i++) + if (error_table[i].key == err) break; + if (i < ERROR_COUNT) + printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); + else + printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", + str, err); +} + +/* + * Lock APM functionality to physical CPU 0 + */ + +#ifdef CONFIG_SMP + +static cpumask_t apm_save_cpus(void) +{ + cpumask_t x = current->cpus_allowed; + /* Some bioses don't like being called from CPU != 0 */ + set_cpus_allowed(current, cpumask_of_cpu(0)); + BUG_ON(smp_processor_id() != 0); + return x; +} + +static inline void apm_restore_cpus(cpumask_t mask) +{ + set_cpus_allowed(current, mask); +} + +#else + +/* + * No CPU lockdown needed on a uniprocessor + */ + +#define apm_save_cpus() (current->cpus_allowed) +#define apm_restore_cpus(x) (void)(x) + +#endif + +/* + * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and + * apm_info.allow_ints, we are being really paranoid here! Not only + * are interrupts disabled, but all the segment registers (except SS) + * are saved and zeroed this means that if the BIOS tries to reference + * any data without explicitly loading the segment registers, the kernel + * will fault immediately rather than have some unforeseen circumstances + * for the rest of the kernel. And it will be very obvious! :-) Doing + * this depends on CS referring to the same physical memory as DS so that + * DS can be zeroed before the call. Unfortunately, we can't do anything + * about the stack segment/pointer. Also, we tell the compiler that + * everything could change. + * + * Also, we KNOW that for the non error case of apm_bios_call, there + * is no useful data returned in the low order 8 bits of eax. 
+ */ +#define APM_DO_CLI \ + if (apm_info.allow_ints) \ + local_irq_enable(); \ + else \ + local_irq_disable(); + +#ifdef APM_ZERO_SEGS +# define APM_DECL_SEGS \ + unsigned int saved_fs; unsigned int saved_gs; +# define APM_DO_SAVE_SEGS \ + savesegment(fs, saved_fs); savesegment(gs, saved_gs) +# define APM_DO_RESTORE_SEGS \ + loadsegment(fs, saved_fs); loadsegment(gs, saved_gs) +#else +# define APM_DECL_SEGS +# define APM_DO_SAVE_SEGS +# define APM_DO_RESTORE_SEGS +#endif + +/** + * apm_bios_call - Make an APM BIOS 32bit call + * @func: APM function to execute + * @ebx_in: EBX register for call entry + * @ecx_in: ECX register for call entry + * @eax: EAX register return + * @ebx: EBX register return + * @ecx: ECX register return + * @edx: EDX register return + * @esi: ESI register return + * + * Make an APM call using the 32bit protected mode interface. The + * caller is responsible for knowing if APM BIOS is configured and + * enabled. This call can disable interrupts for a long period of + * time on some laptops. The return value is in AH and the carry + * flag is loaded into AL. If there is an error, then the error + * code is returned in AH (bits 8-15 of eax) and this function + * returns non-zero. 
+ */ + +static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) +{ + APM_DECL_SEGS + unsigned long flags; + cpumask_t cpus; + int cpu; + struct desc_struct save_desc_40; + + cpus = apm_save_cpus(); + + cpu = get_cpu(); + save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8]; + per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc; + + local_save_flags(flags); + APM_DO_CLI; + APM_DO_SAVE_SEGS; + apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); + APM_DO_RESTORE_SEGS; + local_irq_restore(flags); + per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = save_desc_40; + put_cpu(); + apm_restore_cpus(cpus); + + return *eax & 0xff; +} + +/** + * apm_bios_call_simple - make a simple APM BIOS 32bit call + * @func: APM function to invoke + * @ebx_in: EBX register value for BIOS call + * @ecx_in: ECX register value for BIOS call + * @eax: EAX register on return from the BIOS call + * + * Make a BIOS call that does only returns one value, or just status. + * If there is an error, then the error code is returned in AH + * (bits 8-15 of eax) and this function returns non-zero. This is + * used for simpler BIOS operations. This call may hold interrupts + * off for a long time on some laptops. 
+ */ + +static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) +{ + u8 error; + APM_DECL_SEGS + unsigned long flags; + cpumask_t cpus; + int cpu; + struct desc_struct save_desc_40; + + + cpus = apm_save_cpus(); + + cpu = get_cpu(); + save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8]; + per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc; + + local_save_flags(flags); + APM_DO_CLI; + APM_DO_SAVE_SEGS; + error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); + APM_DO_RESTORE_SEGS; + local_irq_restore(flags); + __get_cpu_var(cpu_gdt_table)[0x40 / 8] = save_desc_40; + put_cpu(); + apm_restore_cpus(cpus); + return error; +} + +/** + * apm_driver_version - APM driver version + * @val: loaded with the APM version on return + * + * Retrieve the APM version supported by the BIOS. This is only + * supported for APM 1.1 or higher. An error indicates APM 1.0 is + * probably present. + * + * On entry val should point to a value indicating the APM driver + * version with the high byte being the major and the low byte the + * minor number both in BCD + * + * On return it will hold the BIOS revision supported in the + * same format. + */ + +static int apm_driver_version(u_short *val) +{ + u32 eax; + + if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) + return (eax >> 8) & 0xff; + *val = eax; + return APM_SUCCESS; +} + +/** + * apm_get_event - get an APM event from the BIOS + * @event: pointer to the event + * @info: point to the event information + * + * The APM BIOS provides a polled information for event + * reporting. The BIOS expects to be polled at least every second + * when events are pending. When a message is found the caller should + * poll until no more messages are present. However, this causes + * problems on some laptops where a suspend event notification is + * not cleared until it is acknowledged. + * + * Additional information is returned in the info pointer, providing + * that APM 1.2 is in use. 
If no messges are pending the value 0x80 + * is returned (No power management events pending). + */ + +static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 dummy; + + if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, + &dummy, &dummy)) + return (eax >> 8) & 0xff; + *event = ebx; + if (apm_info.connection_version < 0x0102) + *info = ~0; /* indicate info not valid */ + else + *info = ecx; + return APM_SUCCESS; +} + +/** + * set_power_state - set the power management state + * @what: which items to transition + * @state: state to transition to + * + * Request an APM change of state for one or more system devices. The + * processor state must be transitioned last of all. what holds the + * class of device in the upper byte and the device number (0xFF for + * all) for the object to be transitioned. + * + * The state holds the state to transition to, which may in fact + * be an acceptance of a BIOS requested state change. + */ + +static int set_power_state(u_short what, u_short state) +{ + u32 eax; + + if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) + return (eax >> 8) & 0xff; + return APM_SUCCESS; +} + +/** + * set_system_power_state - set system wide power state + * @state: which state to enter + * + * Transition the entire system into a new APM power state. + */ + +static int set_system_power_state(u_short state) +{ + return set_power_state(APM_DEVICE_ALL, state); +} + +/** + * apm_do_idle - perform power saving + * + * This function notifies the BIOS that the processor is (in the view + * of the OS) idle. It returns -1 in the event that the BIOS refuses + * to handle the idle request. On a success the function returns 1 + * if the BIOS did clock slowing or 0 otherwise. + */ + +static int apm_do_idle(void) +{ + u32 eax; + + if (apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax)) { + static unsigned long t; + + /* This always fails on some SMP boards running UP kernels. 
+ * Only report the failure the first 5 times. + */ + if (++t < 5) + { + printk(KERN_DEBUG "apm_do_idle failed (%d)\n", + (eax >> 8) & 0xff); + t = jiffies; + } + return -1; + } + clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0; + return clock_slowed; +} + +/** + * apm_do_busy - inform the BIOS the CPU is busy + * + * Request that the BIOS brings the CPU back to full performance. + */ + +static void apm_do_busy(void) +{ + u32 dummy; + + if (clock_slowed || ALWAYS_CALL_BUSY) { + (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); + clock_slowed = 0; + } +} + +/* + * If no process has really been interested in + * the CPU for some time, we want to call BIOS + * power management - we probably want + * to conserve power. + */ +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 + +static void (*original_pm_idle)(void); + +extern void default_idle(void); + +/** + * apm_cpu_idle - cpu idling for APM capable Linux + * + * This is the idling function the kernel executes when APM is available. It + * tries to do BIOS powermanagement based on the average system idle time. + * Furthermore it calls the system default idle routine. 
+ */ + +static void apm_cpu_idle(void) +{ + static int use_apm_idle; /* = 0 */ + static unsigned int last_jiffies; /* = 0 */ + static unsigned int last_stime; /* = 0 */ + + int apm_idle_done = 0; + unsigned int jiffies_since_last_check = jiffies - last_jiffies; + unsigned int bucket; + +recalc: + if (jiffies_since_last_check > IDLE_CALC_LIMIT) { + use_apm_idle = 0; + last_jiffies = jiffies; + last_stime = current->stime; + } else if (jiffies_since_last_check > idle_period) { + unsigned int idle_percentage; + + idle_percentage = current->stime - last_stime; + idle_percentage *= 100; + idle_percentage /= jiffies_since_last_check; + use_apm_idle = (idle_percentage > idle_threshold); + if (apm_info.forbid_idle) + use_apm_idle = 0; + last_jiffies = jiffies; + last_stime = current->stime; + } + + bucket = IDLE_LEAKY_MAX; + + while (!need_resched()) { + if (use_apm_idle) { + unsigned int t; + + t = jiffies; + switch (apm_do_idle()) { + case 0: apm_idle_done = 1; + if (t != jiffies) { + if (bucket) { + bucket = IDLE_LEAKY_MAX; + continue; + } + } else if (bucket) { + bucket--; + continue; + } + break; + case 1: apm_idle_done = 1; + break; + default: /* BIOS refused */ + break; + } + } + if (original_pm_idle) + original_pm_idle(); + else + default_idle(); + jiffies_since_last_check = jiffies - last_jiffies; + if (jiffies_since_last_check > idle_period) + goto recalc; + } + + if (apm_idle_done) + apm_do_busy(); +} + +/** + * apm_power_off - ask the BIOS to power off + * + * Handle the power off sequence. This is the one piece of code we + * will execute even on SMP machines. In order to deal with BIOS + * bugs we support real mode APM BIOS power off calls. We also make + * the SMP call on CPU0 as some systems will only honour this call + * on their first cpu. 
+ */ + +static void apm_power_off(void) +{ + unsigned char po_bios_call[] = { + 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ + 0x8e, 0xd0, /* movw ax,ss */ + 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ + 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ + 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ + 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ + 0xcd, 0x15 /* int $0x15 */ + }; + + /* + * This may be called on an SMP machine. + */ +#ifdef CONFIG_SMP + /* Some bioses don't like being called from CPU != 0 */ + set_cpus_allowed(current, cpumask_of_cpu(0)); + BUG_ON(smp_processor_id() != 0); +#endif + if (apm_info.realmode_power_off) + { + (void)apm_save_cpus(); + machine_real_restart(po_bios_call, sizeof(po_bios_call)); + } + else + (void) set_system_power_state(APM_STATE_OFF); +} + +#ifdef CONFIG_APM_DO_ENABLE + +/** + * apm_enable_power_management - enable BIOS APM power management + * @enable: enable yes/no + * + * Enable or disable the APM BIOS power services. + */ + +static int apm_enable_power_management(int enable) +{ + u32 eax; + + if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) + return APM_NOT_ENGAGED; + if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, + enable, &eax)) + return (eax >> 8) & 0xff; + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISABLED; + else + apm_info.bios.flags |= APM_BIOS_DISABLED; + return APM_SUCCESS; +} +#endif + +/** + * apm_get_power_status - get current power state + * @status: returned status + * @bat: battery info + * @life: estimated life + * + * Obtain the current power status from the APM BIOS. We return a + * status which gives the rough battery status, and current power + * source. The bat value returned give an estimate as a percentage + * of life and a status value for the battery. The estimated life + * if reported is a lifetime in secodnds/minutes at current powwer + * consumption. 
+ */ + +static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 dummy; + + if (apm_info.get_power_status_broken) + return APM_32_UNSUPPORTED; + if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, + &eax, &ebx, &ecx, &edx, &dummy)) + return (eax >> 8) & 0xff; + *status = ebx; + *bat = ecx; + if (apm_info.get_power_status_swabinminutes) { + *life = swab16((u16)edx); + *life |= 0x8000; + } else + *life = edx; + return APM_SUCCESS; +} + +#if 0 +static int apm_get_battery_status(u_short which, u_short *status, + u_short *bat, u_short *life, u_short *nbat) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + + if (apm_info.connection_version < 0x0102) { + /* pretend we only have one battery. */ + if (which != 1) + return APM_BAD_DEVICE; + *nbat = 1; + return apm_get_power_status(status, bat, life); + } + + if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, + &ebx, &ecx, &edx, &esi)) + return (eax >> 8) & 0xff; + *status = ebx; + *bat = ecx; + *life = edx; + *nbat = esi; + return APM_SUCCESS; +} +#endif + +/** + * apm_engage_power_management - enable PM on a device + * @device: identity of device + * @enable: on/off + * + * Activate or deactive power management on either a specific device + * or the entire system (%APM_DEVICE_ALL). 
+ */ + +static int apm_engage_power_management(u_short device, int enable) +{ + u32 eax; + + if ((enable == 0) && (device == APM_DEVICE_ALL) + && (apm_info.bios.flags & APM_BIOS_DISABLED)) + return APM_DISABLED; + if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) + return (eax >> 8) & 0xff; + if (device == APM_DEVICE_ALL) { + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; + else + apm_info.bios.flags |= APM_BIOS_DISENGAGED; + } + return APM_SUCCESS; +} + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + +/** + * apm_console_blank - blank the display + * @blank: on/off + * + * Attempt to blank the console, firstly by blanking just video device + * zero, and if that fails (some BIOSes don't support it) then it blanks + * all video devices. Typically the BIOS will do laptop backlight and + * monitor powerdown for us. + */ + +static int apm_console_blank(int blank) +{ + int error; + u_short state; + + state = blank ? APM_STATE_STANDBY : APM_STATE_READY; + /* Blank the first display device */ + error = set_power_state(0x100, state); + if ((error != APM_SUCCESS) && (error != APM_NO_ERROR)) { + /* try to blank them all instead */ + error = set_power_state(0x1ff, state); + if ((error != APM_SUCCESS) && (error != APM_NO_ERROR)) + /* try to blank device one instead */ + error = set_power_state(0x101, state); + } + if ((error == APM_SUCCESS) || (error == APM_NO_ERROR)) + return 1; + if (error == APM_NOT_ENGAGED) { + static int tried; + int eng_error; + if (tried++ == 0) { + eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (eng_error) { + apm_error("set display", error); + apm_error("engage interface", eng_error); + return 0; + } else + return apm_console_blank(blank); + } + } + apm_error("set display", error); + return 0; +} +#endif + +static int queue_empty(struct apm_user *as) +{ + return as->event_head == as->event_tail; +} + +static apm_event_t get_queued_event(struct apm_user *as) +{ + as->event_tail = 
(as->event_tail + 1) % APM_MAX_EVENTS; + return as->events[as->event_tail]; +} + +static void queue_event(apm_event_t event, struct apm_user *sender) +{ + struct apm_user * as; + + spin_lock(&user_list_lock); + if (user_list == NULL) + goto out; + for (as = user_list; as != NULL; as = as->next) { + if ((as == sender) || (!as->reader)) + continue; + as->event_head = (as->event_head + 1) % APM_MAX_EVENTS; + if (as->event_head == as->event_tail) { + static int notified; + + if (notified++ == 0) + printk(KERN_ERR "apm: an event queue overflowed\n"); + as->event_tail = (as->event_tail + 1) % APM_MAX_EVENTS; + } + as->events[as->event_head] = event; + if ((!as->suser) || (!as->writer)) + continue; + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_pending++; + suspends_pending++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_pending++; + standbys_pending++; + break; + } + } + wake_up_interruptible(&apm_waitqueue); +out: + spin_unlock(&user_list_lock); +} + +static void set_time(void) +{ + if (got_clock_diff) { /* Must know time zone in order to set clock */ + xtime.tv_sec = get_cmos_time() + clock_cmos_diff; + xtime.tv_nsec = 0; + } +} + +static void get_time_diff(void) +{ +#ifndef CONFIG_APM_RTC_IS_GMT + /* + * Estimate time zone so that set_time can update the clock + */ + clock_cmos_diff = -get_cmos_time(); + clock_cmos_diff += get_seconds(); + got_clock_diff = 1; +#endif +} + +static void reinit_timer(void) +{ +#ifdef INIT_TIMER_AFTER_SUSPEND + unsigned long flags; + extern spinlock_t i8253_lock; + + spin_lock_irqsave(&i8253_lock, flags); + /* set the clock to 100 Hz */ + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + udelay(10); + outb(LATCH >> 8, PIT_CH0); /* MSB */ + udelay(10); + spin_unlock_irqrestore(&i8253_lock, flags); +#endif +} + +static int suspend(int vetoable) +{ + int err; + struct apm_user *as; + + if 
(pm_send_all(PM_SUSPEND, (void *)3)) { + /* Vetoed */ + if (vetoable) { + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + err = -EBUSY; + ignore_sys_suspend = 0; + printk(KERN_WARNING "apm: suspend was vetoed.\n"); + goto out; + } + printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n"); + } + + device_suspend(PMSG_SUSPEND); + local_irq_disable(); + device_power_down(PMSG_SUSPEND); + + /* serialize with the timer interrupt */ + write_seqlock(&xtime_lock); + + /* protect against access to timer chip registers */ + spin_lock(&i8253_lock); + + get_time_diff(); + /* + * Irq spinlock must be dropped around set_system_power_state. + * We'll undo any timer changes due to interrupts below. + */ + spin_unlock(&i8253_lock); + write_sequnlock(&xtime_lock); + local_irq_enable(); + + save_processor_state(); + err = set_system_power_state(APM_STATE_SUSPEND); + restore_processor_state(); + + local_irq_disable(); + write_seqlock(&xtime_lock); + spin_lock(&i8253_lock); + reinit_timer(); + set_time(); + ignore_normal_resume = 1; + + spin_unlock(&i8253_lock); + write_sequnlock(&xtime_lock); + + if (err == APM_NO_ERROR) + err = APM_SUCCESS; + if (err != APM_SUCCESS) + apm_error("suspend", err); + err = (err == APM_SUCCESS) ? 
0 : -EIO; + device_power_up(); + local_irq_enable(); + device_resume(); + pm_send_all(PM_RESUME, (void *)0); + queue_event(APM_NORMAL_RESUME, NULL); + out: + spin_lock(&user_list_lock); + for (as = user_list; as != NULL; as = as->next) { + as->suspend_wait = 0; + as->suspend_result = err; + } + spin_unlock(&user_list_lock); + wake_up_interruptible(&apm_suspend_waitqueue); + return err; +} + +static void standby(void) +{ + int err; + + local_irq_disable(); + device_power_down(PMSG_SUSPEND); + /* serialize with the timer interrupt */ + write_seqlock(&xtime_lock); + /* If needed, notify drivers here */ + get_time_diff(); + write_sequnlock(&xtime_lock); + local_irq_enable(); + + err = set_system_power_state(APM_STATE_STANDBY); + if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) + apm_error("standby", err); + + local_irq_disable(); + device_power_up(); + local_irq_enable(); +} + +static apm_event_t get_event(void) +{ + int error; + apm_event_t event; + apm_eventinfo_t info; + + static int notified; + + /* we don't use the eventinfo */ + error = apm_get_event(&event, &info); + if (error == APM_SUCCESS) + return event; + + if ((error != APM_NO_EVENTS) && (notified++ == 0)) + apm_error("get_event", error); + + return 0; +} + +static void check_events(void) +{ + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; + + while ((event = get_event()) != 0) { + if (debug) { + if (event <= NR_APM_EVENT_NAME) + printk(KERN_DEBUG "apm: received %s notify\n", + apm_event_name[event - 1]); + else + printk(KERN_DEBUG "apm: received unknown " + "event 0x%02x\n", event); + } + if (ignore_bounce + && ((jiffies - last_resume) > bounce_interval)) + ignore_bounce = 0; + + switch (event) { + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + queue_event(event, NULL); + if (standbys_pending <= 0) + standby(); + break; + + case APM_USER_SUSPEND: +#ifdef CONFIG_APM_IGNORE_USER_SUSPEND + if (apm_info.connection_version > 0x100) + 
set_system_power_state(APM_STATE_REJECT); + break; +#endif + case APM_SYS_SUSPEND: + if (ignore_bounce) { + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; + } + /* + * If we are already processing a SUSPEND, + * then further SUSPEND events from the BIOS + * will be ignored. We also return here to + * cope with the fact that the Thinkpads keep + * sending a SUSPEND event until something else + * happens! + */ + if (ignore_sys_suspend) + return; + ignore_sys_suspend = 1; + queue_event(event, NULL); + if (suspends_pending <= 0) + (void) suspend(1); + break; + + case APM_NORMAL_RESUME: + case APM_CRITICAL_RESUME: + case APM_STANDBY_RESUME: + ignore_sys_suspend = 0; + last_resume = jiffies; + ignore_bounce = 1; + if ((event != APM_NORMAL_RESUME) + || (ignore_normal_resume == 0)) { + write_seqlock_irq(&xtime_lock); + set_time(); + write_sequnlock_irq(&xtime_lock); + device_resume(); + pm_send_all(PM_RESUME, (void *)0); + queue_event(event, NULL); + } + ignore_normal_resume = 0; + break; + + case APM_CAPABILITY_CHANGE: + case APM_LOW_BATTERY: + case APM_POWER_STATUS_CHANGE: + queue_event(event, NULL); + /* If needed, notify drivers here */ + break; + + case APM_UPDATE_TIME: + write_seqlock_irq(&xtime_lock); + set_time(); + write_sequnlock_irq(&xtime_lock); + break; + + case APM_CRITICAL_SUSPEND: + /* + * We are not allowed to reject a critical suspend. + */ + (void) suspend(0); + break; + } + } +} + +static void apm_event_handler(void) +{ + static int pending_count = 4; + int err; + + if ((standbys_pending > 0) || (suspends_pending > 0)) { + if ((apm_info.connection_version > 0x100) && + (pending_count-- <= 0)) { + pending_count = 4; + if (debug) + printk(KERN_DEBUG "apm: setting state busy\n"); + err = set_system_power_state(APM_STATE_BUSY); + if (err) + apm_error("busy", err); + } + } else + pending_count = 4; + check_events(); +} + +/* + * This is the APM thread main loop. 
+ */ + +static void apm_mainloop(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&apm_waitqueue, &wait); + set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + schedule_timeout(APM_CHECK_TIMEOUT); + if (exit_kapmd) + break; + /* + * Ok, check all events, check for idle (and mark us sleeping + * so as not to count towards the load average).. + */ + set_current_state(TASK_INTERRUPTIBLE); + apm_event_handler(); + } + remove_wait_queue(&apm_waitqueue, &wait); +} + +static int check_apm_user(struct apm_user *as, const char *func) +{ + if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { + printk(KERN_ERR "apm: %s passed bad filp\n", func); + return 1; + } + return 0; +} + +static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) +{ + struct apm_user * as; + int i; + apm_event_t event; + + as = fp->private_data; + if (check_apm_user(as, "read")) + return -EIO; + if ((int)count < sizeof(apm_event_t)) + return -EINVAL; + if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK)) + return -EAGAIN; + wait_event_interruptible(apm_waitqueue, !queue_empty(as)); + i = count; + while ((i >= sizeof(event)) && !queue_empty(as)) { + event = get_queued_event(as); + if (copy_to_user(buf, &event, sizeof(event))) { + if (i < count) + break; + return -EFAULT; + } + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_read++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_read++; + break; + } + buf += sizeof(event); + i -= sizeof(event); + } + if (i < count) + return count - i; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +static unsigned int do_poll(struct file *fp, poll_table * wait) +{ + struct apm_user * as; + + as = fp->private_data; + if (check_apm_user(as, "poll")) + return 0; + poll_wait(fp, &apm_waitqueue, wait); + if (!queue_empty(as)) + return POLLIN | POLLRDNORM; + return 0; +} + +static int do_ioctl(struct inode * inode, struct file *filp, + u_int cmd, 
u_long arg) +{ + struct apm_user * as; + + as = filp->private_data; + if (check_apm_user(as, "ioctl")) + return -EIO; + if ((!as->suser) || (!as->writer)) + return -EPERM; + switch (cmd) { + case APM_IOC_STANDBY: + if (as->standbys_read > 0) { + as->standbys_read--; + as->standbys_pending--; + standbys_pending--; + } else + queue_event(APM_USER_STANDBY, as); + if (standbys_pending <= 0) + standby(); + break; + case APM_IOC_SUSPEND: + if (as->suspends_read > 0) { + as->suspends_read--; + as->suspends_pending--; + suspends_pending--; + } else + queue_event(APM_USER_SUSPEND, as); + if (suspends_pending <= 0) { + return suspend(1); + } else { + as->suspend_wait = 1; + wait_event_interruptible(apm_suspend_waitqueue, + as->suspend_wait == 0); + return as->suspend_result; + } + break; + default: + return -EINVAL; + } + return 0; +} + +static int do_release(struct inode * inode, struct file * filp) +{ + struct apm_user * as; + + as = filp->private_data; + if (check_apm_user(as, "release")) + return 0; + filp->private_data = NULL; + if (as->standbys_pending > 0) { + standbys_pending -= as->standbys_pending; + if (standbys_pending <= 0) + standby(); + } + if (as->suspends_pending > 0) { + suspends_pending -= as->suspends_pending; + if (suspends_pending <= 0) + (void) suspend(1); + } + spin_lock(&user_list_lock); + if (user_list == as) + user_list = as->next; + else { + struct apm_user * as1; + + for (as1 = user_list; + (as1 != NULL) && (as1->next != as); + as1 = as1->next) + ; + if (as1 == NULL) + printk(KERN_ERR "apm: filp not in user list\n"); + else + as1->next = as->next; + } + spin_unlock(&user_list_lock); + kfree(as); + return 0; +} + +static int do_open(struct inode * inode, struct file * filp) +{ + struct apm_user * as; + + as = (struct apm_user *)kmalloc(sizeof(*as), GFP_KERNEL); + if (as == NULL) { + printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", + sizeof(*as)); + return -ENOMEM; + } + as->magic = APM_BIOS_MAGIC; + as->event_tail = as->event_head 
= 0; + as->suspends_pending = as->standbys_pending = 0; + as->suspends_read = as->standbys_read = 0; + /* + * XXX - this is a tiny bit broken, when we consider BSD + * process accounting. If the device is opened by root, we + * instantly flag that we used superuser privs. Who knows, + * we might close the device immediately without doing a + * privileged operation -- cevans + */ + as->suser = capable(CAP_SYS_ADMIN); + as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE; + as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ; + spin_lock(&user_list_lock); + as->next = user_list; + user_list = as; + spin_unlock(&user_list_lock); + filp->private_data = as; + return 0; +} + +static int apm_get_info(char *buf, char **start, off_t fpos, int length) +{ + char * p; + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + unsigned short ac_line_status = 0xff; + unsigned short battery_status = 0xff; + unsigned short battery_flag = 0xff; + int percentage = -1; + int time_units = -1; + char *units = "?"; + + p = buf; + + if ((num_online_cpus() == 1) && + !(error = apm_get_power_status(&bx, &cx, &dx))) { + ac_line_status = (bx >> 8) & 0xff; + battery_status = bx & 0xff; + if ((cx & 0xff) != 0xff) + percentage = cx & 0xff; + + if (apm_info.connection_version > 0x100) { + battery_flag = (cx >> 8) & 0xff; + if (dx != 0xffff) { + units = (dx & 0x8000) ? "min" : "sec"; + time_units = dx & 0x7fff; + } + } + } + /* Arguments, with symbols from linux/apm_bios.h. Information is + from the Get Power Status (0x0a) call unless otherwise noted. + + 0) Linux driver version (this will change if format changes) + 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2. 
+ 2) APM flags from APM Installation Check (0x00): + bit 0: APM_16_BIT_SUPPORT + bit 1: APM_32_BIT_SUPPORT + bit 2: APM_IDLE_SLOWS_CLOCK + bit 3: APM_BIOS_DISABLED + bit 4: APM_BIOS_DISENGAGED + 3) AC line status + 0x00: Off-line + 0x01: On-line + 0x02: On backup power (BIOS >= 1.1 only) + 0xff: Unknown + 4) Battery status + 0x00: High + 0x01: Low + 0x02: Critical + 0x03: Charging + 0x04: Selected battery not present (BIOS >= 1.2 only) + 0xff: Unknown + 5) Battery flag + bit 0: High + bit 1: Low + bit 2: Critical + bit 3: Charging + bit 7: No system battery + 0xff: Unknown + 6) Remaining battery life (percentage of charge): + 0-100: valid + -1: Unknown + 7) Remaining battery life (time units): + Number of remaining minutes or seconds + -1: Unknown + 8) min = minutes; sec = seconds */ + + p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", + driver_version, + (apm_info.bios.version >> 8) & 0xff, + apm_info.bios.version & 0xff, + apm_info.bios.flags, + ac_line_status, + battery_status, + battery_flag, + percentage, + time_units, + units); + + return p - buf; +} + +static int apm(void *unused) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + char * power_stat; + char * bat_stat; + + kapmd_running = 1; + + daemonize("kapmd"); + + current->flags |= PF_NOFREEZE; + +#ifdef CONFIG_SMP + /* 2002/08/01 - WT + * This is to avoid random crashes at boot time during initialization + * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. + * Some bioses don't like being called from CPU != 0. + * Method suggested by Ingo Molnar. 
+ */ + set_cpus_allowed(current, cpumask_of_cpu(0)); + BUG_ON(smp_processor_id() != 0); +#endif + + if (apm_info.connection_version == 0) { + apm_info.connection_version = apm_info.bios.version; + if (apm_info.connection_version > 0x100) { + /* + * We only support BIOSs up to version 1.2 + */ + if (apm_info.connection_version > 0x0102) + apm_info.connection_version = 0x0102; + error = apm_driver_version(&apm_info.connection_version); + if (error != APM_SUCCESS) { + apm_error("driver version", error); + /* Fall back to an APM 1.0 connection. */ + apm_info.connection_version = 0x100; + } + } + } + + if (debug) + printk(KERN_INFO "apm: Connection version %d.%d\n", + (apm_info.connection_version >> 8) & 0xff, + apm_info.connection_version & 0xff); + +#ifdef CONFIG_APM_DO_ENABLE + if (apm_info.bios.flags & APM_BIOS_DISABLED) { + /* + * This call causes my NEC UltraLite Versa 33/C to hang if it + * is booted with PM disabled but not in the docking station. + * Unfortunate ... + */ + error = apm_enable_power_management(1); + if (error) { + apm_error("enable power management", error); + return -1; + } + } +#endif + + if ((apm_info.bios.flags & APM_BIOS_DISENGAGED) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (error) { + apm_error("engage power management", error); + return -1; + } + } + + if (debug && (num_online_cpus() == 1 || smp )) { + error = apm_get_power_status(&bx, &cx, &dx); + if (error) + printk(KERN_INFO "apm: power status not available\n"); + else { + switch ((bx >> 8) & 0xff) { + case 0: power_stat = "off line"; break; + case 1: power_stat = "on line"; break; + case 2: power_stat = "on backup power"; break; + default: power_stat = "unknown"; break; + } + switch (bx & 0xff) { + case 0: bat_stat = "high"; break; + case 1: bat_stat = "low"; break; + case 2: bat_stat = "critical"; break; + case 3: bat_stat = "charging"; break; + default: bat_stat = "unknown"; break; + } + printk(KERN_INFO + "apm: AC 
%s, battery status %s, battery life ", + power_stat, bat_stat); + if ((cx & 0xff) == 0xff) + printk("unknown\n"); + else + printk("%d%%\n", cx & 0xff); + if (apm_info.connection_version > 0x100) { + printk(KERN_INFO + "apm: battery flag 0x%02x, battery life ", + (cx >> 8) & 0xff); + if (dx == 0xffff) + printk("unknown\n"); + else + printk("%d %s\n", dx & 0x7fff, + (dx & 0x8000) ? + "minutes" : "seconds"); + } + } + } + + /* Install our power off handler.. */ + if (power_off) + pm_power_off = apm_power_off; + + if (num_online_cpus() == 1 || smp) { +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = apm_console_blank; +#endif + apm_mainloop(); +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = NULL; +#endif + } + kapmd_running = 0; + + return 0; +} + +#ifndef MODULE +static int __init apm_setup(char *str) +{ + int invert; + + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "off", 3) == 0) + apm_disabled = 1; + if (strncmp(str, "on", 2) == 0) + apm_disabled = 0; + if ((strncmp(str, "bounce-interval=", 16) == 0) || + (strncmp(str, "bounce_interval=", 16) == 0)) + bounce_interval = simple_strtol(str + 16, NULL, 0); + if ((strncmp(str, "idle-threshold=", 15) == 0) || + (strncmp(str, "idle_threshold=", 15) == 0)) + idle_threshold = simple_strtol(str + 15, NULL, 0); + if ((strncmp(str, "idle-period=", 12) == 0) || + (strncmp(str, "idle_period=", 12) == 0)) + idle_period = simple_strtol(str + 12, NULL, 0); + invert = (strncmp(str, "no-", 3) == 0) || + (strncmp(str, "no_", 3) == 0); + if (invert) + str += 3; + if (strncmp(str, "debug", 5) == 0) + debug = !invert; + if ((strncmp(str, "power-off", 9) == 0) || + (strncmp(str, "power_off", 9) == 0)) + power_off = !invert; + if (strncmp(str, "smp", 3) == 0) + { + smp = !invert; + idle_threshold = 100; + } + if ((strncmp(str, "allow-ints", 10) == 0) || + (strncmp(str, "allow_ints", 10) == 0)) + apm_info.allow_ints = !invert; + if ((strncmp(str, 
"broken-psr", 10) == 0) || + (strncmp(str, "broken_psr", 10) == 0)) + apm_info.get_power_status_broken = !invert; + if ((strncmp(str, "realmode-power-off", 18) == 0) || + (strncmp(str, "realmode_power_off", 18) == 0)) + apm_info.realmode_power_off = !invert; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("apm=", apm_setup); +#endif + +static struct file_operations apm_bios_fops = { + .owner = THIS_MODULE, + .read = do_read, + .poll = do_poll, + .ioctl = do_ioctl, + .open = do_open, + .release = do_release, +}; + +static struct miscdevice apm_device = { + APM_MINOR_DEV, + "apm_bios", + &apm_bios_fops +}; + + +/* Simple "print if true" callback */ +static int __init print_if_true(struct dmi_system_id *d) +{ + printk("%s\n", d->ident); + return 0; +} + +/* + * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was + * disabled before the suspend. Linux used to get terribly confused by that. + */ +static int __init broken_ps2_resume(struct dmi_system_id *d) +{ + printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); + return 0; +} + +/* Some bioses have a broken protected mode poweroff and need to use realmode */ +static int __init set_realmode_power_off(struct dmi_system_id *d) +{ + if (apm_info.realmode_power_off == 0) { + apm_info.realmode_power_off = 1; + printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); + } + return 0; +} + +/* Some laptops require interrupts to be enabled during APM calls */ +static int __init set_apm_ints(struct dmi_system_id *d) +{ + if (apm_info.allow_ints == 0) { + apm_info.allow_ints = 1; + printk(KERN_INFO "%s machine detected. 
Enabling interrupts during APM calls.\n", d->ident); + } + return 0; +} + +/* Some APM bioses corrupt memory or just plain do not work */ +static int __init apm_is_horked(struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + } + return 0; +} + +static int __init apm_is_horked_d850md(struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); + printk(KERN_INFO "download from support.intel.com \n"); + } + return 0; +} + +/* Some APM bioses hang on APM idle calls */ +static int __init apm_likes_to_melt(struct dmi_system_id *d) +{ + if (apm_info.forbid_idle == 0) { + apm_info.forbid_idle = 1; + printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); + } + return 0; +} + +/* + * Check for clue free BIOS implementations who use + * the following QA technique + * + * [ Write BIOS Code ]<------ + * | ^ + * < Does it Compile >----N-- + * |Y ^ + * < Does it Boot Win98 >-N-- + * |Y + * [Ship It] + * + * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e) + * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000) + */ +static int __init broken_apm_power(struct dmi_system_id *d) +{ + apm_info.get_power_status_broken = 1; + printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); + return 0; +} + +/* + * This bios swaps the APM minute reporting bytes over (Many sony laptops + * have this problem). 
+ */ +static int __init swab_apm_power_in_minutes(struct dmi_system_id *d) +{ + apm_info.get_power_status_swabinminutes = 1; + printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); + return 0; +} + +static struct dmi_system_id __initdata apm_dmi_table[] = { + { + print_if_true, + KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), }, + }, + { /* Handle problems with APM on the C600 */ + broken_ps2_resume, "Dell Latitude C600", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), }, + }, + { /* Allow interrupts during suspend on Dell Latitude laptops*/ + set_apm_ints, "Dell Latitude", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), } + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + }, + { /* Allow interrupts during suspend on Dell Inspiron laptops*/ + set_apm_ints, "Dell Inspiron", { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), }, + }, + { /* Handle problems with APM on Inspiron 5000e */ + broken_apm_power, "Dell Inspiron 5000e", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A04"), + DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), }, + }, + { /* Handle problems with APM on Inspiron 2500 */ + broken_apm_power, "Dell Inspiron 2500", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A12"), + DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Dimension 4100", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell 
Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), + DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + }, + { /* Allow interrupts during suspend on Compaq Laptops*/ + set_apm_ints, "Compaq 12XL125", + { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, + }, + { /* Allow interrupts during APM or the clock goes slow */ + set_apm_ints, "ASUSTeK", + { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), }, + }, + { /* APM blows on shutdown */ + apm_is_horked, "ABIT KX7-333[R]", + { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"), + DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), }, + }, + { /* APM crashes */ + apm_is_horked, "Trigem Delhi3", + { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"), + DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), }, + }, + { /* APM crashes */ + apm_is_horked, "Fujitsu-Siemens", + { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"), + DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), }, + }, + { /* APM crashes */ + apm_is_horked_d850md, "Intel D850MD", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), }, + }, + { /* APM crashes */ + apm_is_horked, "Intel D810EMO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell XPS-Z", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), }, + }, + { /* APM crashes */ + apm_is_horked, "Sharp PC-PJ/AX", + { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), + DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), + DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", 
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "Jabil AMD", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "AMI Bios", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206H"), + DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505VX */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"), + DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-XG29 */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"), + DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"), + DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"), + DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, 
"R0206Z3"), + DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"), + DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-F104K */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"), + DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), }, + }, + + { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"), + DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"), + DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"), + DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), }, + }, + { /* broken PM poweroff bios */ + set_realmode_power_off, "Award Software v4.60 PGMA", + { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."), + 
DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), + DMI_MATCH(DMI_BIOS_DATE, "134526184"), }, + }, + + /* Generic per vendor APM settings */ + + { /* Allow interrupts during suspend on IBM laptops */ + set_apm_ints, "IBM", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, + }, + + { } +}; + +/* + * Just start the APM thread. We do NOT want to do APM BIOS + * calls from anything but the APM thread, if for no other reason + * than the fact that we don't trust the APM BIOS. This way, + * most common APM BIOS problems that lead to protection errors + * etc will have at least some level of being contained... + * + * In short, if something bad happens, at least we have a choice + * of just killing the apm thread.. + */ +static int __init apm_init(void) +{ + struct proc_dir_entry *apm_proc; + int ret; + int i; + + dmi_check_system(apm_dmi_table); + + if (apm_info.bios.version == 0) { + printk(KERN_INFO "apm: BIOS not found.\n"); + return -ENODEV; + } + printk(KERN_INFO + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); + if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { + printk(KERN_INFO "apm: no 32 bit BIOS support\n"); + return -ENODEV; + } + + if (allow_ints) + apm_info.allow_ints = 1; + if (broken_psr) + apm_info.get_power_status_broken = 1; + if (realmode_power_off) + apm_info.realmode_power_off = 1; + /* User can override, but default is to trust DMI */ + if (apm_disabled != -1) + apm_info.disabled = apm_disabled; + + /* + * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1 + * but is reportedly a 1.0 BIOS. 
+ */ + if (apm_info.bios.version == 0x001) + apm_info.bios.version = 0x100; + + /* BIOS < 1.2 doesn't set cseg_16_len */ + if (apm_info.bios.version < 0x102) + apm_info.bios.cseg_16_len = 0; /* 64k */ + + if (debug) { + printk(KERN_INFO "apm: entry %x:%lx cseg16 %x dseg %x", + apm_info.bios.cseg, apm_info.bios.offset, + apm_info.bios.cseg_16, apm_info.bios.dseg); + if (apm_info.bios.version > 0x100) + printk(" cseg len %x, dseg len %x", + apm_info.bios.cseg_len, + apm_info.bios.dseg_len); + if (apm_info.bios.version > 0x101) + printk(" cseg16 len %x", apm_info.bios.cseg_16_len); + printk("\n"); + } + + if (apm_info.disabled) { + printk(KERN_NOTICE "apm: disabled on user request.\n"); + return -ENODEV; + } + if ((num_online_cpus() > 1) && !power_off && !smp) { + printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + if (PM_IS_ACTIVE()) { + printk(KERN_NOTICE "apm: overridden by ACPI.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + pm_active = 1; + + /* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ + set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); + _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); + + apm_bios_entry.offset = apm_info.bios.offset; + apm_bios_entry.segment = APM_CS; + + for (i = 0; i < NR_CPUS; i++) { + set_base(per_cpu(cpu_gdt_table, i)[APM_CS >> 3], + __va((unsigned long)apm_info.bios.cseg << 4)); + set_base(per_cpu(cpu_gdt_table, i)[APM_CS_16 >> 3], + __va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_base(per_cpu(cpu_gdt_table, i)[APM_DS >> 3], + __va((unsigned long)apm_info.bios.dseg << 4)); +#ifndef APM_RELAX_SEGMENTS + if (apm_info.bios.version == 0x100) { +#endif + /* For ASUS motherboard, Award BIOS rev 110 (and others?) 
*/ + _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS >> 3], 64 * 1024 - 1); + /* For some unknown machine. */ + _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS_16 >> 3], 64 * 1024 - 1); + /* For the DEC Hinote Ultra CT475 (and others?) */ + _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_DS >> 3], 64 * 1024 - 1); +#ifndef APM_RELAX_SEGMENTS + } else { + _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS >> 3], + (apm_info.bios.cseg_len - 1) & 0xffff); + _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS_16 >> 3], + (apm_info.bios.cseg_16_len - 1) & 0xffff); + _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_DS >> 3], + (apm_info.bios.dseg_len - 1) & 0xffff); + /* workaround for broken BIOSes */ + if (apm_info.bios.cseg_len <= apm_info.bios.offset) + _set_limit((char *)&per_cpu(cpu_gdt_table, i)[APM_CS >> 3], 64 * 1024 -1); + if (apm_info.bios.dseg_len <= 0x40) { /* 0x40 * 4kB == 64kB */ + /* for the BIOS that assumes granularity = 1 */ + per_cpu(cpu_gdt_table, i)[APM_DS >> 3].b |= 0x800000; + printk(KERN_NOTICE "apm: we set the granularity of dseg.\n"); + } + } +#endif + } + + apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); + if (apm_proc) + apm_proc->owner = THIS_MODULE; + + ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD); + if (ret < 0) { + printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n"); + return -ENOMEM; + } + + if (num_online_cpus() > 1 && !smp ) { + printk(KERN_NOTICE + "apm: disabled - APM is not SMP safe (power off active).\n"); + return 0; + } + + misc_register(&apm_device); + + if (HZ != 100) + idle_period = (idle_period * HZ) / 100; + if (idle_threshold < 100) { + original_pm_idle = pm_idle; + pm_idle = apm_cpu_idle; + set_pm_idle = 1; + } + + return 0; +} + +static void __exit apm_exit(void) +{ + int error; + + if (set_pm_idle) { + pm_idle = original_pm_idle; + /* + * We are about to unload the current idle thread pm callback + * (pm_idle), Wait for all processors to update cached/local + * 
copies of pm_idle before proceeding. + */ + cpu_idle_wait(); + } + if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 0); + if (error) + apm_error("disengage power management", error); + } + misc_deregister(&apm_device); + remove_proc_entry("apm", NULL); + if (power_off) + pm_power_off = NULL; + exit_kapmd = 1; + while (kapmd_running) + schedule(); + pm_active = 0; +} + +module_init(apm_init); +module_exit(apm_exit); + +MODULE_AUTHOR("Stephen Rothwell"); +MODULE_DESCRIPTION("Advanced Power Management"); +MODULE_LICENSE("GPL"); +module_param(debug, bool, 0644); +MODULE_PARM_DESC(debug, "Enable debug mode"); +module_param(power_off, bool, 0444); +MODULE_PARM_DESC(power_off, "Enable power off"); +module_param(bounce_interval, int, 0444); +MODULE_PARM_DESC(bounce_interval, + "Set the number of ticks to ignore suspend bounces"); +module_param(allow_ints, bool, 0444); +MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls"); +module_param(broken_psr, bool, 0444); +MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call"); +module_param(realmode_power_off, bool, 0444); +MODULE_PARM_DESC(realmode_power_off, + "Switch to real mode before powering off"); +module_param(idle_threshold, int, 0444); +MODULE_PARM_DESC(idle_threshold, + "System idle percentage above which to make APM BIOS idle calls"); +module_param(idle_period, int, 0444); +MODULE_PARM_DESC(idle_period, + "Period (in sec/100) over which to caculate the idle percentage"); +module_param(smp, bool, 0444); +MODULE_PARM_DESC(smp, + "Set this to enable APM use on an SMP platform. 
Use with caution on older systems"); +MODULE_ALIAS_MISCDEV(APM_MINOR_DEV); diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c new file mode 100644 index 000000000000..36d66e2077d0 --- /dev/null +++ b/arch/i386/kernel/asm-offsets.c @@ -0,0 +1,72 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#include <linux/sched.h> +#include <linux/signal.h> +#include <linux/personality.h> +#include <linux/suspend.h> +#include <asm/ucontext.h> +#include "sigframe.h" +#include <asm/fixmap.h> +#include <asm/processor.h> +#include <asm/thread_info.h> + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +#define OFFSET(sym, str, mem) \ + DEFINE(sym, offsetof(struct str, mem)); + +void foo(void) +{ + OFFSET(SIGCONTEXT_eax, sigcontext, eax); + OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); + OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); + OFFSET(SIGCONTEXT_edx, sigcontext, edx); + OFFSET(SIGCONTEXT_esi, sigcontext, esi); + OFFSET(SIGCONTEXT_edi, sigcontext, edi); + OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); + OFFSET(SIGCONTEXT_esp, sigcontext, esp); + OFFSET(SIGCONTEXT_eip, sigcontext, eip); + BLANK(); + + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); + OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); + + OFFSET(TI_task, thread_info, task); + OFFSET(TI_exec_domain, thread_info, exec_domain); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); + OFFSET(TI_cpu, thread_info, cpu); + 
OFFSET(TI_preempt_count, thread_info, preempt_count); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_restart_block, thread_info, restart_block); + BLANK(); + + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); + OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); + BLANK(); + + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + + /* Offset from the sysenter stack to tss.esp0 */ + DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - + sizeof(struct tss_struct)); + + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL)); +} diff --git a/arch/i386/kernel/bootflag.c b/arch/i386/kernel/bootflag.c new file mode 100644 index 000000000000..4c30ed01f4e1 --- /dev/null +++ b/arch/i386/kernel/bootflag.c @@ -0,0 +1,99 @@ +/* + * Implement 'Simple Boot Flag Specification 2.0' + */ + + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/acpi.h> +#include <asm/io.h> + +#include <linux/mc146818rtc.h> + + +#define SBF_RESERVED (0x78) +#define SBF_PNPOS (1<<0) +#define SBF_BOOTING (1<<1) +#define SBF_DIAG (1<<2) +#define SBF_PARITY (1<<7) + + +int sbf_port __initdata = -1; /* set via acpi_boot_init() */ + + +static int __init parity(u8 v) +{ + int x = 0; + int i; + + for(i=0;i<8;i++) + { + x^=(v&1); + v>>=1; + } + return x; +} + +static void __init sbf_write(u8 v) +{ + unsigned long flags; + if(sbf_port != -1) + { + v &= ~SBF_PARITY; + if(!parity(v)) + v|=SBF_PARITY; + + printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); + + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(v, sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + } +} + +static u8 __init sbf_read(void) +{ + u8 v; + unsigned long flags; + if(sbf_port == -1) + return 0; + spin_lock_irqsave(&rtc_lock, flags); + v = 
CMOS_READ(sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + return v; +} + +static int __init sbf_value_valid(u8 v) +{ + if(v&SBF_RESERVED) /* Reserved bits */ + return 0; + if(!parity(v)) + return 0; + return 1; +} + +static int __init sbf_init(void) +{ + u8 v; + if(sbf_port == -1) + return 0; + v = sbf_read(); + if(!sbf_value_valid(v)) + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); + + v &= ~SBF_RESERVED; + v &= ~SBF_BOOTING; + v &= ~SBF_DIAG; +#if defined(CONFIG_ISAPNP) + v |= SBF_PNPOS; +#endif + sbf_write(v); + return 0; +} + +module_init(sbf_init); diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile new file mode 100644 index 000000000000..010aecfffbc1 --- /dev/null +++ b/arch/i386/kernel/cpu/Makefile @@ -0,0 +1,19 @@ +# +# Makefile for x86-compatible CPU details and quirks +# + +obj-y := common.o proc.o + +obj-y += amd.o +obj-y += cyrix.o +obj-y += centaur.o +obj-y += transmeta.o +obj-y += intel.o intel_cacheinfo.o +obj-y += rise.o +obj-y += nexgen.o +obj-y += umc.o + +obj-$(CONFIG_X86_MCE) += mcheck/ + +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c new file mode 100644 index 000000000000..ae94585d0445 --- /dev/null +++ b/arch/i386/kernel/cpu/amd.c @@ -0,0 +1,249 @@ +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/mm.h> +#include <asm/io.h> +#include <asm/processor.h> + +#include "cpu.h" + +/* + * B step AMD K6 before B 9730xxxx have hardware bugs that can cause + * misexecution of code under Linux. Owners of such processors should + * contact AMD for precise details and a CPU swap. + * + * See http://www.multimania.com/poulot/k6bug.html + * http://www.amd.com/K6/k6docs/revgd.html + * + * The following test is erm.. interesting. AMD neglected to up + * the chip setting when fixing the bug but they also tweaked some + * performance at the same time.. 
+ */ + +extern void vide(void); +__asm__(".align 4\nvide: ret"); + +static void __init init_amd(struct cpuinfo_x86 *c) +{ + u32 l, h; + int mbytes = num_physpages >> (20-PAGE_SHIFT); + int r; + + /* + * FIXME: We should handle the K5 here. Set up the write + * range and also turn on MSR 83 bits 4 and 31 (write alloc, + * no bus pipeline) + */ + + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + clear_bit(0*32+31, c->x86_capability); + + r = get_model_name(c); + + switch(c->x86) + { + case 4: + /* + * General Systems BIOSen alias the cpu frequency registers + * of the Elan at 0x000df000. Unfortunately, one of the Linux + * drivers subsequently pokes it, and changes the CPU speed. + * Workaround : Remove the unneeded alias. + */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model==9 || c->x86_model == 10) { + if (inl (CBAR) & CBAR_ENB) + outl (0 | CBAR_KEY, CBAR); + } + break; + case 5: + if( c->x86_model < 6 ) + { + /* Based on AMD doc 20734R - June 2000 */ + if ( c->x86_model == 0 ) { + clear_bit(X86_FEATURE_APIC, c->x86_capability); + set_bit(X86_FEATURE_PGE, c->x86_capability); + } + break; + } + + if ( c->x86_model == 6 && c->x86_mask == 1 ) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + unsigned long d, d2; + + printk(KERN_INFO "AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. 
+ */ + + n = K6_BUG_LOOP; + f_vide = vide; + rdtscl(d); + while (n--) + f_vide(); + rdtscl(d2); + d = d2-d; + + /* Knock these two lines out if it debugs out ok */ + printk(KERN_INFO "AMD K6 stepping B detected - "); + /* -- cut here -- */ + if (d > 20*K6_BUG_LOOP) + printk("system stability may be impaired when more than 32 MB are used.\n"); + else + printk("probably OK (after B9730xxxx).\n"); + printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model== 8 && c->x86_mask < 8)) { + /* We can only write allocate on the low 508Mb */ + if(mbytes>508) + mbytes=508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF)==0) { + unsigned long flags; + l=(1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + break; + } + + if ((c->x86_model == 8 && c->x86_mask >7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if(mbytes>4092) + mbytes=4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000)==0) { + unsigned long flags; + l=((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + /* Set MTRR capability flag if appropriate */ + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_bit(X86_FEATURE_K6_MTRR, c->x86_capability); + break; + } + break; + + case 6: /* An Athlon/Duron */ + + /* Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. 
+ */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + rdmsr(MSR_K7_HWCR, l, h); + l &= ~0x00008000; + wrmsr(MSR_K7_HWCR, l, h); + set_bit(X86_FEATURE_XMM, c->x86_capability); + } + } + + /* It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask>=1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, + ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + break; + } + + switch (c->x86) { + case 15: + set_bit(X86_FEATURE_K8, c->x86_capability); + break; + case 6: + set_bit(X86_FEATURE_K7, c->x86_capability); + break; + } + + display_cacheinfo(c); + detect_ht(c); + +#ifdef CONFIG_X86_HT + /* AMD dual core looks like HT but isn't really. Hide it from the + scheduler. This works around problems with the domain scheduler. + Also probably gives slightly better scheduling and disables + SMT nice which is harmful on dual core. + TBD tune the domain scheduler for dual core. 
*/ + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + smp_num_siblings = 1; +#endif + + if (cpuid_eax(0x80000000) >= 0x80000008) { + c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + if (c->x86_num_cores & (c->x86_num_cores - 1)) + c->x86_num_cores = 1; + } +} + +static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) +{ + /* AMD errata T13 (order #21922) */ + if ((c->x86 == 6)) { + if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ + size = 64; + if (c->x86_model == 4 && + (c->x86_mask==0 || c->x86_mask==1)) /* Tbird rev A1/A2 */ + size = 256; + } + return size; +} + +static struct cpu_dev amd_cpu_dev __initdata = { + .c_vendor = "AMD", + .c_ident = { "AuthenticAMD" }, + .c_models = { + { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = + { + [3] = "486 DX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB", + [14] = "Am5x86-WT", + [15] = "Am5x86-WB" + } + }, + }, + .c_init = init_amd, + .c_identify = generic_identify, + .c_size_cache = amd_size_cache, +}; + +int __init amd_init_cpu(void) +{ + cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; + return 0; +} + +//early_arch_initcall(amd_init_cpu); diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c new file mode 100644 index 000000000000..394814e57672 --- /dev/null +++ b/arch/i386/kernel/cpu/centaur.c @@ -0,0 +1,476 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/e820.h> +#include "cpu.h" + +#ifdef CONFIG_X86_OOSTORE + +static u32 __init power2(u32 x) +{ + u32 s=1; + while(s<=x) + s<<=1; + return s>>=1; +} + + +/* + * Set up an actual MCR + */ + +static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key) +{ + u32 lo, hi; + + hi = base & ~0xFFF; + lo = ~(size-1); /* Size is a power of 2 so this makes a mask */ + lo &= ~0xFFF; /* Remove the ctrl value bits */ + lo |= key; /* Attribute we wish to set */ + wrmsr(reg+MSR_IDT_MCR0, lo, 
hi); + mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */ +} + +/* + * Figure what we can cover with MCR's + * + * Shortcut: We know you can't put 4Gig of RAM on a winchip + */ + +static u32 __init ramtop(void) /* 16388 */ +{ + int i; + u32 top = 0; + u32 clip = 0xFFFFFFFFUL; + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + + if (e820.map[i].addr > 0xFFFFFFFFUL) + continue; + /* + * Don't MCR over reserved space. Ignore the ISA hole + * we frob around that catastrophe already + */ + + if (e820.map[i].type == E820_RESERVED) + { + if(e820.map[i].addr >= 0x100000UL && e820.map[i].addr < clip) + clip = e820.map[i].addr; + continue; + } + start = e820.map[i].addr; + end = e820.map[i].addr + e820.map[i].size; + if (start >= end) + continue; + if (end > top) + top = end; + } + /* Everything below 'top' should be RAM except for the ISA hole. + Because of the limited MCR's we want to map NV/ACPI into our + MCR range for gunk in RAM + + Clip might cause us to MCR insufficient RAM but that is an + acceptable failure mode and should only bite obscure boxes with + a VESA hole at 15Mb + + The second case Clip sometimes kicks in is when the EBDA is marked + as reserved. Again we fail safe with reasonable results + */ + + if(top>clip) + top=clip; + + return top; +} + +/* + * Compute a set of MCR's to give maximum coverage + */ + +static int __init centaur_mcr_compute(int nr, int key) +{ + u32 mem = ramtop(); + u32 root = power2(mem); + u32 base = root; + u32 top = root; + u32 floor = 0; + int ct = 0; + + while(ct<nr) + { + u32 fspace = 0; + + /* + * Find the largest block we will fill going upwards + */ + + u32 high = power2(mem-top); + + /* + * Find the largest block we will fill going downwards + */ + + u32 low = base/2; + + /* + * Don't fill below 1Mb going downwards as there + * is an ISA hole in the way. 
+ */ + + if(base <= 1024*1024) + low = 0; + + /* + * See how much space we could cover by filling below + * the ISA hole + */ + + if(floor == 0) + fspace = 512*1024; + else if(floor ==512*1024) + fspace = 128*1024; + + /* And forget ROM space */ + + /* + * Now install the largest coverage we get + */ + + if(fspace > high && fspace > low) + { + centaur_mcr_insert(ct, floor, fspace, key); + floor += fspace; + } + else if(high > low) + { + centaur_mcr_insert(ct, top, high, key); + top += high; + } + else if(low > 0) + { + base -= low; + centaur_mcr_insert(ct, base, low, key); + } + else break; + ct++; + } + /* + * We loaded ct values. We now need to set the mask. The caller + * must do this bit. + */ + + return ct; +} + +static void __init centaur_create_optimal_mcr(void) +{ + int i; + /* + * Allocate up to 6 mcrs to mark as much of ram as possible + * as write combining and weak write ordered. + * + * To experiment with: Linux never uses stack operations for + * mmio spaces so we could globally enable stack operation wc + * + * Load the registers with type 31 - full write combining, all + * writes weakly ordered. + */ + int used = centaur_mcr_compute(6, 31); + + /* + * Wipe unused MCRs + */ + + for(i=used;i<8;i++) + wrmsr(MSR_IDT_MCR0+i, 0, 0); +} + +static void __init winchip2_create_optimal_mcr(void) +{ + u32 lo, hi; + int i; + + /* + * Allocate up to 6 mcrs to mark as much of ram as possible + * as write combining, weak store ordered. + * + * Load the registers with type 25 + * 8 - weak write ordering + * 16 - weak read ordering + * 1 - write combining + */ + + int used = centaur_mcr_compute(6, 25); + + /* + * Mark the registers we are using. + */ + + rdmsr(MSR_IDT_MCR_CTRL, lo, hi); + for(i=0;i<used;i++) + lo|=1<<(9+i); + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); + + /* + * Wipe unused MCRs + */ + + for(i=used;i<8;i++) + wrmsr(MSR_IDT_MCR0+i, 0, 0); +} + +/* + * Handle the MCR key on the Winchip 2. 
+ */ + +static void __init winchip2_unprotect_mcr(void) +{ + u32 lo, hi; + u32 key; + + rdmsr(MSR_IDT_MCR_CTRL, lo, hi); + lo&=~0x1C0; /* blank bits 8-6 */ + key = (lo>>17) & 7; + lo |= key<<6; /* replace with unlock key */ + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); +} + +static void __init winchip2_protect_mcr(void) +{ + u32 lo, hi; + + rdmsr(MSR_IDT_MCR_CTRL, lo, hi); + lo&=~0x1C0; /* blank bits 8-6 */ + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); +} +#endif /* CONFIG_X86_OOSTORE */ + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ + +static void __init init_c3(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Centaur Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr (MSR_VIA_FCR, lo, hi); + lo |= ACE_FCR; /* enable ACE unit */ + wrmsr (MSR_VIA_FCR, lo, hi); + printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); + } + + /* enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr (MSR_VIA_RNG, lo, hi); + lo |= RNG_ENABLE; /* enable RNG unit */ + wrmsr (MSR_VIA_RNG, lo, hi); + printk(KERN_INFO "CPU: Enabled h/w RNG\n"); + } + + /* store Centaur Extended Feature Flags as + * word 5 of the CPU capability bit array + */ + c->x86_capability[5] = cpuid_edx(0xC0000001); + } + + /* Cyrix III family needs CX8 & PGE explicity enabled. */ + if (c->x86_model >=6 && c->x86_model <= 9) { + rdmsr (MSR_VIA_FCR, lo, hi); + lo |= (1<<1 | 1<<7); + wrmsr (MSR_VIA_FCR, lo, hi); + set_bit(X86_FEATURE_CX8, c->x86_capability); + } + + /* Before Nehemiah, the C3's had 3dNOW! 
*/ + if (c->x86_model >=6 && c->x86_model <9) + set_bit(X86_FEATURE_3DNOW, c->x86_capability); + + get_model_name(c); + display_cacheinfo(c); +} + +static void __init init_centaur(struct cpuinfo_x86 *c) +{ + enum { + ECX8=1<<1, + EIERRINT=1<<2, + DPM=1<<3, + DMCE=1<<4, + DSTPCLK=1<<5, + ELINEAR=1<<6, + DSMC=1<<7, + DTLOCK=1<<8, + EDCTLB=1<<8, + EMMX=1<<9, + DPDC=1<<11, + EBRPRED=1<<12, + DIC=1<<13, + DDC=1<<14, + DNA=1<<15, + ERETSTK=1<<16, + E2MMX=1<<19, + EAMD3D=1<<20, + }; + + char *name; + u32 fcr_set=0; + u32 fcr_clr=0; + u32 lo,hi,newlo; + u32 aa,bb,cc,dd; + + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + clear_bit(0*32+31, c->x86_capability); + + switch (c->x86) { + + case 5: + switch(c->x86_model) { + case 4: + name="C6"; + fcr_set=ECX8|DSMC|EDCTLB|EMMX|ERETSTK; + fcr_clr=DPDC; + printk(KERN_NOTICE "Disabling bugged TSC.\n"); + clear_bit(X86_FEATURE_TSC, c->x86_capability); +#ifdef CONFIG_X86_OOSTORE + centaur_create_optimal_mcr(); + /* Enable + write combining on non-stack, non-string + write combining on string, all types + weak write ordering + + The C6 original lacks weak read order + + Note 0x120 is write only on Winchip 1 */ + + wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); +#endif + break; + case 8: + switch(c->x86_mask) { + default: + name="2"; + break; + case 7 ... 9: + name="2A"; + break; + case 10 ... 
15: + name="2B"; + break; + } + fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D; + fcr_clr=DPDC; +#ifdef CONFIG_X86_OOSTORE + winchip2_unprotect_mcr(); + winchip2_create_optimal_mcr(); + rdmsr(MSR_IDT_MCR_CTRL, lo, hi); + /* Enable + write combining on non-stack, non-string + write combining on string, all types + weak write ordering + */ + lo|=31; + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); + winchip2_protect_mcr(); +#endif + break; + case 9: + name="3"; + fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D; + fcr_clr=DPDC; +#ifdef CONFIG_X86_OOSTORE + winchip2_unprotect_mcr(); + winchip2_create_optimal_mcr(); + rdmsr(MSR_IDT_MCR_CTRL, lo, hi); + /* Enable + write combining on non-stack, non-string + write combining on string, all types + weak write ordering + */ + lo|=31; + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); + winchip2_protect_mcr(); +#endif + break; + case 10: + name="4"; + /* no info on the WC4 yet */ + break; + default: + name="??"; + } + + rdmsr(MSR_IDT_FCR1, lo, hi); + newlo=(lo|fcr_set) & (~fcr_clr); + + if (newlo!=lo) { + printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", lo, newlo ); + wrmsr(MSR_IDT_FCR1, newlo, hi ); + } else { + printk(KERN_INFO "Centaur FCR is 0x%X\n",lo); + } + /* Emulate MTRRs using Centaur's MCR. */ + set_bit(X86_FEATURE_CENTAUR_MCR, c->x86_capability); + /* Report CX8 */ + set_bit(X86_FEATURE_CX8, c->x86_capability); + /* Set 3DNow! on Winchip 2 and above. */ + if (c->x86_model >=8) + set_bit(X86_FEATURE_3DNOW, c->x86_capability); + /* See if we can find out some more. */ + if ( cpuid_eax(0x80000000) >= 0x80000005 ) { + /* Yes, we can. */ + cpuid(0x80000005,&aa,&bb,&cc,&dd); + /* Add L1 data and code cache sizes. */ + c->x86_cache_size = (cc>>24)+(dd>>24); + } + sprintf( c->x86_model_id, "WinChip %s", name ); + break; + + case 6: + init_c3(c); + break; + } +} + +static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size) +{ + /* VIA C3 CPUs (670-68F) need further shifting. 
*/ + if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) + size >>= 8; + + /* VIA also screwed up Nehemiah stepping 1, and made + it return '65KB' instead of '64KB' + - Note, it seems this may only be in engineering samples. */ + if ((c->x86==6) && (c->x86_model==9) && (c->x86_mask==1) && (size==65)) + size -=1; + + return size; +} + +static struct cpu_dev centaur_cpu_dev __initdata = { + .c_vendor = "Centaur", + .c_ident = { "CentaurHauls" }, + .c_init = init_centaur, + .c_size_cache = centaur_size_cache, +}; + +int __init centaur_init_cpu(void) +{ + cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev; + return 0; +} + +//early_arch_initcall(centaur_init_cpu); diff --git a/arch/i386/kernel/cpu/changelog b/arch/i386/kernel/cpu/changelog new file mode 100644 index 000000000000..cef76b80a710 --- /dev/null +++ b/arch/i386/kernel/cpu/changelog @@ -0,0 +1,63 @@ +/* + * Enhanced CPU type detection by Mike Jagdis, Patrick St. Jean + * and Martin Mares, November 1997. + * + * Force Cyrix 6x86(MX) and M II processors to report MTRR capability + * and Cyrix "coma bug" recognition by + * Zoltán Böszörményi <zboszor@mail.externet.hu> February 1999. + * + * Force Centaur C6 processors to report MTRR capability. + * Bart Hartgers <bart@etpmod.phys.tue.nl>, May 1999. + * + * Intel Mobile Pentium II detection fix. Sean Gilley, June 1999. + * + * IDT Winchip tweaks, misc clean ups. + * Dave Jones <davej@suse.de>, August 1999 + * + * Better detection of Centaur/IDT WinChip models. + * Bart Hartgers <bart@etpmod.phys.tue.nl>, August 1999. 
+ * + * Cleaned up cache-detection code + * Dave Jones <davej@suse.de>, October 1999 + * + * Added proper L2 cache detection for Coppermine + * Dragan Stancevic <visitor@valinux.com>, October 1999 + * + * Added the original array for capability flags but forgot to credit + * myself :) (~1998) Fixed/cleaned up some cpu_model_info and other stuff + * Jauder Ho <jauderho@carumba.com>, January 2000 + * + * Detection for Celeron coppermine, identify_cpu() overhauled, + * and a few other clean ups. + * Dave Jones <davej@suse.de>, April 2000 + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * Added proper Cascades CPU and L2 cache detection for Cascades + * and 8-way type cache happy bunch from Intel:^) + * Dragan Stancevic <visitor@valinux.com>, May 2000 + * + * Forward port AMD Duron errata T13 from 2.2.17pre + * Dave Jones <davej@suse.de>, August 2000 + * + * Forward port lots of fixes/improvements from 2.2.18pre + * Cyrix III, Pentium IV support. + * Dave Jones <davej@suse.de>, October 2000 + * + * Massive cleanup of CPU detection and bug handling; + * Transmeta CPU detection, + * H. Peter Anvin <hpa@zytor.com>, November 2000 + * + * VIA C3 Support. + * Dave Jones <davej@suse.de>, March 2001 + * + * AMD Athlon/Duron/Thunderbird bluesmoke support. + * Dave Jones <davej@suse.de>, April 2001. + * + * CacheSize bug workaround updates for AMD, Intel & VIA Cyrix. + * Dave Jones <davej@suse.de>, September, October 2001. 
+ * + */ + diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c new file mode 100644 index 000000000000..ebd5d8247faa --- /dev/null +++ b/arch/i386/kernel/cpu/common.c @@ -0,0 +1,634 @@ +#include <linux/init.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <asm/semaphore.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/msr.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <mach_apic.h> +#endif + +#include "cpu.h" + +DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]); +EXPORT_PER_CPU_SYMBOL(cpu_gdt_table); + +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); + +static int cachesize_override __initdata = -1; +static int disable_x86_fxsr __initdata = 0; +static int disable_x86_serial_nr __initdata = 1; + +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; + +extern void mcheck_init(struct cpuinfo_x86 *c); + +extern int disable_pse; + +static void default_init(struct cpuinfo_x86 * c) +{ + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. 
It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +} + +static struct cpu_dev default_cpu = { + .c_init = default_init, +}; +static struct cpu_dev * this_cpu = &default_cpu; + +static int __init cachesize_setup(char *str) +{ + get_option (&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +int __init get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q; + + if (cpuid_eax(0x80000000) < 0x80000004) + return 0; + + v = (unsigned int *) c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* Intel chips right-justify this string for some dumb reason; + undo that brain damage */ + p = q = &c->x86_model_id[0]; + while ( *p == ' ' ) + p++; + if ( p != q ) { + while ( *p ) + *q++ = *p++; + while ( q <= &c->x86_model_id[48] ) + *q++ = '\0'; /* Zero-pad the rest */ + } + + return 1; +} + + +void __init display_cacheinfo(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ecx, edx, l2size; + + n = cpuid_eax(0x80000000); + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size=(ecx>>24)+(edx>>24); + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + ecx = cpuid_ecx(0x80000006); + l2size = ecx >> 16; + + /* do processor-specific cache resizing */ + if (this_cpu->c_size_cache) + l2size = this_cpu->c_size_cache(c,l2size); + + /* Allow user to override all this if necessary. 
*/ + if (cachesize_override != -1) + l2size = cachesize_override; + + if ( l2size == 0 ) + return; /* Again, no L2 cache is possible */ + + c->x86_cache_size = l2size; + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + l2size, ecx & 0xFF); +} + +/* Naming convention should be: <Name> [(<Codename>)] */ +/* This table only is used unless init_<vendor>() below doesn't set it; */ +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ + +/* Look up CPU names by table lookup. */ +static char __init *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if ( c->x86_model >= 16 ) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + + +void __init get_cpu_vendor(struct cpuinfo_x86 *c, int early) +{ + char *v = c->x86_vendor_id; + int i; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (cpu_devs[i]) { + if (!strcmp(v,cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v,cpu_devs[i]->c_ident[1]))) { + c->x86_vendor = i; + if (!early) + this_cpu = cpu_devs[i]; + break; + } + } + } +} + + +static int __init x86_fxsr_setup(char * s) +{ + disable_x86_fxsr = 1; + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + asm("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + + +/* Probe for the CPUID instruction */ +static int __init have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +/* Do minimum CPU detection early. 
+ Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. + The others are not touched to avoid unwanted side effects. */ +static void __init early_cpu_detect(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + c->x86_cache_alignment = 32; + + if (!have_cpuid_p()) + return; + + /* Get vendor name */ + cpuid(0x00000000, &c->cpuid_level, + (int *)&c->x86_vendor_id[0], + (int *)&c->x86_vendor_id[8], + (int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c, 1); + + c->x86 = 4; + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = (tfms >> 8) & 15; + c->x86_model = (tfms >> 4) & 15; + if (c->x86 == 0xf) { + c->x86 += (tfms >> 20) & 0xff; + c->x86_model += ((tfms >> 16) & 0xF) << 4; + } + c->x86_mask = tfms & 15; + if (cap0 & (1<<19)) + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; + } + + early_intel_workaround(c); +} + +void __init generic_identify(struct cpuinfo_x86 * c) +{ + u32 tfms, xlvl; + int junk; + + if (have_cpuid_p()) { + /* Get vendor name */ + cpuid(0x00000000, &c->cpuid_level, + (int *)&c->x86_vendor_id[0], + (int *)&c->x86_vendor_id[8], + (int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c, 0); + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if ( c->cpuid_level >= 0x00000001 ) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &junk, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + c->x86 = (tfms >> 8) & 15; + c->x86_model = (tfms >> 4) & 15; + if (c->x86 == 0xf) { + c->x86 += (tfms >> 20) & 0xff; + c->x86_model += ((tfms >> 16) & 0xF) << 4; + } + c->x86_mask = tfms & 15; + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ( (xlvl & 0xffff0000) == 0x80000000 ) { + if ( xlvl >= 0x80000001 ) 
{ + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + if ( xlvl >= 0x80000004 ) + get_model_name(c); /* Default name */ + } + } +} + +static void __init squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { + /* Disable processor serial number */ + unsigned long lo,hi; + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_bit(X86_FEATURE_PN, c->x86_capability); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); + + + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +void __init identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_model = c->x86_mask = 0; /* So far unknown... 
*/ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_num_cores = 1; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + if (!have_cpuid_p()) { + /* First of all, decide if this is a 486 or higher */ + /* It's a 486 if we can modify the AC flag */ + if ( flag_is_changeable_p(X86_EFLAGS_AC) ) + c->x86 = 4; + else + c->x86 = 3; + } + + generic_identify(c); + + printk(KERN_DEBUG "CPU: After generic identify, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + + if (this_cpu->c_identify) { + this_cpu->c_identify(c); + + printk(KERN_DEBUG "CPU: After vendor identify, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + } + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* + * The vendor-specific functions might have changed features. Now + * we do "generic changes." + */ + + /* TSC disabled? */ + if ( tsc_disable ) + clear_bit(X86_FEATURE_TSC, c->x86_capability); + + /* FXSR disabled? */ + if (disable_x86_fxsr) { + clear_bit(X86_FEATURE_FXSR, c->x86_capability); + clear_bit(X86_FEATURE_XMM, c->x86_capability); + } + + if (disable_pse) + clear_bit(X86_FEATURE_PSE, c->x86_capability); + + /* If the model name is still unset, do table lookup. */ + if ( !c->x86_model_id[0] ) { + char *p; + p = table_lookup_model(c); + if ( p ) + strcpy(c->x86_model_id, p); + else + /* Last resort... 
*/ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86_vendor, c->x86_model); + } + + /* Now the feature flags better reflect actual CPU features! */ + + printk(KERN_DEBUG "CPU: After all inits, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if ( c != &boot_cpu_data ) { + /* AND the already accumulated flags with these */ + for ( i = 0 ; i < NCAPINTS ; i++ ) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + + /* Init Machine Check Exception if available. */ +#ifdef CONFIG_X86_MCE + mcheck_init(c); +#endif +} + +#ifdef CONFIG_X86_HT +void __init detect_ht(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + int index_lsb, index_msb, tmp; + int cpu = smp_processor_id(); + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + + cpuid(1, &eax, &ebx, &ecx, &edx); + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1 ) { + index_lsb = 0; + index_msb = 31; + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + smp_num_siblings = 1; + return; + } + tmp = smp_num_siblings; + while ((tmp & 1) == 0) { + tmp >>=1 ; + index_lsb++; + } + tmp = smp_num_siblings; + while ((tmp & 0x80000000 ) == 0) { + tmp <<=1 ; + index_msb--; + } + if (index_lsb != index_msb ) + index_msb++; + phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); + + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + phys_proc_id[cpu]); + } +} +#endif + +void __init print_cpu_info(struct cpuinfo_x86 *c) +{ + char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) + vendor = this_cpu->c_vendor; + else if (c->cpuid_level 
>= 0) + vendor = c->x86_vendor_id; + + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) + printk("%s ", vendor); + + if (!c->x86_model_id[0]) + printk("%d86", c->x86); + else + printk("%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(" stepping %02x\n", c->x86_mask); + else + printk("\n"); +} + +cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; + +/* This is hacky. :) + * We're emulating future behavior. + * In the future, the cpu-specific init functions will be called implicitly + * via the magic of initcalls. + * They will insert themselves into the cpu_devs structure. + * Then, when cpu_init() is called, we can just iterate over that array. + */ + +extern int intel_cpu_init(void); +extern int cyrix_init_cpu(void); +extern int nsc_init_cpu(void); +extern int amd_init_cpu(void); +extern int centaur_init_cpu(void); +extern int transmeta_init_cpu(void); +extern int rise_init_cpu(void); +extern int nexgen_init_cpu(void); +extern int umc_init_cpu(void); + +void __init early_cpu_init(void) +{ + intel_cpu_init(); + cyrix_init_cpu(); + nsc_init_cpu(); + amd_init_cpu(); + centaur_init_cpu(); + transmeta_init_cpu(); + rise_init_cpu(); + nexgen_init_cpu(); + umc_init_cpu(); + early_cpu_detect(); + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* pse is not compatible with on-the-fly unmapping, + * disable it even if the cpus claim to support it. + */ + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; +#endif +} +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. 
+ */ +void __init cpu_init (void) +{ + int cpu = smp_processor_id(); + struct tss_struct * t = &per_cpu(init_tss, cpu); + struct thread_struct *thread = &current->thread; + __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu); + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_tsc || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (tsc_disable && cpu_has_tsc) { + printk(KERN_NOTICE "Disabling TSC...\n"); + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + set_in_cr4(X86_CR4_TSD); + } + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table, + GDT_SIZE); + + /* Set up GDT entry for 16bit stack */ + *(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |= + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | + (CPU_16BIT_STACK_SIZE - 1); + + cpu_gdt_descr[cpu].size = GDT_SIZE - 1; + cpu_gdt_descr[cpu].address = + (unsigned long)&per_cpu(cpu_gdt_table, cpu); + + /* + * Set up the per-thread TLS descriptor cache: + */ + memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu), + GDT_ENTRY_TLS_ENTRIES * 8); + + __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu])); + __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); + + /* + * Delete NT + */ + __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + if (current->mm) + BUG(); + enter_lazy_tlb(&init_mm, current); + + load_esp0(t, thread); + set_tss_desc(cpu,t); + load_TR_desc(); + load_LDT(&init_mm.context); + + /* Set up doublefault TSS pointer in the GDT */ + 
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); + + /* Clear %fs and %gs. */ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ + +#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); + + CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); + +#undef CD + + /* + * Force FPU initialization: + */ + current_thread_info()->status = 0; + clear_used_math(); + mxcsr_feature_mask_init(); +} diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h new file mode 100644 index 000000000000..5a1d4f163e84 --- /dev/null +++ b/arch/i386/kernel/cpu/cpu.h @@ -0,0 +1,30 @@ + +struct cpu_model_info { + int vendor; + int family; + char *model_names[16]; +}; + +/* attempt to consolidate cpu attributes */ +struct cpu_dev { + char * c_vendor; + + /* some have two possibilities for cpuid string */ + char * c_ident[2]; + + struct cpu_model_info c_models[4]; + + void (*c_init)(struct cpuinfo_x86 * c); + void (*c_identify)(struct cpuinfo_x86 * c); + unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); +}; + +extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; + +extern int get_model_name(struct cpuinfo_x86 *c); +extern void display_cacheinfo(struct cpuinfo_x86 *c); + +extern void generic_identify(struct cpuinfo_x86 * c); + +extern void early_intel_workaround(struct cpuinfo_x86 *c); + diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig new file mode 100644 index 000000000000..f25ffd74235c --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/Kconfig @@ -0,0 +1,231 @@ +# +# CPU Frequency scaling +# + +menu "CPU Frequency scaling" + +source "drivers/cpufreq/Kconfig" + +if CPU_FREQ + +comment "CPUFreq processor drivers" + +config X86_ACPI_CPUFREQ + tristate "ACPI Processor P-States driver" + select CPU_FREQ_TABLE + depends on ACPI_PROCESSOR + help + This driver adds a CPUFreq driver which utilizes the ACPI + Processor Performance States. 
+ + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config ELAN_CPUFREQ + tristate "AMD Elan" + select CPU_FREQ_TABLE + depends on X86_ELAN + ---help--- + This adds the CPUFreq driver for AMD Elan SC400 and SC410 + processors. + + You need to specify the processor maximum speed as boot + parameter: elanfreq=maxspeed (in kHz) or as module + parameter "max_freq". + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_POWERNOW_K6 + tristate "AMD Mobile K6-2/K6-3 PowerNow!" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for mobile AMD K6-2+ and mobile + AMD K6-3+ processors. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_POWERNOW_K7 + tristate "AMD Mobile Athlon/Duron PowerNow!" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for mobile AMD K7 mobile processors. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_POWERNOW_K7_ACPI + bool + depends on X86_POWERNOW_K7 && ACPI_PROCESSOR + depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) + default y + +config X86_POWERNOW_K8 + tristate "AMD Opteron/Athlon64 PowerNow!" + select CPU_FREQ_TABLE + depends on EXPERIMENTAL + help + This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_POWERNOW_K8_ACPI + bool + depends on X86_POWERNOW_K8 && ACPI_PROCESSOR + depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) + default y + +config X86_GX_SUSPMOD + tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" + help + This add the CPUFreq driver for NatSemi Geode processors which + support suspend modulation. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. 
+ +config X86_SPEEDSTEP_CENTRINO + tristate "Intel Enhanced SpeedStep" + select CPU_FREQ_TABLE + select X86_SPEEDSTEP_CENTRINO_TABLE if (!X86_SPEEDSTEP_CENTRINO_ACPI) + help + This adds the CPUFreq driver for Enhanced SpeedStep enabled + mobile CPUs. This means Intel Pentium M (Centrino) CPUs. However, + you also need to say Y to "Use ACPI tables to decode..." below + [which might imply enabling ACPI] if you want to use this driver + on non-Banias CPUs. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_SPEEDSTEP_CENTRINO_ACPI + bool "Use ACPI tables to decode valid frequency/voltage pairs" + depends on X86_SPEEDSTEP_CENTRINO && ACPI_PROCESSOR + depends on !(X86_SPEEDSTEP_CENTRINO = y && ACPI_PROCESSOR = m) + default y + help + Use primarily the information provided in the BIOS ACPI tables + to determine valid CPU frequency and voltage pairings. It is + required for the driver to work on non-Banias CPUs. + + If in doubt, say Y. + +config X86_SPEEDSTEP_CENTRINO_TABLE + bool "Built-in tables for Banias CPUs" + depends on X86_SPEEDSTEP_CENTRINO + default y + help + Use built-in tables for Banias CPUs if ACPI encoding + is not available. + + If in doubt, say N. + +config X86_SPEEDSTEP_ICH + tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for certain mobile Intel Pentium III + (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all + mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, + ICH3 or ICH4 southbridge. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. 
+ +config X86_SPEEDSTEP_SMI + tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" + select CPU_FREQ_TABLE + depends on EXPERIMENTAL + help + This adds the CPUFreq driver for certain mobile Intel Pentium III + (Coppermine), all mobile Intel Pentium III-M (Tualatin) + on systems which have an Intel 440BX/ZX/MX southbridge. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_P4_CLOCKMOD + tristate "Intel Pentium 4 clock modulation" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for Intel Pentium 4 / XEON + processors. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_CPUFREQ_NFORCE2 + tristate "nVidia nForce2 FSB changing" + depends on EXPERIMENTAL + help + This adds the CPUFreq driver for FSB changing on nVidia nForce2 + platforms. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_LONGRUN + tristate "Transmeta LongRun" + help + This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors + which support LongRun. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +config X86_LONGHAUL + tristate "VIA Cyrix III Longhaul" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for VIA Samuel/CyrixIII, + VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T + processors. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +comment "shared options" + +config X86_ACPI_CPUFREQ_PROC_INTF + bool "/proc/acpi/processor/../performance interface (deprecated)" + depends on PROC_FS + depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K7_ACPI || X86_POWERNOW_K8_ACPI + help + This enables the deprecated /proc/acpi/processor/../performance + interface. While it is helpful for debugging, the generic, + cross-architecture cpufreq interfaces should be used. + + If in doubt, say N. 
+ +config X86_SPEEDSTEP_LIB + tristate + default X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD + +config X86_SPEEDSTEP_RELAXED_CAP_CHECK + bool "Relaxed speedstep capability checks" + depends on (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) + help + Don't perform all checks for a speedstep capable system which would + normally be done. Some ancient or strange systems, though speedstep + capable, don't always indicate that they are speedstep capable. This + option lets the probing code bypass some of those checks if the + parameter "relaxed_check=1" is passed to the module. + +endif # CPU_FREQ + +endmenu diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile new file mode 100644 index 000000000000..a922e97aeedd --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/Makefile @@ -0,0 +1,14 @@ +obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o +obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o +obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o +obj-$(CONFIG_X86_LONGHAUL) += longhaul.o +obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o +obj-$(CONFIG_X86_LONGRUN) += longrun.o +obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o +obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o +obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o +obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o +obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o +obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c new file mode 100644 index 000000000000..963e17aa205d --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -0,0 +1,537 @@ +/* + * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.3 $) + * + * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright 
(C) 2002 - 2004 Dominik Brodowski <linux@brodo.de> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <asm/io.h> +#include <asm/delay.h> +#include <asm/uaccess.h> + +#include <linux/acpi.h> +#include <acpi/processor.h> + +#include "speedstep-est-common.h" + +#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg) + +MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); +MODULE_DESCRIPTION("ACPI Processor P-States Driver"); +MODULE_LICENSE("GPL"); + + +struct cpufreq_acpi_io { + struct acpi_processor_performance acpi_data; + struct cpufreq_frequency_table *freq_table; + unsigned int resume; +}; + +static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS]; + +static struct cpufreq_driver acpi_cpufreq_driver; + +static int +acpi_processor_write_port( + u16 port, + u8 bit_width, + u32 value) +{ + if (bit_width <= 8) { + outb(value, port); + } else if (bit_width <= 16) { + outw(value, port); + } else if (bit_width <= 32) { + outl(value, port); + } else { + return -ENODEV; + } + return 0; +} + +static int +acpi_processor_read_port( + u16 port, + u8 bit_width, + u32 *ret) +{ + *ret = 0; + if (bit_width <= 8) { + *ret = inb(port); + } else if (bit_width <= 16) { + *ret = inw(port); + } else if (bit_width <= 32) { + *ret = inl(port); + } else { + return -ENODEV; + } + return 0; +} + +static int +acpi_processor_set_performance ( + struct cpufreq_acpi_io *data, + unsigned int cpu, + int state) +{ + u16 port = 0; + u8 bit_width = 0; + int ret = 0; + u32 value = 0; + int i = 0; + struct cpufreq_freqs cpufreq_freqs; + cpumask_t saved_mask; + int retval; + + dprintk("acpi_processor_set_performance\n"); + + /* + * TBD: Use something other than set_cpus_allowed. + * As set_cpus_allowed is a bit racy, + * with any other set_cpus_allowed for this process. 
+ */ + saved_mask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + if (smp_processor_id() != cpu) { + return (-EAGAIN); + } + + if (state == data->acpi_data.state) { + if (unlikely(data->resume)) { + dprintk("Called after resume, resetting to P%d\n", state); + data->resume = 0; + } else { + dprintk("Already at target state (P%d)\n", state); + retval = 0; + goto migrate_end; + } + } + + dprintk("Transitioning from P%d to P%d\n", + data->acpi_data.state, state); + + /* cpufreq frequency struct */ + cpufreq_freqs.cpu = cpu; + cpufreq_freqs.old = data->freq_table[data->acpi_data.state].frequency; + cpufreq_freqs.new = data->freq_table[state].frequency; + + /* notify cpufreq */ + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); + + /* + * First we write the target state's 'control' value to the + * control_register. + */ + + port = data->acpi_data.control_register.address; + bit_width = data->acpi_data.control_register.bit_width; + value = (u32) data->acpi_data.states[state].control; + + dprintk("Writing 0x%08x to port 0x%04x\n", value, port); + + ret = acpi_processor_write_port(port, bit_width, value); + if (ret) { + dprintk("Invalid port width 0x%04x\n", bit_width); + retval = ret; + goto migrate_end; + } + + /* + * Then we read the 'status_register' and compare the value with the + * target state's 'status' to make sure the transition was successful. + * Note that we'll poll for up to 1ms (100 cycles of 10us) before + * giving up. 
+ */ + + port = data->acpi_data.status_register.address; + bit_width = data->acpi_data.status_register.bit_width; + + dprintk("Looking for 0x%08x from port 0x%04x\n", + (u32) data->acpi_data.states[state].status, port); + + for (i=0; i<100; i++) { + ret = acpi_processor_read_port(port, bit_width, &value); + if (ret) { + dprintk("Invalid port width 0x%04x\n", bit_width); + retval = ret; + goto migrate_end; + } + if (value == (u32) data->acpi_data.states[state].status) + break; + udelay(10); + } + + /* notify cpufreq */ + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + + if (value != (u32) data->acpi_data.states[state].status) { + unsigned int tmp = cpufreq_freqs.new; + cpufreq_freqs.new = cpufreq_freqs.old; + cpufreq_freqs.old = tmp; + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + printk(KERN_WARNING "acpi-cpufreq: Transition failed\n"); + retval = -ENODEV; + goto migrate_end; + } + + dprintk("Transition successful after %d microseconds\n", i * 10); + + data->acpi_data.state = state; + + retval = 0; +migrate_end: + set_cpus_allowed(current, saved_mask); + return (retval); +} + + +static int +acpi_cpufreq_target ( + struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; + unsigned int next_state = 0; + unsigned int result = 0; + + dprintk("acpi_cpufreq_setpolicy\n"); + + result = cpufreq_frequency_table_target(policy, + data->freq_table, + target_freq, + relation, + &next_state); + if (result) + return (result); + + result = acpi_processor_set_performance (data, policy->cpu, next_state); + + return (result); +} + + +static int +acpi_cpufreq_verify ( + struct cpufreq_policy *policy) +{ + unsigned int result = 0; + struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; + + dprintk("acpi_cpufreq_verify\n"); + + result = cpufreq_frequency_table_verify(policy, + data->freq_table); 
+ + return (result); +} + + +static unsigned long +acpi_cpufreq_guess_freq ( + struct cpufreq_acpi_io *data, + unsigned int cpu) +{ + if (cpu_khz) { + /* search the closest match to cpu_khz */ + unsigned int i; + unsigned long freq; + unsigned long freqn = data->acpi_data.states[0].core_frequency * 1000; + + for (i=0; i < (data->acpi_data.state_count - 1); i++) { + freq = freqn; + freqn = data->acpi_data.states[i+1].core_frequency * 1000; + if ((2 * cpu_khz) > (freqn + freq)) { + data->acpi_data.state = i; + return (freq); + } + } + data->acpi_data.state = data->acpi_data.state_count - 1; + return (freqn); + } else + /* assume CPU is at P0... */ + data->acpi_data.state = 0; + return data->acpi_data.states[0].core_frequency * 1000; + +} + + +/* + * acpi_processor_cpu_init_pdc_est - let BIOS know about the SMP capabilities + * of this driver + * @perf: processor-specific acpi_io_data struct + * @cpu: CPU being initialized + * + * To avoid issues with legacy OSes, some BIOSes require to be informed of + * the SMP capabilities of OS P-state driver. Here we set the bits in _PDC + * accordingly, for Enhanced Speedstep. Actual call to _PDC is done in + * driver/acpi/processor.c + */ +static void +acpi_processor_cpu_init_pdc_est( + struct acpi_processor_performance *perf, + unsigned int cpu, + struct acpi_object_list *obj_list + ) +{ + union acpi_object *obj; + u32 *buf; + struct cpuinfo_x86 *c = cpu_data + cpu; + dprintk("acpi_processor_cpu_init_pdc_est\n"); + + if (!cpu_has(c, X86_FEATURE_EST)) + return; + + /* Initialize pdc. It will be used later. 
*/ + if (!obj_list) + return; + + if (!(obj_list->count && obj_list->pointer)) + return; + + obj = obj_list->pointer; + if ((obj->buffer.length == 12) && obj->buffer.pointer) { + buf = (u32 *)obj->buffer.pointer; + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = ACPI_PDC_EST_CAPABILITY_SMP; + perf->pdc = obj_list; + } + return; +} + + +/* CPU specific PDC initialization */ +static void +acpi_processor_cpu_init_pdc( + struct acpi_processor_performance *perf, + unsigned int cpu, + struct acpi_object_list *obj_list + ) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + dprintk("acpi_processor_cpu_init_pdc\n"); + perf->pdc = NULL; + if (cpu_has(c, X86_FEATURE_EST)) + acpi_processor_cpu_init_pdc_est(perf, cpu, obj_list); + return; +} + + +static int +acpi_cpufreq_cpu_init ( + struct cpufreq_policy *policy) +{ + unsigned int i; + unsigned int cpu = policy->cpu; + struct cpufreq_acpi_io *data; + unsigned int result = 0; + + union acpi_object arg0 = {ACPI_TYPE_BUFFER}; + u32 arg0_buf[3]; + struct acpi_object_list arg_list = {1, &arg0}; + + dprintk("acpi_cpufreq_cpu_init\n"); + /* setup arg_list for _PDC settings */ + arg0.buffer.length = 12; + arg0.buffer.pointer = (u8 *) arg0_buf; + + data = kmalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL); + if (!data) + return (-ENOMEM); + memset(data, 0, sizeof(struct cpufreq_acpi_io)); + + acpi_io_data[cpu] = data; + + acpi_processor_cpu_init_pdc(&data->acpi_data, cpu, &arg_list); + result = acpi_processor_register_performance(&data->acpi_data, cpu); + data->acpi_data.pdc = NULL; + + if (result) + goto err_free; + + if (is_const_loops_cpu(cpu)) { + acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; + } + + /* capability check */ + if (data->acpi_data.state_count <= 1) { + dprintk("No P-States\n"); + result = -ENODEV; + goto err_unreg; + } + if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_SYSTEM_IO) || + (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_SYSTEM_IO)) { + dprintk("Unsupported address space 
[%d, %d]\n", + (u32) (data->acpi_data.control_register.space_id), + (u32) (data->acpi_data.status_register.space_id)); + result = -ENODEV; + goto err_unreg; + } + + /* alloc freq_table */ + data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * (data->acpi_data.state_count + 1), GFP_KERNEL); + if (!data->freq_table) { + result = -ENOMEM; + goto err_unreg; + } + + /* detect transition latency */ + policy->cpuinfo.transition_latency = 0; + for (i=0; i<data->acpi_data.state_count; i++) { + if ((data->acpi_data.states[i].transition_latency * 1000) > policy->cpuinfo.transition_latency) + policy->cpuinfo.transition_latency = data->acpi_data.states[i].transition_latency * 1000; + } + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + + /* The current speed is unknown and not detectable by ACPI... */ + policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); + + /* table init */ + for (i=0; i<=data->acpi_data.state_count; i++) + { + data->freq_table[i].index = i; + if (i<data->acpi_data.state_count) + data->freq_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; + else + data->freq_table[i].frequency = CPUFREQ_TABLE_END; + } + + result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); + if (result) { + goto err_freqfree; + } + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + printk(KERN_INFO "acpi-cpufreq: CPU%u - ACPI performance management activated.\n", + cpu); + for (i = 0; i < data->acpi_data.state_count; i++) + dprintk(" %cP%d: %d MHz, %d mW, %d uS\n", + (i == data->acpi_data.state?'*':' '), i, + (u32) data->acpi_data.states[i].core_frequency, + (u32) data->acpi_data.states[i].power, + (u32) data->acpi_data.states[i].transition_latency); + + cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); + return (result); + + err_freqfree: + kfree(data->freq_table); + err_unreg: + acpi_processor_unregister_performance(&data->acpi_data, cpu); + err_free: + kfree(data); + acpi_io_data[cpu] = NULL; 
+ + return (result); +} + + +static int +acpi_cpufreq_cpu_exit ( + struct cpufreq_policy *policy) +{ + struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; + + + dprintk("acpi_cpufreq_cpu_exit\n"); + + if (data) { + cpufreq_frequency_table_put_attr(policy->cpu); + acpi_io_data[policy->cpu] = NULL; + acpi_processor_unregister_performance(&data->acpi_data, policy->cpu); + kfree(data); + } + + return (0); +} + +static int +acpi_cpufreq_resume ( + struct cpufreq_policy *policy) +{ + struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; + + + dprintk("acpi_cpufreq_resume\n"); + + data->resume = 1; + + return (0); +} + + +static struct freq_attr* acpi_cpufreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver acpi_cpufreq_driver = { + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .resume = acpi_cpufreq_resume, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, +}; + + +static int __init +acpi_cpufreq_init (void) +{ + int result = 0; + + dprintk("acpi_cpufreq_init\n"); + + result = cpufreq_register_driver(&acpi_cpufreq_driver); + + return (result); +} + + +static void __exit +acpi_cpufreq_exit (void) +{ + dprintk("acpi_cpufreq_exit\n"); + + cpufreq_unregister_driver(&acpi_cpufreq_driver); + + return; +} + + +late_initcall(acpi_cpufreq_init); +module_exit(acpi_cpufreq_exit); + +MODULE_ALIAS("acpi"); diff --git a/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c new file mode 100644 index 000000000000..04a405345203 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -0,0 +1,457 @@ +/* + * (C) 2004 Sebastian Witt <se.witt@gmx.net> + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon reverse engineered information + * + * BIG FAT DISCLAIMER: Work in progress code. 
Possibly *dangerous* + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/pci.h> +#include <linux/delay.h> + +#define NFORCE2_XTAL 25 +#define NFORCE2_BOOTFSB 0x48 +#define NFORCE2_PLLENABLE 0xa8 +#define NFORCE2_PLLREG 0xa4 +#define NFORCE2_PLLADR 0xa0 +#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) + +#define NFORCE2_MIN_FSB 50 +#define NFORCE2_SAFE_DISTANCE 50 + +/* Delay in ms between FSB changes */ +//#define NFORCE2_DELAY 10 + +/* nforce2_chipset: + * FSB is changed using the chipset + */ +static struct pci_dev *nforce2_chipset_dev; + +/* fid: + * multiplier * 10 + */ +static int fid = 0; + +/* min_fsb, max_fsb: + * minimum and maximum FSB (= FSB at boot time) + */ +static int min_fsb = 0; +static int max_fsb = 0; + +MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); +MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); +MODULE_LICENSE("GPL"); + +module_param(fid, int, 0444); +module_param(min_fsb, int, 0444); + +MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); +MODULE_PARM_DESC(min_fsb, + "Minimum FSB to use, if not defined: current FSB - 50"); + +#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg) + +/* + * nforce2_calc_fsb - calculate FSB + * @pll: PLL value + * + * Calculates FSB from PLL value + */ +static int nforce2_calc_fsb(int pll) +{ + unsigned char mul, div; + + mul = (pll >> 8) & 0xff; + div = pll & 0xff; + + if (div > 0) + return NFORCE2_XTAL * mul / div; + + return 0; +} + +/* + * nforce2_calc_pll - calculate PLL value + * @fsb: FSB + * + * Calculate PLL value for given FSB + */ +static int nforce2_calc_pll(unsigned int fsb) +{ + unsigned char xmul, xdiv; + unsigned char mul = 0, div = 0; + int tried = 0; + + /* Try to calculate multiplier and divider up to 4 times */ + while (((mul == 0) || (div == 0)) && (tried <= 3)) { + for (xdiv = 1; xdiv <= 0x80; xdiv++) + for (xmul = 1; xmul <= 0xfe; xmul++) + if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == + fsb + tried) { + mul = xmul; + div = xdiv; + } + tried++; + } + + if ((mul == 0) || (div == 0)) + return -1; + + return NFORCE2_PLL(mul, div); +} + +/* + * nforce2_write_pll - write PLL value to chipset + * @pll: PLL value + * + * Writes new FSB PLL value to chipset + */ +static void nforce2_write_pll(int pll) +{ + int temp; + + /* Set the pll addr. 
to 0x00 */ + temp = 0x00; + pci_write_config_dword(nforce2_chipset_dev, NFORCE2_PLLADR, temp); + + /* Now write the value in all 64 registers */ + for (temp = 0; temp <= 0x3f; temp++) { + pci_write_config_dword(nforce2_chipset_dev, + NFORCE2_PLLREG, pll); + } + + return; +} + +/* + * nforce2_fsb_read - Read FSB + * + * Read FSB from chipset + * If bootfsb != 0, return FSB at boot-time + */ +static unsigned int nforce2_fsb_read(int bootfsb) +{ + struct pci_dev *nforce2_sub5; + u32 fsb, temp = 0; + + + /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ + nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, + 0x01EF, + PCI_ANY_ID, + PCI_ANY_ID, + NULL); + + if (!nforce2_sub5) + return 0; + + pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); + fsb /= 1000000; + + /* Check if PLL register is already set */ + pci_read_config_byte(nforce2_chipset_dev, + NFORCE2_PLLENABLE, (u8 *)&temp); + + if(bootfsb || !temp) + return fsb; + + /* Use PLL register FSB value */ + pci_read_config_dword(nforce2_chipset_dev, + NFORCE2_PLLREG, &temp); + fsb = nforce2_calc_fsb(temp); + + return fsb; +} + +/* + * nforce2_set_fsb - set new FSB + * @fsb: New FSB + * + * Sets new FSB + */ +static int nforce2_set_fsb(unsigned int fsb) +{ + u32 pll, temp = 0; + unsigned int tfsb; + int diff; + + if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { + printk(KERN_ERR "cpufreq: FSB %d is out of range!\n", fsb); + return -EINVAL; + } + + tfsb = nforce2_fsb_read(0); + if (!tfsb) { + printk(KERN_ERR "cpufreq: Error while reading the FSB\n"); + return -EINVAL; + } + + /* First write? 
Then set actual value */ + pci_read_config_byte(nforce2_chipset_dev, + NFORCE2_PLLENABLE, (u8 *)&temp); + if (!temp) { + pll = nforce2_calc_pll(tfsb); + + if (pll < 0) + return -EINVAL; + + nforce2_write_pll(pll); + } + + /* Enable write access */ + temp = 0x01; + pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8)temp); + + diff = tfsb - fsb; + + if (!diff) + return 0; + + while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { + if (diff < 0) + tfsb++; + else + tfsb--; + + /* Calculate the PLL reg. value */ + if ((pll = nforce2_calc_pll(tfsb)) == -1) + return -EINVAL; + + nforce2_write_pll(pll); +#ifdef NFORCE2_DELAY + mdelay(NFORCE2_DELAY); +#endif + } + + temp = 0x40; + pci_write_config_byte(nforce2_chipset_dev, NFORCE2_PLLADR, (u8)temp); + + return 0; +} + +/** + * nforce2_get - get the CPU frequency + * @cpu: CPU number + * + * Returns the CPU frequency + */ +static unsigned int nforce2_get(unsigned int cpu) +{ + if (cpu) + return 0; + return nforce2_fsb_read(0) * fid * 100; +} + +/** + * nforce2_target - set a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * Sets a new CPUFreq policy. 
+ */ +static int nforce2_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ +// unsigned long flags; + struct cpufreq_freqs freqs; + unsigned int target_fsb; + + if ((target_freq > policy->max) || (target_freq < policy->min)) + return -EINVAL; + + target_fsb = target_freq / (fid * 100); + + freqs.old = nforce2_get(policy->cpu); + freqs.new = target_fsb * fid * 100; + freqs.cpu = 0; /* Only one CPU on nForce2 plattforms */ + + if (freqs.old == freqs.new) + return 0; + + dprintk(KERN_INFO "cpufreq: Old CPU frequency %d kHz, new %d kHz\n", + freqs.old, freqs.new); + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Disable IRQs */ + //local_irq_save(flags); + + if (nforce2_set_fsb(target_fsb) < 0) + printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n", + target_fsb); + else + dprintk(KERN_INFO "cpufreq: Changed FSB successfully to %d\n", + target_fsb); + + /* Enable IRQs */ + //local_irq_restore(flags); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return 0; +} + +/** + * nforce2_verify - verifies a new CPUFreq policy + * @policy: new policy + */ +static int nforce2_verify(struct cpufreq_policy *policy) +{ + unsigned int fsb_pol_max; + + fsb_pol_max = policy->max / (fid * 100); + + if (policy->min < (fsb_pol_max * fid * 100)) + policy->max = (fsb_pol_max + 1) * fid * 100; + + cpufreq_verify_within_limits(policy, + policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static int nforce2_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int fsb; + unsigned int rfid; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + /* Get current FSB */ + fsb = nforce2_fsb_read(0); + + if (!fsb) + return -EIO; + + /* FIX: Get FID from CPU */ + if (!fid) { + if (!cpu_khz) { + printk(KERN_WARNING + "cpufreq: cpu_khz not set, can't calculate multiplier!\n"); + return -ENODEV; + } + + fid = cpu_khz / (fsb * 100); + rfid = fid % 5; + + if (rfid) { + if (rfid > 2) + fid += 5 
- rfid; + else + fid -= rfid; + } + } + + printk(KERN_INFO "cpufreq: FSB currently at %i MHz, FID %d.%d\n", fsb, + fid / 10, fid % 10); + + /* Set maximum FSB to FSB at boot time */ + max_fsb = nforce2_fsb_read(1); + + if(!max_fsb) + return -EIO; + + if (!min_fsb) + min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; + + if (min_fsb < NFORCE2_MIN_FSB) + min_fsb = NFORCE2_MIN_FSB; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = min_fsb * fid * 100; + policy->cpuinfo.max_freq = max_fsb * fid * 100; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = nforce2_get(policy->cpu); + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + + return 0; +} + +static int nforce2_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver nforce2_driver = { + .name = "nforce2", + .verify = nforce2_verify, + .target = nforce2_target, + .get = nforce2_get, + .init = nforce2_cpu_init, + .exit = nforce2_cpu_exit, + .owner = THIS_MODULE, +}; + +/** + * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic + * + * Detects nForce2 A2 and C1 stepping + * + */ +static unsigned int nforce2_detect_chipset(void) +{ + u8 revision; + + nforce2_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, + PCI_DEVICE_ID_NVIDIA_NFORCE2, + PCI_ANY_ID, + PCI_ANY_ID, + NULL); + + if (nforce2_chipset_dev == NULL) + return -ENODEV; + + pci_read_config_byte(nforce2_chipset_dev, PCI_REVISION_ID, &revision); + + printk(KERN_INFO "cpufreq: Detected nForce2 chipset revision %X\n", + revision); + printk(KERN_INFO + "cpufreq: FSB changing is maybe unstable and can lead to crashes and data loss.\n"); + + return 0; +} + +/** + * nforce2_init - initializes the nForce2 CPUFreq driver + * + * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported + * devices, -EINVAL on problems during initiatization, and zero on + * success. 
+ */ +static int __init nforce2_init(void) +{ + /* TODO: do we need to detect the processor? */ + + /* detect chipset */ + if (nforce2_detect_chipset()) { + printk(KERN_ERR "cpufreq: No nForce2 chipset.\n"); + return -ENODEV; + } + + return cpufreq_register_driver(&nforce2_driver); +} + +/** + * nforce2_exit - unregisters cpufreq module + * + * Unregisters nForce2 FSB change support. + */ +static void __exit nforce2_exit(void) +{ + cpufreq_unregister_driver(&nforce2_driver); +} + +module_init(nforce2_init); +module_exit(nforce2_exit); + diff --git a/arch/i386/kernel/cpu/cpufreq/elanfreq.c b/arch/i386/kernel/cpu/cpufreq/elanfreq.c new file mode 100644 index 000000000000..3f7caa4ae6d6 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/elanfreq.c @@ -0,0 +1,312 @@ +/* + * elanfreq: cpufreq driver for the AMD ELAN family + * + * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de> + * + * Parts of this code are (c) Sven Geggus <sven@geggus.net> + * + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/cpufreq.h> + +#include <asm/msr.h> +#include <asm/timex.h> +#include <asm/io.h> + +#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ +#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ + +/* Module parameter */ +static int max_freq; + +struct s_elan_multiplier { + int clock; /* frequency in kHz */ + int val40h; /* PMU Force Mode register */ + int val80h; /* CPU Clock Speed Register */ +}; + +/* + * It is important that the frequencies + * are listed in ascending order here! 
+ */ +struct s_elan_multiplier elan_multiplier[] = { + {1000, 0x02, 0x18}, + {2000, 0x02, 0x10}, + {4000, 0x02, 0x08}, + {8000, 0x00, 0x00}, + {16000, 0x00, 0x02}, + {33000, 0x00, 0x04}, + {66000, 0x01, 0x04}, + {99000, 0x01, 0x05} +}; + +static struct cpufreq_frequency_table elanfreq_table[] = { + {0, 1000}, + {1, 2000}, + {2, 4000}, + {3, 8000}, + {4, 16000}, + {5, 33000}, + {6, 66000}, + {7, 99000}, + {0, CPUFREQ_TABLE_END}, +}; + + +/** + * elanfreq_get_cpu_frequency: determine current cpu speed + * + * Finds out at which frequency the CPU of the Elan SOC runs + * at the moment. Frequencies from 1 to 33 MHz are generated + * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" + * and have the rest of the chip running with 33 MHz. + */ + +static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) +{ + u8 clockspeed_reg; /* Clock Speed Register */ + + local_irq_disable(); + outb_p(0x80,REG_CSCIR); + clockspeed_reg = inb_p(REG_CSCDR); + local_irq_enable(); + + if ((clockspeed_reg & 0xE0) == 0xE0) { return 0; } + + /* Are we in CPU clock multiplied mode (66/99 MHz)? */ + if ((clockspeed_reg & 0xE0) == 0xC0) { + if ((clockspeed_reg & 0x01) == 0) { + return 66000; + } else { + return 99000; + } + } + + /* 33 MHz is not 32 MHz... */ + if ((clockspeed_reg & 0xE0)==0xA0) + return 33000; + + return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); +} + + +/** + * elanfreq_set_cpu_frequency: Change the CPU core frequency + * @cpu: cpu number + * @freq: frequency in kHz + * + * This function takes a frequency value and changes the CPU frequency + * according to this. Note that the frequency has to be checked by + * elanfreq_validatespeed() for correctness! + * + * There is no return value. 
+ */ + +static void elanfreq_set_cpu_state (unsigned int state) { + + struct cpufreq_freqs freqs; + + freqs.old = elanfreq_get_cpu_frequency(0); + freqs.new = elan_multiplier[state].clock; + freqs.cpu = 0; /* elanfreq.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",elan_multiplier[state].clock); + + + /* + * Access to the Elan's internal registers is indexed via + * 0x22: Chip Setup & Control Register Index Register (CSCI) + * 0x23: Chip Setup & Control Register Data Register (CSCD) + * + */ + + /* + * 0x40 is the Power Management Unit's Force Mode Register. + * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) + */ + + local_irq_disable(); + outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00,REG_CSCDR); + local_irq_enable(); /* wait till internal pipelines and */ + udelay(1000); /* buffers have cleaned up */ + + local_irq_disable(); + + /* now, set the CPU clock speed register (0x80) */ + outb_p(0x80,REG_CSCIR); + outb_p(elan_multiplier[state].val80h,REG_CSCDR); + + /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ + outb_p(0x40,REG_CSCIR); + outb_p(elan_multiplier[state].val40h,REG_CSCDR); + udelay(10000); + local_irq_enable(); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +}; + + +/** + * elanfreq_validatespeed: test if frequency range is valid + * @policy: the policy to validate + * + * This function checks if a given frequency range in kHz is valid + * for the hardware supported by the driver. 
+ */ + +static int elanfreq_verify (struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); +} + +static int elanfreq_target (struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], target_freq, relation, &newstate)) + return -EINVAL; + + elanfreq_set_cpu_state(newstate); + + return 0; +} + + +/* + * Module init and exit code + */ + +static int elanfreq_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = cpu_data; + unsigned int i; + int result; + + /* capability check */ + if ((c->x86_vendor != X86_VENDOR_AMD) || + (c->x86 != 4) || (c->x86_model!=10)) + return -ENODEV; + + /* max freq */ + if (!max_freq) + max_freq = elanfreq_get_cpu_frequency(0); + + /* table init */ + for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + if (elanfreq_table[i].frequency > max_freq) + elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; + } + + /* cpuinfo and default policy values */ + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = elanfreq_get_cpu_frequency(0); + + result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); + if (result) + return (result); + + cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); + + return 0; +} + + +static int elanfreq_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + + +#ifndef MODULE +/** + * elanfreq_setup - elanfreq command line parameter parsing + * + * elanfreq command line parameter. Use: + * elanfreq=66000 + * to set the maximum CPU frequency to 66 MHz. Note that in + * case you do not give this boot parameter, the maximum + * frequency will fall back to _current_ CPU frequency which + * might be lower. If you build this as a module, use the + * max_freq module parameter instead. 
+ */ +static int __init elanfreq_setup(char *str) +{ + max_freq = simple_strtoul(str, &str, 0); + printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n"); + return 1; +} +__setup("elanfreq=", elanfreq_setup); +#endif + + +static struct freq_attr* elanfreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + + +static struct cpufreq_driver elanfreq_driver = { + .get = elanfreq_get_cpu_frequency, + .verify = elanfreq_verify, + .target = elanfreq_target, + .init = elanfreq_cpu_init, + .exit = elanfreq_cpu_exit, + .name = "elanfreq", + .owner = THIS_MODULE, + .attr = elanfreq_attr, +}; + + +static int __init elanfreq_init(void) +{ + struct cpuinfo_x86 *c = cpu_data; + + /* Test if we have the right hardware */ + if ((c->x86_vendor != X86_VENDOR_AMD) || + (c->x86 != 4) || (c->x86_model!=10)) + { + printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); + return -ENODEV; + } + + return cpufreq_register_driver(&elanfreq_driver); +} + + +static void __exit elanfreq_exit(void) +{ + cpufreq_unregister_driver(&elanfreq_driver); +} + + +module_param (max_freq, int, 0444); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); +MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); + +module_init(elanfreq_init); +module_exit(elanfreq_exit); + diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c new file mode 100644 index 000000000000..1a49adb1f4a6 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c @@ -0,0 +1,502 @@ +/* + * Cyrix MediaGX and NatSemi Geode Suspend Modulation + * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> + * (C) 2002 Hiroshi Miura <miura@da-cha.org> + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the 
Free Software Foundation + * + * The author(s) of this software shall not be held liable for damages + * of any nature resulting due to the use of this software. This + * software is provided AS-IS with no warranties. + * + * Theoretical note: + * + * (see Geode(tm) CS5530 manual (rev.4.1) page.56) + * + * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 + * is based on Suspend Modulation. + * + * Suspend Modulation works by asserting and de-asserting the SUSP# pin + * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# + * the CPU enters an idle state. GX1 stops its core clock when SUSP# is + * asserted, so power consumption is reduced. + * + * Suspend Modulation's OFF/ON durations are configurable + * with 'Suspend Modulation OFF Count Register' + * and 'Suspend Modulation ON Count Register'. + * These registers are 8-bit counters that represent the number of + * 32us intervals for which the SUSP# pin is asserted(ON)/de-asserted(OFF) + * to the processor. + * + * These counters define a ratio which is the effective frequency + * of operation of the system. + * + * F_eff = Fgx * OFF Count / (OFF Count + ON Count) + * + * 0 <= On Count, Off Count <= 255 + * + * From these limits, we can get register values + * + * off_duration + on_duration <= MAX_DURATION + * on_duration = off_duration * (stock_freq - freq) / freq + * + * off_duration = (freq * DURATION) / stock_freq + * on_duration = DURATION - off_duration + * + * + *--------------------------------------------------------------------------- + * + * ChangeLog: + * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org> + * - fix on/off register mistake + * - fix cpu_khz calc when it stops cpu modulation. + * + * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org> + * - rewrite for Cyrix MediaGX Cx5510/5520 and + * NatSemi Geode Cs5530(A). + * + * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com> + * - cs5530_mod patch for 2.4.19-rc1. 
+ * + *--------------------------------------------------------------------------- + * + * Todo + * Test on machines with 5510, 5530, 5530A + */ + +/************************************************************************ + * Suspend Modulation - Definitions * + ************************************************************************/ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cpufreq.h> +#include <linux/pci.h> +#include <asm/processor.h> +#include <asm/errno.h> + +/* PCI config registers, all at F0 */ +#define PCI_PMER1 0x80 /* power management enable register 1 */ +#define PCI_PMER2 0x81 /* power management enable register 2 */ +#define PCI_PMER3 0x82 /* power management enable register 3 */ +#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ +#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ +#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ +#define PCI_MODON 0x95 /* suspend modulation ON counter register */ +#define PCI_SUSCFG 0x96 /* suspend configuration register */ + +/* PMER1 bits */ +#define GPM (1<<0) /* global power management */ +#define GIT (1<<1) /* globally enable PM device idle timers */ +#define GTR (1<<2) /* globally enable IO traps */ +#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ +#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ + +/* SUSCFG bits */ +#define SUSMOD (1<<0) /* enable/disable suspend modulation */ +/* the belows support only with cs5530 (after rev.1.2)/cs5530A */ +#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ + /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ +#define SUSCFG (1<<2) /* enable powering down a GXLV processor. 
"Special 3Volt Suspend" mode */ +/* the belows support only with cs5530A */ +#define PWRSVE_ISA (1<<3) /* stop ISA clock */ +#define PWRSVE (1<<4) /* active idle */ + +struct gxfreq_params { + u8 on_duration; + u8 off_duration; + u8 pci_suscfg; + u8 pci_pmer1; + u8 pci_pmer2; + u8 pci_rev; + struct pci_dev *cs55x0; +}; + +static struct gxfreq_params *gx_params; +static int stock_freq; + +/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ +static int pci_busclk = 0; +module_param (pci_busclk, int, 0444); + +/* maximum duration for which the cpu may be suspended + * (32us * MAX_DURATION). If no parameter is given, this defaults + * to 255. + * Note that this leads to a maximum of 8 ms(!) where the CPU clock + * is suspended -- processing power is just 0.39% of what it used to be, + * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ +static int max_duration = 255; +module_param (max_duration, int, 0444); + +/* For the default policy, we want at least some processing power + * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV) + */ +#define POLICY_MIN_DIV 20 + + +#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "gx-suspmod", msg) + +/** + * we can detect a core multipiler from dir0_lsb + * from GX1 datasheet p.56, + * MULT[3:0]: + * 0000 = SYSCLK multiplied by 4 (test only) + * 0001 = SYSCLK multiplied by 10 + * 0010 = SYSCLK multiplied by 4 + * 0011 = SYSCLK multiplied by 6 + * 0100 = SYSCLK multiplied by 9 + * 0101 = SYSCLK multiplied by 5 + * 0110 = SYSCLK multiplied by 7 + * 0111 = SYSCLK multiplied by 8 + * of 33.3MHz + **/ +static int gx_freq_mult[16] = { + 4, 10, 4, 6, 9, 5, 7, 8, + 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +/**************************************************************** + * Low Level chipset interface * + ****************************************************************/ +static struct pci_device_id gx_chipset_tbl[] __initdata = { + { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, PCI_ANY_ID, PCI_ANY_ID }, + { 0, }, +}; + +/** + * gx_detect_chipset: + * + **/ +static __init struct pci_dev *gx_detect_chipset(void) +{ + struct pci_dev *gx_pci = NULL; + + /* check if CPU is a MediaGX or a Geode. */ + if ((current_cpu_data.x86_vendor != X86_VENDOR_NSC) && + (current_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { + dprintk("error: no MediaGX/Geode processor found!\n"); + return NULL; + } + + /* detect which companion chip is used */ + while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { + if ((pci_match_device (gx_chipset_tbl, gx_pci)) != NULL) { + return gx_pci; + } + } + + dprintk("error: no supported chipset found!\n"); + return NULL; +} + +/** + * gx_get_cpuspeed: + * + * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi Geode CPU runs. 
+ */ +static unsigned int gx_get_cpuspeed(unsigned int cpu) +{ + if ((gx_params->pci_suscfg & SUSMOD) == 0) + return stock_freq; + + return (stock_freq * gx_params->off_duration) + / (gx_params->on_duration + gx_params->off_duration); +} + +/** + * gx_validate_speed: + * determine current cpu speed + * +**/ + +static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, u8 *off_duration) +{ + unsigned int i; + u8 tmp_on, tmp_off; + int old_tmp_freq = stock_freq; + int tmp_freq; + + *off_duration=1; + *on_duration=0; + + for (i=max_duration; i>0; i--) { + tmp_off = ((khz * i) / stock_freq) & 0xff; + tmp_on = i - tmp_off; + tmp_freq = (stock_freq * tmp_off) / i; + /* if this relation is closer to khz, use this. If it's equal, + * prefer it, too - lower latency */ + if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { + *on_duration = tmp_on; + *off_duration = tmp_off; + old_tmp_freq = tmp_freq; + } + } + + return old_tmp_freq; +} + + +/** + * gx_set_cpuspeed: + * set cpu speed in khz. 
+ **/ + +static void gx_set_cpuspeed(unsigned int khz) +{ + u8 suscfg, pmer1; + unsigned int new_khz; + unsigned long flags; + struct cpufreq_freqs freqs; + + + freqs.cpu = 0; + freqs.old = gx_get_cpuspeed(0); + + new_khz = gx_validate_speed(khz, &gx_params->on_duration, &gx_params->off_duration); + + freqs.new = new_khz; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + local_irq_save(flags); + + if (new_khz != stock_freq) { /* if new khz == 100% of CPU speed, it is special case */ + switch (gx_params->cs55x0->device) { + case PCI_DEVICE_ID_CYRIX_5530_LEGACY: + pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; + /* FIXME: need to test other values -- Zwane,Miura */ + pci_write_config_byte(gx_params->cs55x0, PCI_IRQTC, 4); /* typical 2 to 4ms */ + pci_write_config_byte(gx_params->cs55x0, PCI_VIDTC, 100);/* typical 50 to 100ms */ + pci_write_config_byte(gx_params->cs55x0, PCI_PMER1, pmer1); + + if (gx_params->pci_rev < 0x10) { /* CS5530(rev 1.2, 1.3) */ + suscfg = gx_params->pci_suscfg | SUSMOD; + } else { /* CS5530A,B.. 
*/ + suscfg = gx_params->pci_suscfg | SUSMOD | PWRSVE; + } + break; + case PCI_DEVICE_ID_CYRIX_5520: + case PCI_DEVICE_ID_CYRIX_5510: + suscfg = gx_params->pci_suscfg | SUSMOD; + break; + default: + local_irq_restore(flags); + dprintk("fatal: try to set unknown chipset.\n"); + return; + } + } else { + suscfg = gx_params->pci_suscfg & ~(SUSMOD); + gx_params->off_duration = 0; + gx_params->on_duration = 0; + dprintk("suspend modulation disabled: cpu runs 100 percent speed.\n"); + } + + pci_write_config_byte(gx_params->cs55x0, PCI_MODOFF, gx_params->off_duration); + pci_write_config_byte(gx_params->cs55x0, PCI_MODON, gx_params->on_duration); + + pci_write_config_byte(gx_params->cs55x0, PCI_SUSCFG, suscfg); + pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); + + local_irq_restore(flags); + + gx_params->pci_suscfg = suscfg; + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", + gx_params->on_duration * 32, gx_params->off_duration * 32); + dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); +} + +/**************************************************************** + * High level functions * + ****************************************************************/ + +/* + * cpufreq_gx_verify: test if frequency range is valid + * + * This function checks if a given frequency range in kHz is valid + * for the hardware supported by the driver. + */ + +static int cpufreq_gx_verify(struct cpufreq_policy *policy) +{ + unsigned int tmp_freq = 0; + u8 tmp1, tmp2; + + if (!stock_freq || !policy) + return -EINVAL; + + policy->cpu = 0; + cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq); + + /* it needs to be assured that at least one supported frequency is + * within policy->min and policy->max. If it is not, policy->max + * needs to be increased until one freuqency is supported. + * policy->min may not be decreased, though. 
This way we guarantee a + * specific processing capacity. + */ + tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); + if (tmp_freq < policy->min) + tmp_freq += stock_freq / max_duration; + policy->min = tmp_freq; + if (policy->min > policy->max) + policy->max = tmp_freq; + tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); + if (tmp_freq > policy->max) + tmp_freq -= stock_freq / max_duration; + policy->max = tmp_freq; + if (policy->max < policy->min) + policy->max = policy->min; + cpufreq_verify_within_limits(policy, (stock_freq / max_duration), stock_freq); + + return 0; +} + +/* + * cpufreq_gx_target: + * + */ +static int cpufreq_gx_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + u8 tmp1, tmp2; + unsigned int tmp_freq; + + if (!stock_freq || !policy) + return -EINVAL; + + policy->cpu = 0; + + tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); + while (tmp_freq < policy->min) { + tmp_freq += stock_freq / max_duration; + tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); + } + while (tmp_freq > policy->max) { + tmp_freq -= stock_freq / max_duration; + tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); + } + + gx_set_cpuspeed(tmp_freq); + + return 0; +} + +static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int maxfreq, curfreq; + + if (!policy || policy->cpu != 0) + return -ENODEV; + + /* determine maximum frequency */ + if (pci_busclk) { + maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; + } else if (cpu_khz) { + maxfreq = cpu_khz; + } else { + maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; + } + stock_freq = maxfreq; + curfreq = gx_get_cpuspeed(0); + + dprintk("cpu max frequency is %d.\n", maxfreq); + dprintk("cpu current frequency is %dkHz.\n",curfreq); + + /* setup basic struct for cpufreq API */ + policy->cpu = 0; + + if (max_duration < POLICY_MIN_DIV) + policy->min = maxfreq / max_duration; + else + policy->min = maxfreq / POLICY_MIN_DIV; 
+ policy->max = maxfreq; + policy->cur = curfreq; + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.min_freq = maxfreq / max_duration; + policy->cpuinfo.max_freq = maxfreq; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + + return 0; +} + +/* + * cpufreq_gx_init: + * MediaGX/Geode GX initialize cpufreq driver + */ +static struct cpufreq_driver gx_suspmod_driver = { + .get = gx_get_cpuspeed, + .verify = cpufreq_gx_verify, + .target = cpufreq_gx_target, + .init = cpufreq_gx_cpu_init, + .name = "gx-suspmod", + .owner = THIS_MODULE, +}; + +static int __init cpufreq_gx_init(void) +{ + int ret; + struct gxfreq_params *params; + struct pci_dev *gx_pci; + u32 class_rev; + + /* Test if we have the right hardware */ + if ((gx_pci = gx_detect_chipset()) == NULL) + return -ENODEV; + + /* check whether module parameters are sane */ + if (max_duration > 0xff) + max_duration = 0xff; + + dprintk("geode suspend modulation available.\n"); + + params = kmalloc(sizeof(struct gxfreq_params), GFP_KERNEL); + if (params == NULL) + return -ENOMEM; + memset(params, 0, sizeof(struct gxfreq_params)); + + params->cs55x0 = gx_pci; + gx_params = params; + + /* keep cs55x0 configurations */ + pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); + pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); + pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); + pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); + pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration)); + pci_read_config_dword(params->cs55x0, PCI_CLASS_REVISION, &class_rev); + params->pci_rev = class_rev && 0xff; + + if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) { + kfree(params); + return ret; /* register error! 
*/ + } + + return 0; +} + +static void __exit cpufreq_gx_exit(void) +{ + cpufreq_unregister_driver(&gx_suspmod_driver); + pci_dev_put(gx_params->cs55x0); + kfree(gx_params); +} + +MODULE_AUTHOR ("Hiroshi Miura <miura@da-cha.org>"); +MODULE_DESCRIPTION ("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); +MODULE_LICENSE ("GPL"); + +module_init(cpufreq_gx_init); +module_exit(cpufreq_gx_exit); + diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c new file mode 100644 index 000000000000..ab0f9f5aac11 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -0,0 +1,658 @@ +/* + * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk> + * (C) 2002 Padraig Brady. <padraig@antefacto.com> + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by VIA. + * + * VIA have currently 3 different versions of Longhaul. + * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. + * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. + * Version 2 of longhaul is the same as v1, but adds voltage scaling. + * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C) + * voltage scaling support has currently been disabled in this driver + * until we have code that gets it right. + * Version 3 of longhaul got renamed to Powersaver and redesigned + * to use the POWERSAVER MSR at 0x110a. + * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. + * It's pretty much the same feature wise to longhaul v2, though + * there is provision for scaling FSB too, but this doesn't work + * too well in practice so we don't even try to use this. + * + * BIG FAT DISCLAIMER: Work in progress code. 
Possibly *dangerous* + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include <asm/msr.h> +#include <asm/timex.h> +#include <asm/io.h> + +#include "longhaul.h" + +#define PFX "longhaul: " + +#define TYPE_LONGHAUL_V1 1 +#define TYPE_LONGHAUL_V2 2 +#define TYPE_POWERSAVER 3 + +#define CPU_SAMUEL 1 +#define CPU_SAMUEL2 2 +#define CPU_EZRA 3 +#define CPU_EZRA_T 4 +#define CPU_NEHEMIAH 5 + +static int cpu_model; +static unsigned int numscales=16, numvscales; +static unsigned int fsb; +static int minvid, maxvid; +static unsigned int minmult, maxmult; +static int can_scale_voltage; +static int vrmrev; + +/* Module parameters */ +static int dont_scale_voltage; + + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) + + +#define __hlt() __asm__ __volatile__("hlt": : :"memory") + +/* Clock ratios multiplied by 10 */ +static int clock_ratio[32]; +static int eblcr_table[32]; +static int voltage_table[32]; +static unsigned int highest_speed, lowest_speed; /* kHz */ +static int longhaul_version; +static struct cpufreq_frequency_table *longhaul_table; + +#ifdef CONFIG_CPU_FREQ_DEBUG +static char speedbuffer[8]; + +static char *print_speed(int speed) +{ + if (speed > 1000) { + if (speed%1000 == 0) + sprintf (speedbuffer, "%dGHz", speed/1000); + else + sprintf (speedbuffer, "%d.%dGHz", speed/1000, (speed%1000)/100); + } else + sprintf (speedbuffer, "%dMHz", speed); + + return speedbuffer; +} +#endif + + +static unsigned int calc_speed(int mult) +{ + int khz; + khz = (mult/10)*fsb; + if (mult%10) + khz += fsb/2; + khz *= 1000; + return khz; +} + + +static int longhaul_get_cpu_mult(void) +{ + unsigned long invalue=0,lo, hi; + + rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi); + invalue = (lo & (1<<22|1<<23|1<<24|1<<25)) >>22; + if (longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) { + 
if (lo & (1<<27)) + invalue+=16; + } + return eblcr_table[invalue]; +} + + +static void do_powersaver(union msr_longhaul *longhaul, + unsigned int clock_ratio_index) +{ + int version; + + switch (cpu_model) { + case CPU_EZRA_T: + version = 3; + break; + case CPU_NEHEMIAH: + version = 0xf; + break; + default: + return; + } + + rdmsrl(MSR_VIA_LONGHAUL, longhaul->val); + longhaul->bits.SoftBusRatio = clock_ratio_index & 0xf; + longhaul->bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; + longhaul->bits.EnableSoftBusRatio = 1; + longhaul->bits.RevisionKey = 0; + local_irq_disable(); + wrmsrl(MSR_VIA_LONGHAUL, longhaul->val); + local_irq_enable(); + __hlt(); + + rdmsrl(MSR_VIA_LONGHAUL, longhaul->val); + longhaul->bits.EnableSoftBusRatio = 0; + longhaul->bits.RevisionKey = version; + local_irq_disable(); + wrmsrl(MSR_VIA_LONGHAUL, longhaul->val); + local_irq_enable(); +} + +/** + * longhaul_set_cpu_frequency() + * @clock_ratio_index : bitpattern of the new multiplier. + * + * Sets a new clock ratio. + */ + +static void longhaul_setstate(unsigned int clock_ratio_index) +{ + int speed, mult; + struct cpufreq_freqs freqs; + union msr_longhaul longhaul; + union msr_bcr2 bcr2; + static unsigned int old_ratio=-1; + + if (old_ratio == clock_ratio_index) + return; + old_ratio = clock_ratio_index; + + mult = clock_ratio[clock_ratio_index]; + if (mult == -1) + return; + + speed = calc_speed(mult); + if ((speed > highest_speed) || (speed < lowest_speed)) + return; + + freqs.old = calc_speed(longhaul_get_cpu_mult()); + freqs.new = speed; + freqs.cpu = 0; /* longhaul.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", + fsb, mult/10, mult%10, print_speed(speed/1000)); + + switch (longhaul_version) { + + /* + * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) + * Software controlled multipliers only. + * + * *NB* Until we get voltage scaling working v1 & v2 are the same code. 
+ * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5b] and Ezra [C5C] + */ + case TYPE_LONGHAUL_V1: + case TYPE_LONGHAUL_V2: + rdmsrl (MSR_VIA_BCR2, bcr2.val); + /* Enable software clock multiplier */ + bcr2.bits.ESOFTBF = 1; + bcr2.bits.CLOCKMUL = clock_ratio_index; + local_irq_disable(); + wrmsrl (MSR_VIA_BCR2, bcr2.val); + local_irq_enable(); + + __hlt(); + + /* Disable software clock multiplier */ + rdmsrl (MSR_VIA_BCR2, bcr2.val); + bcr2.bits.ESOFTBF = 0; + local_irq_disable(); + wrmsrl (MSR_VIA_BCR2, bcr2.val); + local_irq_enable(); + break; + + /* + * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) + * We can scale voltage with this too, but that's currently + * disabled until we come up with a decent 'match freq to voltage' + * algorithm. + * When we add voltage scaling, we will also need to do the + * voltage/freq setting in order depending on the direction + * of scaling (like we do in powernow-k7.c) + * Nehemiah can do FSB scaling too, but this has never been proven + * to work in practice. + */ + case TYPE_POWERSAVER: + do_powersaver(&longhaul, clock_ratio_index); + break; + } + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +} + +/* + * Centaur decided to make life a little more tricky. + * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. + * Samuel2 and above have to try and guess what the FSB is. + * We do this by assuming we booted at maximum multiplier, and interpolate + * between that value multiplied by possible FSBs and cpu_mhz which + * was calculated at boot time. Really ugly, but no other way to do this. 
+ */ + +#define ROUNDING 0xf + +static int _guess(int guess) +{ + int target; + + target = ((maxmult/10)*guess); + if (maxmult%10 != 0) + target += (guess/2); + target += ROUNDING/2; + target &= ~ROUNDING; + return target; +} + + +static int guess_fsb(void) +{ + int speed = (cpu_khz/1000); + int i; + int speeds[3] = { 66, 100, 133 }; + + speed += ROUNDING/2; + speed &= ~ROUNDING; + + for (i=0; i<3; i++) { + if (_guess(speeds[i]) == speed) + return speeds[i]; + } + return 0; +} + + +static int __init longhaul_get_ranges(void) +{ + unsigned long invalue; + unsigned int multipliers[32]= { + 50,30,40,100,55,35,45,95,90,70,80,60,120,75,85,65, + -1,110,120,-1,135,115,125,105,130,150,160,140,-1,155,-1,145 }; + unsigned int j, k = 0; + union msr_longhaul longhaul; + unsigned long lo, hi; + unsigned int eblcr_fsb_table_v1[] = { 66, 133, 100, -1 }; + unsigned int eblcr_fsb_table_v2[] = { 133, 100, -1, 66 }; + + switch (longhaul_version) { + case TYPE_LONGHAUL_V1: + case TYPE_LONGHAUL_V2: + /* Ugh, Longhaul v1 didn't have the min/max MSRs. + Assume min=3.0x & max = whatever we booted at. */ + minmult = 30; + maxmult = longhaul_get_cpu_mult(); + rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi); + invalue = (lo & (1<<18|1<<19)) >>18; + if (cpu_model==CPU_SAMUEL || cpu_model==CPU_SAMUEL2) + fsb = eblcr_fsb_table_v1[invalue]; + else + fsb = guess_fsb(); + break; + + case TYPE_POWERSAVER: + /* Ezra-T */ + if (cpu_model==CPU_EZRA_T) { + rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); + invalue = longhaul.bits.MaxMHzBR; + if (longhaul.bits.MaxMHzBR4) + invalue += 16; + maxmult=multipliers[invalue]; + + invalue = longhaul.bits.MinMHzBR; + if (longhaul.bits.MinMHzBR4 == 1) + minmult = 30; + else + minmult = multipliers[invalue]; + fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB]; + break; + } + + /* Nehemiah */ + if (cpu_model==CPU_NEHEMIAH) { + rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); + + /* + * TODO: This code works, but raises a lot of questions. 
+ * - Some Nehemiah's seem to have broken Min/MaxMHzBR's. + * We get around this by using a hardcoded multiplier of 4.0x + * for the minimimum speed, and the speed we booted up at for the max. + * This is done in longhaul_get_cpu_mult() by reading the EBLCR register. + * - According to some VIA documentation EBLCR is only + * in pre-Nehemiah C3s. How this still works is a mystery. + * We're possibly using something undocumented and unsupported, + * But it works, so we don't grumble. + */ + minmult=40; + maxmult=longhaul_get_cpu_mult(); + + /* Starting with the 1.2GHz parts, theres a 200MHz bus. */ + if ((cpu_khz/1000) > 1200) + fsb = 200; + else + fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB]; + break; + } + } + + dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n", + minmult/10, minmult%10, maxmult/10, maxmult%10); + + if (fsb == -1) { + printk (KERN_INFO PFX "Invalid (reserved) FSB!\n"); + return -EINVAL; + } + + highest_speed = calc_speed(maxmult); + lowest_speed = calc_speed(minmult); + dprintk ("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, + print_speed(lowest_speed/1000), + print_speed(highest_speed/1000)); + + if (lowest_speed == highest_speed) { + printk (KERN_INFO PFX "highestspeed == lowest, aborting.\n"); + return -EINVAL; + } + if (lowest_speed > highest_speed) { + printk (KERN_INFO PFX "nonsense! 
lowest (%d > %d) !\n", + lowest_speed, highest_speed); + return -EINVAL; + } + + longhaul_table = kmalloc((numscales + 1) * sizeof(struct cpufreq_frequency_table), GFP_KERNEL); + if(!longhaul_table) + return -ENOMEM; + + for (j=0; j < numscales; j++) { + unsigned int ratio; + ratio = clock_ratio[j]; + if (ratio == -1) + continue; + if (ratio > maxmult || ratio < minmult) + continue; + longhaul_table[k].frequency = calc_speed(ratio); + longhaul_table[k].index = j; + k++; + } + + longhaul_table[k].frequency = CPUFREQ_TABLE_END; + if (!k) { + kfree (longhaul_table); + return -EINVAL; + } + + return 0; +} + + +static void __init longhaul_setup_voltagescaling(void) +{ + union msr_longhaul longhaul; + + rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); + + if (!(longhaul.bits.RevisionID & 1)) + return; + + minvid = longhaul.bits.MinimumVID; + maxvid = longhaul.bits.MaximumVID; + vrmrev = longhaul.bits.VRMRev; + + if (minvid == 0 || maxvid == 0) { + printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " + "Voltage scaling disabled.\n", + minvid/1000, minvid%1000, maxvid/1000, maxvid%1000); + return; + } + + if (minvid == maxvid) { + printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are " + "both %d.%03d. Voltage scaling disabled\n", + maxvid/1000, maxvid%1000); + return; + } + + if (vrmrev==0) { + dprintk ("VRM 8.5 \n"); + memcpy (voltage_table, vrm85scales, sizeof(voltage_table)); + numvscales = (voltage_table[maxvid]-voltage_table[minvid])/25; + } else { + dprintk ("Mobile VRM \n"); + memcpy (voltage_table, mobilevrmscales, sizeof(voltage_table)); + numvscales = (voltage_table[maxvid]-voltage_table[minvid])/5; + } + + /* Current voltage isn't readable at first, so we need to + set it to a known value. The spec says to use maxvid */ + longhaul.bits.RevisionKey = longhaul.bits.RevisionID; /* FIXME: This is bad. 
*/ + longhaul.bits.EnableSoftVID = 1; + longhaul.bits.SoftVID = maxvid; + wrmsrl (MSR_VIA_LONGHAUL, longhaul.val); + + minvid = voltage_table[minvid]; + maxvid = voltage_table[maxvid]; + + dprintk ("Min VID=%d.%03d Max VID=%d.%03d, %d possible voltage scales\n", + maxvid/1000, maxvid%1000, minvid/1000, minvid%1000, numvscales); + + can_scale_voltage = 1; +} + + +static int longhaul_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, longhaul_table); +} + + +static int longhaul_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + unsigned int table_index = 0; + unsigned int new_clock_ratio = 0; + + if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, relation, &table_index)) + return -EINVAL; + + new_clock_ratio = longhaul_table[table_index].index & 0xFF; + + longhaul_setstate(new_clock_ratio); + + return 0; +} + + +static unsigned int longhaul_get(unsigned int cpu) +{ + if (cpu) + return 0; + return calc_speed(longhaul_get_cpu_mult()); +} + + +static int __init longhaul_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = cpu_data; + char *cpuname=NULL; + int ret; + + switch (c->x86_model) { + case 6: + cpu_model = CPU_SAMUEL; + cpuname = "C3 'Samuel' [C5A]"; + longhaul_version = TYPE_LONGHAUL_V1; + memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio)); + memcpy (eblcr_table, samuel1_eblcr, sizeof(samuel1_eblcr)); + break; + + case 7: + longhaul_version = TYPE_LONGHAUL_V1; + switch (c->x86_mask) { + case 0: + cpu_model = CPU_SAMUEL2; + cpuname = "C3 'Samuel 2' [C5B]"; + /* Note, this is not a typo, early Samuel2's had Samuel1 ratios. */ + memcpy (clock_ratio, samuel1_clock_ratio, sizeof(samuel1_clock_ratio)); + memcpy (eblcr_table, samuel2_eblcr, sizeof(samuel2_eblcr)); + break; + case 1 ... 
15: + if (c->x86_mask < 8) { + cpu_model = CPU_SAMUEL2; + cpuname = "C3 'Samuel 2' [C5B]"; + } else { + cpu_model = CPU_EZRA; + cpuname = "C3 'Ezra' [C5C]"; + } + memcpy (clock_ratio, ezra_clock_ratio, sizeof(ezra_clock_ratio)); + memcpy (eblcr_table, ezra_eblcr, sizeof(ezra_eblcr)); + break; + } + break; + + case 8: + cpu_model = CPU_EZRA_T; + cpuname = "C3 'Ezra-T' [C5M]"; + longhaul_version = TYPE_POWERSAVER; + numscales=32; + memcpy (clock_ratio, ezrat_clock_ratio, sizeof(ezrat_clock_ratio)); + memcpy (eblcr_table, ezrat_eblcr, sizeof(ezrat_eblcr)); + break; + + case 9: + cpu_model = CPU_NEHEMIAH; + longhaul_version = TYPE_POWERSAVER; + numscales=32; + switch (c->x86_mask) { + case 0 ... 1: + cpuname = "C3 'Nehemiah A' [C5N]"; + memcpy (clock_ratio, nehemiah_a_clock_ratio, sizeof(nehemiah_a_clock_ratio)); + memcpy (eblcr_table, nehemiah_a_eblcr, sizeof(nehemiah_a_eblcr)); + break; + case 2 ... 4: + cpuname = "C3 'Nehemiah B' [C5N]"; + memcpy (clock_ratio, nehemiah_b_clock_ratio, sizeof(nehemiah_b_clock_ratio)); + memcpy (eblcr_table, nehemiah_b_eblcr, sizeof(nehemiah_b_eblcr)); + break; + case 5 ... 15: + cpuname = "C3 'Nehemiah C' [C5N]"; + memcpy (clock_ratio, nehemiah_c_clock_ratio, sizeof(nehemiah_c_clock_ratio)); + memcpy (eblcr_table, nehemiah_c_eblcr, sizeof(nehemiah_c_eblcr)); + break; + } + break; + + default: + cpuname = "Unknown"; + break; + } + + printk (KERN_INFO PFX "VIA %s CPU detected. 
", cpuname); + switch (longhaul_version) { + case TYPE_LONGHAUL_V1: + case TYPE_LONGHAUL_V2: + printk ("Longhaul v%d supported.\n", longhaul_version); + break; + case TYPE_POWERSAVER: + printk ("Powersaver supported.\n"); + break; + }; + + ret = longhaul_get_ranges(); + if (ret != 0) + return ret; + + if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) && + (dont_scale_voltage==0)) + longhaul_setup_voltagescaling(); + + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = calc_speed(longhaul_get_cpu_mult()); + + ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); + if (ret) + return ret; + + cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); + + return 0; +} + +static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static struct freq_attr* longhaul_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver longhaul_driver = { + .verify = longhaul_verify, + .target = longhaul_target, + .get = longhaul_get, + .init = longhaul_cpu_init, + .exit = __devexit_p(longhaul_cpu_exit), + .name = "longhaul", + .owner = THIS_MODULE, + .attr = longhaul_attr, +}; + + +static int __init longhaul_init(void) +{ + struct cpuinfo_x86 *c = cpu_data; + + if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) + return -ENODEV; + + switch (c->x86_model) { + case 6 ... 9: + return cpufreq_register_driver(&longhaul_driver); + default: + printk (KERN_INFO PFX "Unknown VIA CPU. 
Contact davej@codemonkey.org.uk\n"); + } + + return -ENODEV; +} + + +static void __exit longhaul_exit(void) +{ + int i=0; + + for (i=0; i < numscales; i++) { + if (clock_ratio[i] == maxmult) { + longhaul_setstate(i); + break; + } + } + + cpufreq_unregister_driver(&longhaul_driver); + kfree(longhaul_table); +} + +module_param (dont_scale_voltage, int, 0644); +MODULE_PARM_DESC(dont_scale_voltage, "Don't scale voltage of processor"); + +MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); +MODULE_LICENSE ("GPL"); + +module_init(longhaul_init); +module_exit(longhaul_exit); + diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h new file mode 100644 index 000000000000..2a495c162ec7 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h @@ -0,0 +1,466 @@ +/* + * longhaul.h + * (C) 2003 Dave Jones. + * + * Licensed under the terms of the GNU GPL License version 2. + * + * VIA-specific information + */ + +union msr_bcr2 { + struct { + unsigned Reseved:19, // 18:0 + ESOFTBF:1, // 19 + Reserved2:3, // 22:20 + CLOCKMUL:4, // 26:23 + Reserved3:5; // 31:27 + } bits; + unsigned long val; +}; + +union msr_longhaul { + struct { + unsigned RevisionID:4, // 3:0 + RevisionKey:4, // 7:4 + EnableSoftBusRatio:1, // 8 + EnableSoftVID:1, // 9 + EnableSoftBSEL:1, // 10 + Reserved:3, // 11:13 + SoftBusRatio4:1, // 14 + VRMRev:1, // 15 + SoftBusRatio:4, // 19:16 + SoftVID:5, // 24:20 + Reserved2:3, // 27:25 + SoftBSEL:2, // 29:28 + Reserved3:2, // 31:30 + MaxMHzBR:4, // 35:32 + MaximumVID:5, // 40:36 + MaxMHzFSB:2, // 42:41 + MaxMHzBR4:1, // 43 + Reserved4:4, // 47:44 + MinMHzBR:4, // 51:48 + MinimumVID:5, // 56:52 + MinMHzFSB:2, // 58:57 + MinMHzBR4:1, // 59 + Reserved5:4; // 63:60 + } bits; + unsigned long long val; +}; + +/* + * Clock ratio tables. Div/Mod by 10 to get ratio. + * The eblcr ones specify the ratio read from the CPU. 
+ * The clock_ratio ones specify what to write to the CPU. + */ + +/* + * VIA C3 Samuel 1 & Samuel 2 (stepping 0) + */ +static int __initdata samuel1_clock_ratio[16] = { + -1, /* 0000 -> RESERVED */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + -1, /* 0011 -> RESERVED */ + -1, /* 0100 -> RESERVED */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + -1, /* 1110 -> RESERVED */ + -1, /* 1111 -> RESERVED */ +}; + +static int __initdata samuel1_eblcr[16] = { + 50, /* 0000 -> RESERVED */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + -1, /* 0011 -> RESERVED */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + -1, /* 0111 -> RESERVED */ + -1, /* 1000 -> RESERVED */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + -1, /* 1100 -> RESERVED */ + 75, /* 1101 -> 7.5x */ + -1, /* 1110 -> RESERVED */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 Samuel2 Stepping 1->15 + */ +static int __initdata samuel2_eblcr[16] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 110, /* 0111 -> 11.0x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 130, /* 1110 -> 13.0x */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 Ezra + */ +static int __initdata ezra_clock_ratio[16] = { + 100, /* 0000 -> 10.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 
85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ +}; + +static int __initdata ezra_eblcr[16] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 (Ezra-T) [C5M]. + */ +static int __initdata ezrat_clock_ratio[32] = { + 100, /* 0000 -> 10.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + + -1, /* 0000 -> RESERVED (10.0x) */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + -1, /* 0011 -> RESERVED (9.0x)*/ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + -1, /* 1111 -> RESERVED (12.0x) */ +}; + +static int __initdata ezrat_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + + -1, /* 0000 -> RESERVED (9.0x) */ + 110, /* 0001 -> 11.0x */ + 120, /* 
0010 -> 12.0x */ + -1, /* 0011 -> RESERVED (10.0x)*/ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + -1, /* 1100 -> RESERVED (12.0x) */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145, /* 1111 -> 14.5x */ +}; + +/* + * VIA C3 Nehemiah */ + +static int __initdata nehemiah_a_clock_ratio[32] = { + 100, /* 0000 -> 10.0x */ + 160, /* 0001 -> 16.0x */ + -1, /* 0010 -> RESERVED */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + -1, /* 0101 -> RESERVED */ + -1, /* 0110 -> RESERVED */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + 100, /* 0000 -> 10.0x */ + -1, /* 0001 -> RESERVED */ + 120, /* 0010 -> 12.0x */ + 90, /* 0011 -> 9.0x */ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 120, /* 1111 -> 12.0x */ +}; + +static int __initdata nehemiah_b_clock_ratio[32] = { + 100, /* 0000 -> 10.0x */ + 160, /* 0001 -> 16.0x */ + -1, /* 0010 -> RESERVED */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + -1, /* 0101 -> RESERVED */ + -1, /* 0110 -> RESERVED */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + 100, /* 0000 -> 10.0x */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + 90, /* 0011 -> 9.0x */ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 
-> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 120, /* 1111 -> 12.0x */ +}; + +static int __initdata nehemiah_c_clock_ratio[32] = { + 100, /* 0000 -> 10.0x */ + 160, /* 0001 -> 16.0x */ + 40, /* 0010 -> RESERVED */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + -1, /* 0101 -> RESERVED */ + 45, /* 0110 -> RESERVED */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + 100, /* 0000 -> 10.0x */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + 90, /* 0011 -> 9.0x */ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 120, /* 1111 -> 12.0x */ +}; + +static int __initdata nehemiah_a_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 160, /* 0001 -> 16.0x */ + -1, /* 0010 -> RESERVED */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + -1, /* 0101 -> RESERVED */ + -1, /* 0110 -> RESERVED */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + 90, /* 0000 -> 9.0x */ + -1, /* 0001 -> RESERVED */ + 120, /* 0010 -> 12.0x */ + 100, /* 0011 -> 10.0x */ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + 120, /* 1100 -> 
12.0x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145 /* 1111 -> 14.5x */ + /* end of table */ +}; +static int __initdata nehemiah_b_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 160, /* 0001 -> 16.0x */ + -1, /* 0010 -> RESERVED */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + -1, /* 0101 -> RESERVED */ + -1, /* 0110 -> RESERVED */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + 90, /* 0000 -> 9.0x */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + 100, /* 0011 -> 10.0x */ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + 120, /* 1100 -> 12.0x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145 /* 1111 -> 14.5x */ + /* end of table */ +}; +static int __initdata nehemiah_c_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 160, /* 0001 -> 16.0x */ + 40, /* 0010 -> RESERVED */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + -1, /* 0101 -> RESERVED */ + 45, /* 0110 -> RESERVED */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + 90, /* 0000 -> 9.0x */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + 100, /* 0011 -> 10.0x */ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + 120, /* 1100 -> 12.0x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145 /* 1111 -> 14.5x */ + /* end of table */ +}; + +/* + * 
Voltage scales. Div/Mod by 1000 to get actual voltage. + * Which scale to use depends on the VRM type in use. + */ +static int __initdata vrm85scales[32] = { + 1250, 1200, 1150, 1100, 1050, 1800, 1750, 1700, + 1650, 1600, 1550, 1500, 1450, 1400, 1350, 1300, + 1275, 1225, 1175, 1125, 1075, 1825, 1775, 1725, + 1675, 1625, 1575, 1525, 1475, 1425, 1375, 1325, +}; + +static int __initdata mobilevrmscales[32] = { + 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, + 1600, 1550, 1500, 1450, 1400, 1350, 1300, -1, /* steady 50 mV steps down from 2000; entry 12 is 1400, not a second 1500 */ + 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, + 1075, 1050, 1025, 1000, 975, 950, 925, -1, +}; + diff --git a/arch/i386/kernel/cpu/cpufreq/longrun.c b/arch/i386/kernel/cpu/cpufreq/longrun.c new file mode 100644 index 000000000000..e3868de4dc2e --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/longrun.c @@ -0,0 +1,326 @@ +/* + * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/cpufreq.h> + +#include <asm/msr.h> +#include <asm/processor.h> +#include <asm/timex.h> + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longrun", msg) + +static struct cpufreq_driver longrun_driver; + +/** + * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz + * values into per cent values. 
In TMTA microcode, the following is valid: + * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) + */ +static unsigned int longrun_low_freq, longrun_high_freq; + + +/** + * longrun_get_policy - get the current LongRun policy + * @policy: struct cpufreq_policy where current policy is written into + * + * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS + * and MSR_TMTA_LONGRUN_CTRL + */ +static void __init longrun_get_policy(struct cpufreq_policy *policy) +{ + u32 msr_lo, msr_hi; + + rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi); + if (msr_lo & 0x01) /* bit 0 of the flags MSR set: performance, clear: powersave */ + policy->policy = CPUFREQ_POLICY_PERFORMANCE; + else + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi); + msr_lo &= 0x0000007F; + msr_hi &= 0x0000007F; /* only the low 7 bits of each half are used: lower and upper bound in per cent */ + + if ( longrun_high_freq <= longrun_low_freq ) { + /* Assume degenerate Longrun table */ + policy->min = policy->max = longrun_high_freq; + } else { + /* scale the per cent values back into kHz between low and high */ + policy->min = longrun_low_freq + msr_lo * + ((longrun_high_freq - longrun_low_freq) / 100); + policy->max = longrun_low_freq + msr_hi * + ((longrun_high_freq - longrun_low_freq) / 100); + } + policy->cpu = 0; +} + + +/** + * longrun_set_policy - sets a new CPUFreq policy + * @policy: new policy + * + * Sets a new CPUFreq policy on LongRun-capable processors. This function + * has to be called with cpufreq_driver locked. 
+ */ +static int longrun_set_policy(struct cpufreq_policy *policy) +{ + u32 msr_lo, msr_hi; + u32 pctg_lo, pctg_hi; + + if (!policy) + return -EINVAL; + + if ( longrun_high_freq <= longrun_low_freq ) { + /* Assume degenerate Longrun table */ + pctg_lo = pctg_hi = 100; + } else { + /* convert the kHz limits into per cent of the low..high range */ + pctg_lo = (policy->min - longrun_low_freq) / + ((longrun_high_freq - longrun_low_freq) / 100); + pctg_hi = (policy->max - longrun_low_freq) / + ((longrun_high_freq - longrun_low_freq) / 100); + } + + /* cap the upper bound at 100 and never let the lower bound exceed it */ + if (pctg_hi > 100) + pctg_hi = 100; + if (pctg_lo > pctg_hi) + pctg_lo = pctg_hi; + + /* performance or economy mode */ + rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + msr_lo &= 0xFFFFFFFE; /* clear bit 0; it is set again below for performance mode only */ + switch (policy->policy) { + case CPUFREQ_POLICY_PERFORMANCE: + msr_lo |= 0x00000001; + break; + case CPUFREQ_POLICY_POWERSAVE: + break; + } + wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + + /* lower and upper boundary */ + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + msr_lo &= 0xFFFFFF80; /* replace only the low 7 bits of each half, keep the rest as read */ + msr_hi &= 0xFFFFFF80; + msr_lo |= pctg_lo; + msr_hi |= pctg_hi; + wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + + return 0; +} + + +/** + * longrun_verify_policy - verifies a new CPUFreq policy + * @policy: the policy to verify + * + * Validates a new CPUFreq policy. This function has to be called with + * cpufreq_driver locked. 
+ */ +static int longrun_verify_policy(struct cpufreq_policy *policy) +{ + if (!policy) + return -EINVAL; + + policy->cpu = 0; + cpufreq_verify_within_limits(policy, + policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + + if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && + (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) + return -EINVAL; + + return 0; +} + +/* Frequency for CPU 0 in kHz: eax of CPUID 0x80860007 times 1000; any other cpu yields 0. */ +static unsigned int longrun_get(unsigned int cpu) +{ + u32 eax, ebx, ecx, edx; + + if (cpu) + return 0; + + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + dprintk("cpuid eax is %u\n", eax); + + return (eax * 1000); +} + +/** + * longrun_determine_freqs - determines the lowest and highest possible core frequency + * @low_freq: an int to put the lowest frequency into + * @high_freq: an int to put the highest frequency into + * + * Determines the lowest and highest possible core frequencies on this CPU. + * This is necessary to calculate the performance percentage according to + * TMTA rules: + * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) + * + * Returns 0 on success, -EINVAL if either pointer is NULL, and -EIO if the + * percentage or frequency read back cannot be used for the calculation. + */ +static int __init longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) +{ + u32 msr_lo, msr_hi; + u32 save_lo, save_hi; + u32 eax, ebx, ecx, edx; + u32 try_hi; + struct cpuinfo_x86 *c = cpu_data; + + if (!low_freq || !high_freq) + return -EINVAL; + + if (cpu_has(c, X86_FEATURE_LRTI)) { + /* if the LongRun Table Interface is present, the + * detection is a bit easier: + * For minimum frequency, read out the maximum + * level (msr_hi), write that into "currently + * selected level", and read out the frequency. + * For maximum frequency, read out level zero. 
+ */ + /* minimum */ + rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); + wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); + rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); + *low_freq = msr_lo * 1000; /* to kHz */ + + /* maximum */ + wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); + rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); + *high_freq = msr_lo * 1000; /* to kHz */ + + dprintk("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq); + + if (*low_freq > *high_freq) + *low_freq = *high_freq; + return 0; + } + + /* set the upper border to the value determined during TSC init */ + *high_freq = (cpu_khz / 1000); + *high_freq = *high_freq * 1000; + dprintk("high frequency is %u kHz\n", *high_freq); + + /* get current borders */ + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + save_lo = msr_lo & 0x0000007F; + save_hi = msr_hi & 0x0000007F; + + /* if current perf_pctg is larger than 90%, we need to decrease the + * upper limit to make the calculation more accurate. + */ + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + /* try decreasing in 10% steps, some processors react only + * on some barrier values */ + for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -=10) { + /* set to 0 to try_hi perf_pctg */ + msr_lo &= 0xFFFFFF80; + msr_hi &= 0xFFFFFF80; + msr_lo |= 0; + msr_hi |= try_hi; + wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + + /* read out current core MHz and current perf_pctg */ + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + + /* restore values */ + wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); + } + dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax); + + /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) + * equals + * low_freq * ( 1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) + * + * high_freq * perf_pctg is stored temporarily into "ebx". 
+ */ + ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ + + if ((ecx > 95) || (ecx == 0) || (eax < ebx)) + return -EIO; + + /* ecx is in per cent, so (1 - perf_pctg) is (100 - ecx) / 100: scale the numerator by 100 */ + edx = ((eax - ebx) * 100) / (100 - ecx); + *low_freq = edx * 1000; /* back to kHz */ + + dprintk("low frequency is %u kHz\n", *low_freq); + + if (*low_freq > *high_freq) + *low_freq = *high_freq; + + return 0; +} + + +static int __init longrun_cpu_init(struct cpufreq_policy *policy) +{ + int result = 0; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + /* detect low and high frequency */ + result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); + if (result) + return result; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = longrun_low_freq; + policy->cpuinfo.max_freq = longrun_high_freq; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + longrun_get_policy(policy); + + return 0; +} + + +static struct cpufreq_driver longrun_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = longrun_verify_policy, + .setpolicy = longrun_set_policy, + .get = longrun_get, + .init = longrun_cpu_init, + .name = "longrun", + .owner = THIS_MODULE, +}; + + +/** + * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver + * + * Initializes the LongRun support. 
+ */ +static int __init longrun_init(void) +{ + struct cpuinfo_x86 *c = cpu_data; + + if (c->x86_vendor != X86_VENDOR_TRANSMETA || + !cpu_has(c, X86_FEATURE_LONGRUN)) + return -ENODEV; + + return cpufreq_register_driver(&longrun_driver); +} + + +/** + * longrun_exit - unregisters LongRun support + */ +static void __exit longrun_exit(void) +{ + cpufreq_unregister_driver(&longrun_driver); +} + + +MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION ("LongRun driver for Transmeta Crusoe and Efficeon processors."); +MODULE_LICENSE ("GPL"); + +module_init(longrun_init); +module_exit(longrun_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c new file mode 100644 index 000000000000..aa622d52c6e5 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c @@ -0,0 +1,337 @@ +/* + * Pentium 4/Xeon CPU on demand clock modulation/speed scaling + * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> + * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> + * (C) 2002 Arjan van de Ven <arjanv@redhat.com> + * (C) 2002 Tora T. Engstad + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The author(s) of this software shall not be held liable for damages + * of any nature resulting due to the use of this software. This + * software is provided AS-IS with no warranties. 
+ * + * Date Errata Description + * 20020525 N44, O17 12.5% or 25% DC causes lockup + * + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cpufreq.h> +#include <linux/slab.h> +#include <linux/cpumask.h> + +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/timex.h> + +#include "speedstep-lib.h" + +#define PFX "p4-clockmod: " +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "p4-clockmod", msg) + +/* + * Duty Cycle (3bits), note DC_DISABLE is not specified in + * intel docs i just use it to mean disable + */ +enum { + DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, + DC_64PT, DC_75PT, DC_88PT, DC_DISABLE +}; + +#define DC_ENTRIES 8 + + +static int has_N44_O17_errata[NR_CPUS]; +static unsigned int stock_freq; +static struct cpufreq_driver p4clockmod_driver; +static unsigned int cpufreq_p4_get(unsigned int cpu); + +static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) +{ + u32 l, h; + + if (!cpu_online(cpu) || (newstate > DC_DISABLE) || (newstate == DC_RESV)) + return -EINVAL; + + rdmsr(MSR_IA32_THERM_STATUS, l, h); + + if (l & 0x01) + dprintk("CPU#%d currently thermal throttled\n", cpu); + + if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT)) + newstate = DC_38PT; + + rdmsr(MSR_IA32_THERM_CONTROL, l, h); + if (newstate == DC_DISABLE) { + dprintk("CPU#%d disabling modulation\n", cpu); + wrmsr(MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); + } else { + dprintk("CPU#%d setting duty cycle to %d%%\n", + cpu, ((125 * newstate) / 10)); + /* bits 63 - 5 : reserved + * bit 4 : enable/disable + * bits 3-1 : duty cycle + * bit 0 : reserved + */ + l = (l & ~14); + l = l | (1<<4) | ((newstate & 0x7)<<1); + wrmsr(MSR_IA32_THERM_CONTROL, l, h); + } + + return 0; +} + + +static struct cpufreq_frequency_table p4clockmod_table[] = { + {DC_RESV, CPUFREQ_ENTRY_INVALID}, + {DC_DFLT, 0}, + {DC_25PT, 0}, + {DC_38PT, 0}, + 
{DC_50PT, 0}, + {DC_64PT, 0}, + {DC_75PT, 0}, + {DC_88PT, 0}, + {DC_DISABLE, 0}, + {DC_RESV, CPUFREQ_TABLE_END}, +}; + + +static int cpufreq_p4_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = DC_RESV; + struct cpufreq_freqs freqs; + cpumask_t cpus_allowed; + int i; + + if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], target_freq, relation, &newstate)) + return -EINVAL; + + freqs.old = cpufreq_p4_get(policy->cpu); + freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; + + if (freqs.new == freqs.old) + return 0; + + /* notifiers */ + for_each_cpu_mask(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software + * Developer's Manual, Volume 3 + */ + cpus_allowed = current->cpus_allowed; + + for_each_cpu_mask(i, policy->cpus) { + cpumask_t this_cpu = cpumask_of_cpu(i); + + set_cpus_allowed(current, this_cpu); + BUG_ON(smp_processor_id() != i); + + cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); + } + set_cpus_allowed(current, cpus_allowed); + + /* notifiers */ + for_each_cpu_mask(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + + return 0; +} + + +static int cpufreq_p4_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); +} + + +static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) +{ + if ((c->x86 == 0x06) && (c->x86_model == 0x09)) { + /* Pentium M (Banias) */ + printk(KERN_WARNING PFX "Warning: Pentium M detected. " + "The speedstep_centrino module offers voltage scaling" + " in addition of frequency scaling. 
You should use " + "that instead of p4-clockmod, if possible.\n"); + return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM); + } + + if ((c->x86 == 0x06) && (c->x86_model == 0x0D)) { + /* Pentium M (Dothan) */ + printk(KERN_WARNING PFX "Warning: Pentium M detected. " + "The speedstep_centrino module offers voltage scaling" + " in addition of frequency scaling. You should use " + "that instead of p4-clockmod, if possible.\n"); + /* on P-4s, the TSC runs with constant frequency independent whether + * throttling is active or not. */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM); + } + + if (c->x86 != 0xF) { + printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <linux@brodo.de>\n"); + return 0; + } + + /* on P-4s, the TSC runs with constant frequency independent whether + * throttling is active or not. */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + + if (speedstep_detect_processor() == SPEEDSTEP_PROCESSOR_P4M) { + printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " + "The speedstep-ich or acpi cpufreq modules offer " + "voltage scaling in addition of frequency scaling. 
" + "You should use either one instead of p4-clockmod, " + "if possible.\n"); + return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4M); + } + + return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_P4D); +} + + + +static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data[policy->cpu]; + int cpuid = 0; + unsigned int i; + +#ifdef CONFIG_SMP + policy->cpus = cpu_sibling_map[policy->cpu]; +#endif + + /* Errata workaround */ + cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; + switch (cpuid) { + case 0x0f07: + case 0x0f0a: + case 0x0f11: + case 0x0f12: + has_N44_O17_errata[policy->cpu] = 1; + dprintk("has errata -- disabling low frequencies\n"); + } + + /* get max frequency */ + stock_freq = cpufreq_p4_get_frequency(c); + if (!stock_freq) + return -EINVAL; + + /* table init */ + for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { + if ((i<2) && (has_N44_O17_errata[policy->cpu])) + p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; + else + p4clockmod_table[i].frequency = (stock_freq * i)/8; + } + cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); + + /* cpuinfo and default policy values */ + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = 1000000; /* assumed */ + policy->cur = stock_freq; + + return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); +} + + +static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int cpufreq_p4_get(unsigned int cpu) +{ + cpumask_t cpus_allowed; + u32 l, h; + + cpus_allowed = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + BUG_ON(smp_processor_id() != cpu); + + rdmsr(MSR_IA32_THERM_CONTROL, l, h); + + set_cpus_allowed(current, cpus_allowed); + + if (l & 0x10) { + l = l >> 1; + l &= 0x7; + } else + l = DC_DISABLE; + + if (l != DC_DISABLE) + return (stock_freq * l / 
8); + + return stock_freq; +} + +static struct freq_attr* p4clockmod_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver p4clockmod_driver = { + .verify = cpufreq_p4_verify, + .target = cpufreq_p4_target, + .init = cpufreq_p4_cpu_init, + .exit = cpufreq_p4_cpu_exit, + .get = cpufreq_p4_get, + .name = "p4-clockmod", + .owner = THIS_MODULE, + .attr = p4clockmod_attr, +}; + + +static int __init cpufreq_p4_init(void) +{ + struct cpuinfo_x86 *c = cpu_data; + int ret; + + /* + * THERM_CONTROL is architectural for IA32 now, so + * we can rely on the capability checks + */ + if (c->x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (!test_bit(X86_FEATURE_ACPI, c->x86_capability) || + !test_bit(X86_FEATURE_ACC, c->x86_capability)) + return -ENODEV; + + ret = cpufreq_register_driver(&p4clockmod_driver); + if (!ret) + printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock Modulation available\n"); + + return (ret); +} + + +static void __exit cpufreq_p4_exit(void) +{ + cpufreq_unregister_driver(&p4clockmod_driver); +} + + +MODULE_AUTHOR ("Zwane Mwaikambo <zwane@commfireservices.com>"); +MODULE_DESCRIPTION ("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); +MODULE_LICENSE ("GPL"); + +late_initcall(cpufreq_p4_init); +module_exit(cpufreq_p4_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k6.c b/arch/i386/kernel/cpu/cpufreq/powernow-k6.c new file mode 100644 index 000000000000..222f8cfe3c57 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k6.c @@ -0,0 +1,256 @@ +/* + * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) + * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski. + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. 
Possibly *dangerous* + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/ioport.h> +#include <linux/slab.h> + +#include <asm/msr.h> +#include <asm/timex.h> +#include <asm/io.h> + + +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ + +static unsigned int busfreq; /* FSB, in 10 kHz */ +static unsigned int max_multiplier; + + +/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ +static struct cpufreq_frequency_table clock_ratio[] = { + {45, /* 000 -> 4.5x */ 0}, + {50, /* 001 -> 5.0x */ 0}, + {40, /* 010 -> 4.0x */ 0}, + {55, /* 011 -> 5.5x */ 0}, + {20, /* 100 -> 2.0x */ 0}, + {30, /* 101 -> 3.0x */ 0}, + {60, /* 110 -> 6.0x */ 0}, + {35, /* 111 -> 3.5x */ 0}, + {0, CPUFREQ_TABLE_END} +}; + + +/** + * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier + * + * Returns the current setting of the frequency multiplier. Core clock + * speed is frequency of the Front-Side Bus multiplied with this value. + */ +static int powernow_k6_get_cpu_multiplier(void) +{ + u64 invalue = 0; + u32 msrval; + + msrval = POWERNOW_IOPORT + 0x1; + wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ + invalue=inl(POWERNOW_IOPORT + 0x8); + msrval = POWERNOW_IOPORT + 0x0; + wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ + + return clock_ratio[(invalue >> 5)&7].index; +} + + +/** + * powernow_k6_set_state - set the PowerNow! multiplier + * @best_i: clock_ratio[best_i] is the target multiplier + * + * Tries to change the PowerNow! 
multiplier + */ +static void powernow_k6_set_state (unsigned int best_i) +{ + unsigned long outvalue=0, invalue=0; + unsigned long msrval; + struct cpufreq_freqs freqs; + + /* refuse any table entry whose ratio is above the one recorded in max_multiplier */ + if (clock_ratio[best_i].index > max_multiplier) { + printk(KERN_ERR "cpufreq: invalid target frequency\n"); + return; + } + + freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); + freqs.new = busfreq * clock_ratio[best_i].index; + freqs.cpu = 0; /* powernow-k6.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* we now need to transform best_i to the BVC format, see AMD#23446 */ + + outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); /* best_i (the table position) is placed at bit 5 and up */ + + msrval = POWERNOW_IOPORT + 0x1; + wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ + invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = invalue & 0xf; /* carry over the low four bits read back from the port */ + outvalue = outvalue | invalue; + outl(outvalue ,(POWERNOW_IOPORT + 0x8)); + msrval = POWERNOW_IOPORT + 0x0; + wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return; +} + + +/** + * powernow_k6_verify - verifies a new CPUfreq policy + * @policy: new policy + * + * Policy must be within lowest and highest possible CPU Frequency, + * and at least one possible state must be within min and max. 
+ */ +static int powernow_k6_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); +} + + +/** + * powernow_k6_setpolicy - sets a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * sets a new CPUFreq policy + */ +static int powernow_k6_target (struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, &clock_ratio[0], target_freq, relation, &newstate)) + return -EINVAL; + + powernow_k6_set_state(newstate); + + return 0; +} + + +static int powernow_k6_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + int result; + + if (policy->cpu != 0) + return -ENODEV; + + /* get frequencies */ + max_multiplier = powernow_k6_get_cpu_multiplier(); + busfreq = cpu_khz / max_multiplier; + + /* table init */ + for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + if (clock_ratio[i].index > max_multiplier) + clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; + else + clock_ratio[i].frequency = busfreq * clock_ratio[i].index; + } + + /* cpuinfo and default policy values */ + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = busfreq * max_multiplier; + + result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); + if (result) + return (result); + + cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); + + return 0; +} + + +static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int i; + for (i=0; i<8; i++) { + if (i==max_multiplier) + powernow_k6_set_state(i); + } + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int powernow_k6_get(unsigned int cpu) +{ + return busfreq * powernow_k6_get_cpu_multiplier(); +} + +static struct freq_attr* 
powernow_k6_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver powernow_k6_driver = { + .verify = powernow_k6_verify, + .target = powernow_k6_target, + .init = powernow_k6_cpu_init, + .exit = powernow_k6_cpu_exit, + .get = powernow_k6_get, + .name = "powernow-k6", + .owner = THIS_MODULE, + .attr = powernow_k6_attr, +}; + + +/** + * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver + * + * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported + * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero + * on success. + */ +static int __init powernow_k6_init(void) +{ + struct cpuinfo_x86 *c = cpu_data; + + if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || + ((c->x86_model != 12) && (c->x86_model != 13))) + return -ENODEV; + + if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { + printk("cpufreq: PowerNow IOPORT region already used.\n"); + return -EIO; + } + + if (cpufreq_register_driver(&powernow_k6_driver)) { + release_region (POWERNOW_IOPORT, 16); + return -EINVAL; + } + + return 0; +} + + +/** + * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support + * + * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. + */ +static void __exit powernow_k6_exit(void) +{ + cpufreq_unregister_driver(&powernow_k6_driver); + release_region (POWERNOW_IOPORT, 16); +} + + +MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE ("GPL"); + +module_init(powernow_k6_init); +module_exit(powernow_k6_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c new file mode 100644 index 000000000000..913f652623d9 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c @@ -0,0 +1,690 @@ +/* + * AMD K7 Powernow driver. 
 *  (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
 *  (C) 2003-2004 Dave Jones <davej@redhat.com>
 *
 *  Licensed under the terms of the GNU GPL License version 2.
 *  Based upon datasheets & sample CPUs kindly provided by AMD.
 *
 * Errata 5: Processor may fail to execute a FID/VID change in presence of interrupt.
 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
 * Errata 15: Processors with half frequency multipliers may hang upon wakeup from disconnect.
 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
 */

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/dmi.h>

#include <asm/msr.h>
#include <asm/timex.h>
#include <asm/io.h>
#include <asm/system.h>

#ifdef CONFIG_X86_POWERNOW_K7_ACPI
#include <linux/acpi.h>
#include <acpi/processor.h>
#endif

#include "powernow-k7.h"

/* prefix for every message this driver prints */
#define PFX "powernow: "


/*
 * Header of the BIOS performance state block.  powernow_decode_bios()
 * overlays this structure on BIOS memory, so the field order and sizes
 * must not change.
 */
struct psb_s {
	u8 signature[10];	/* matched against "AMDK7PNOW!" */
	u8 tableversion;	/* only 0x12 is accepted */
	u8 flags;		/* bit 0: 0 = mobile, 1 = desktop voltage regulator */
	u16 settlingtime;	/* microseconds; values below 100 are raised to 100 */
	u8 reserved1;
	u8 numpst;		/* number of PST tables following this header */
};

/*
 * Header of one PST table.  It is followed by numpstates (fid, vid) byte
 * pairs, which get_ranges() turns into the cpufreq frequency table.
 */
struct pst_s {
	u32 cpuid;		/* compared with CPUID 0x80000001 EAX */
	u8 fsbspeed;		/* compared with fsb / 1000 by check_fsb() */
	u8 maxfid;		/* compared with the maxfid passed to powernow_decode_bios() */
	u8 startvid;		/* compared with the startvid passed to powernow_decode_bios() */
	u8 numpstates;		/* number of (fid, vid) pairs that follow */
};

#ifdef CONFIG_X86_POWERNOW_K7_ACPI
/*
 * Bit-field view of an ACPI performance state's control value, as
 * decoded in powernow_acpi_init().
 */
union powernow_acpi_control_t {
	struct {
		unsigned long fid:5,
		vid:5,
		sgtc:20,
		res1:2;
	} bits;
	unsigned long val;
};
#endif

#ifdef CONFIG_CPU_FREQ_DEBUG
/*
 * Indexed by VID code; only referenced from debug output.
 * divide by 1000 to get VCore voltage in V.
 */
static int mobile_vid_table[32] = {
    2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
    1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
    1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
    1075, 1050, 1025, 1000, 975, 950, 925, 0,
};
#endif

/* divide by 10 to get FID.
*/ +static int fid_codes[32] = { + 110, 115, 120, 125, 50, 55, 60, 65, + 70, 75, 80, 85, 90, 95, 100, 105, + 30, 190, 40, 200, 130, 135, 140, 210, + 150, 225, 160, 165, 170, 180, -1, -1, +}; + +/* This parameter is used in order to force ACPI instead of legacy method for + * configuration purpose. + */ + +static int acpi_force; + +static struct cpufreq_frequency_table *powernow_table; + +static unsigned int can_scale_bus; +static unsigned int can_scale_vid; +static unsigned int minimum_speed=-1; +static unsigned int maximum_speed; +static unsigned int number_scales; +static unsigned int fsb; +static unsigned int latency; +static char have_a0; + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k7", msg) + +static int check_fsb(unsigned int fsbspeed) +{ + int delta; + unsigned int f = fsb / 1000; + + delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; + return (delta < 5); +} + +static int check_powernow(void) +{ + struct cpuinfo_x86 *c = cpu_data; + unsigned int maxei, eax, ebx, ecx, edx; + + if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) { +#ifdef MODULE + printk (KERN_INFO PFX "This module only works with AMD K7 CPUs\n"); +#endif + return 0; + } + + /* Get maximum capabilities */ + maxei = cpuid_eax (0x80000000); + if (maxei < 0x80000007) { /* Any powernow info ? */ +#ifdef MODULE + printk (KERN_INFO PFX "No powernow capabilities detected\n"); +#endif + return 0; + } + + if ((c->x86_model == 6) && (c->x86_mask == 0)) { + printk (KERN_INFO PFX "K7 660[A0] core detected, enabling errata workarounds\n"); + have_a0 = 1; + } + + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + /* Check we can actually do something before we say anything.*/ + if (!(edx & (1 << 1 | 1 << 2))) + return 0; + + printk (KERN_INFO PFX "PowerNOW! Technology present. 
Can scale: "); + + if (edx & 1 << 1) { + printk ("frequency"); + can_scale_bus=1; + } + + if ((edx & (1 << 1 | 1 << 2)) == 0x6) + printk (" and "); + + if (edx & 1 << 2) { + printk ("voltage"); + can_scale_vid=1; + } + + printk (".\n"); + return 1; +} + + +static int get_ranges (unsigned char *pst) +{ + unsigned int j; + unsigned int speed; + u8 fid, vid; + + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) * (number_scales + 1)), GFP_KERNEL); + if (!powernow_table) + return -ENOMEM; + memset(powernow_table, 0, (sizeof(struct cpufreq_frequency_table) * (number_scales + 1))); + + for (j=0 ; j < number_scales; j++) { + fid = *pst++; + + powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; + powernow_table[j].index = fid; /* lower 8 bits */ + + speed = powernow_table[j].frequency; + + if ((fid_codes[fid] % 10)==5) { +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + if (have_a0 == 1) + powernow_table[j].frequency = CPUFREQ_ENTRY_INVALID; +#endif + } + + if (speed < minimum_speed) + minimum_speed = speed; + if (speed > maximum_speed) + maximum_speed = speed; + + vid = *pst++; + powernow_table[j].index |= (vid << 8); /* upper 8 bits */ + + dprintk (" FID: 0x%x (%d.%dx [%dMHz]) " + "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, + fid_codes[fid] % 10, speed/1000, vid, + mobile_vid_table[vid]/1000, + mobile_vid_table[vid]%1000); + } + powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; + powernow_table[number_scales].index = 0; + + return 0; +} + + +static void change_FID(int fid) +{ + union msr_fidvidctl fidvidctl; + + rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); + if (fidvidctl.bits.FID != fid) { + fidvidctl.bits.SGTC = latency; + fidvidctl.bits.FID = fid; + fidvidctl.bits.VIDC = 0; + fidvidctl.bits.FIDC = 1; + wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); + } +} + + +static void change_VID(int vid) +{ + union msr_fidvidctl fidvidctl; + + rdmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); + if (fidvidctl.bits.VID != vid) { + fidvidctl.bits.SGTC = latency; + 
fidvidctl.bits.VID = vid; + fidvidctl.bits.FIDC = 0; + fidvidctl.bits.VIDC = 1; + wrmsrl (MSR_K7_FID_VID_CTL, fidvidctl.val); + } +} + + +static void change_speed (unsigned int index) +{ + u8 fid, vid; + struct cpufreq_freqs freqs; + union msr_fidvidstatus fidvidstatus; + int cfid; + + /* fid are the lower 8 bits of the index we stored into + * the cpufreq frequency table in powernow_decode_bios, + * vid are the upper 8 bits. + */ + + fid = powernow_table[index].index & 0xFF; + vid = (powernow_table[index].index & 0xFF00) >> 8; + + freqs.cpu = 0; + + rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); + cfid = fidvidstatus.bits.CFID; + freqs.old = fsb * fid_codes[cfid] / 10; + + freqs.new = powernow_table[index].frequency; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Now do the magic poking into the MSRs. */ + + if (have_a0 == 1) /* A0 errata 5 */ + local_irq_disable(); + + if (freqs.old > freqs.new) { + /* Going down, so change FID first */ + change_FID(fid); + change_VID(vid); + } else { + /* Going up, so change VID first */ + change_VID(vid); + change_FID(fid); + } + + + if (have_a0 == 1) + local_irq_enable(); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +} + + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + +static struct acpi_processor_performance *acpi_processor_perf; + +static int powernow_acpi_init(void) +{ + int i; + int retval = 0; + union powernow_acpi_control_t pc; + + if (acpi_processor_perf != NULL && powernow_table != NULL) { + retval = -EINVAL; + goto err0; + } + + acpi_processor_perf = kmalloc(sizeof(struct acpi_processor_performance), + GFP_KERNEL); + + if (!acpi_processor_perf) { + retval = -ENOMEM; + goto err0; + } + + memset(acpi_processor_perf, 0, sizeof(struct acpi_processor_performance)); + + if (acpi_processor_register_performance(acpi_processor_perf, 0)) { + retval = -EIO; + goto err1; + } + + if (acpi_processor_perf->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) { + retval = -ENODEV; + goto err2; + } + 
+ if (acpi_processor_perf->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) { + retval = -ENODEV; + goto err2; + } + + number_scales = acpi_processor_perf->state_count; + + if (number_scales < 2) { + retval = -ENODEV; + goto err2; + } + + powernow_table = kmalloc((number_scales + 1) * (sizeof(struct cpufreq_frequency_table)), GFP_KERNEL); + if (!powernow_table) { + retval = -ENOMEM; + goto err2; + } + + memset(powernow_table, 0, ((number_scales + 1) * sizeof(struct cpufreq_frequency_table))); + + pc.val = (unsigned long) acpi_processor_perf->states[0].control; + for (i = 0; i < number_scales; i++) { + u8 fid, vid; + unsigned int speed; + + pc.val = (unsigned long) acpi_processor_perf->states[i].control; + dprintk ("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", + i, + (u32) acpi_processor_perf->states[i].core_frequency, + (u32) acpi_processor_perf->states[i].power, + (u32) acpi_processor_perf->states[i].transition_latency, + (u32) acpi_processor_perf->states[i].control, + pc.bits.sgtc); + + vid = pc.bits.vid; + fid = pc.bits.fid; + + powernow_table[i].frequency = fsb * fid_codes[fid] / 10; + powernow_table[i].index = fid; /* lower 8 bits */ + powernow_table[i].index |= (vid << 8); /* upper 8 bits */ + + speed = powernow_table[i].frequency; + + if ((fid_codes[fid] % 10)==5) { + if (have_a0 == 1) + powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; + } + + dprintk (" FID: 0x%x (%d.%dx [%dMHz]) " + "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, + fid_codes[fid] % 10, speed/1000, vid, + mobile_vid_table[vid]/1000, + mobile_vid_table[vid]%1000); + + if (latency < pc.bits.sgtc) + latency = pc.bits.sgtc; + + if (speed < minimum_speed) + minimum_speed = speed; + if (speed > maximum_speed) + maximum_speed = speed; + } + + powernow_table[i].frequency = CPUFREQ_TABLE_END; + powernow_table[i].index = 0; + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + return 0; + +err2: + 
acpi_processor_unregister_performance(acpi_processor_perf, 0); +err1: + kfree(acpi_processor_perf); +err0: + printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n"); + acpi_processor_perf = NULL; + return retval; +} +#else +static int powernow_acpi_init(void) +{ + printk(KERN_INFO PFX "no support for ACPI processor found." + " Please recompile your kernel with ACPI processor\n"); + return -EINVAL; +} +#endif + +static int powernow_decode_bios (int maxfid, int startvid) +{ + struct psb_s *psb; + struct pst_s *pst; + unsigned int i, j; + unsigned char *p; + unsigned int etuple; + unsigned int ret; + + etuple = cpuid_eax(0x80000001); + + for (i=0xC0000; i < 0xffff0 ; i+=16) { + + p = phys_to_virt(i); + + if (memcmp(p, "AMDK7PNOW!", 10) == 0){ + dprintk ("Found PSB header at %p\n", p); + psb = (struct psb_s *) p; + dprintk ("Table version: 0x%x\n", psb->tableversion); + if (psb->tableversion != 0x12) { + printk (KERN_INFO PFX "Sorry, only v1.2 tables supported right now\n"); + return -ENODEV; + } + + dprintk ("Flags: 0x%x\n", psb->flags); + if ((psb->flags & 1)==0) { + dprintk ("Mobile voltage regulator\n"); + } else { + dprintk ("Desktop voltage regulator\n"); + } + + latency = psb->settlingtime; + if (latency < 100) { + printk (KERN_INFO PFX "BIOS set settling time to %d microseconds." + "Should be at least 100. Correcting.\n", latency); + latency = 100; + } + dprintk ("Settling Time: %d microseconds.\n", psb->settlingtime); + dprintk ("Has %d PST tables. 
(Only dumping ones relevant to this CPU).\n", psb->numpst); + + p += sizeof (struct psb_s); + + pst = (struct pst_s *) p; + + for (i = 0 ; i <psb->numpst; i++) { + pst = (struct pst_s *) p; + number_scales = pst->numpstates; + + if ((etuple == pst->cpuid) && check_fsb(pst->fsbspeed) && + (maxfid==pst->maxfid) && (startvid==pst->startvid)) + { + dprintk ("PST:%d (@%p)\n", i, pst); + dprintk (" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", + pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); + + ret = get_ranges ((char *) pst + sizeof (struct pst_s)); + return ret; + + } else { + p = (char *) pst + sizeof (struct pst_s); + for (j=0 ; j < number_scales; j++) + p+=2; + } + } + printk (KERN_INFO PFX "No PST tables match this cpuid (0x%x)\n", etuple); + printk (KERN_INFO PFX "This is indicative of a broken BIOS.\n"); + + return -EINVAL; + } + p++; + } + + return -ENODEV; +} + + +static int powernow_target (struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate; + + if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, relation, &newstate)) + return -EINVAL; + + change_speed(newstate); + + return 0; +} + + +static int powernow_verify (struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, powernow_table); +} + +/* + * We use the fact that the bus frequency is somehow + * a multiple of 100000/3 khz, then we compute sgtc according + * to this multiple. + * That way, we match more how AMD thinks all of that work. + * We will then get the same kind of behaviour already tested under + * the "well-known" other OS. 
+ */ +static int __init fixup_sgtc(void) +{ + unsigned int sgtc; + unsigned int m; + + m = fsb / 3333; + if ((m % 10) >= 5) + m += 5; + + m /= 10; + + sgtc = 100 * m * latency; + sgtc = sgtc / 3; + if (sgtc > 0xfffff) { + printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); + sgtc = 0xfffff; + } + return sgtc; +} + +static unsigned int powernow_get(unsigned int cpu) +{ + union msr_fidvidstatus fidvidstatus; + unsigned int cfid; + + if (cpu) + return 0; + rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); + cfid = fidvidstatus.bits.CFID; + + return (fsb * fid_codes[cfid] / 10); +} + + +static int __init acer_cpufreq_pst(struct dmi_system_id *d) +{ + printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident); + printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n"); + printk(KERN_WARNING "cpufreq scaling has been disabled as a result of this.\n"); + return 0; +} + +/* + * Some Athlon laptops have really fucked PST tables. + * A BIOS update is all that can save them. + * Mention this, and disable cpufreq. + */ +static struct dmi_system_id __initdata powernow_dmi_table[] = { + { + .callback = acer_cpufreq_pst, + .ident = "Acer Aspire", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), + DMI_MATCH(DMI_BIOS_VERSION, "3A71"), + }, + }, + { } +}; + +static int __init powernow_cpu_init (struct cpufreq_policy *policy) +{ + union msr_fidvidstatus fidvidstatus; + int result; + + if (policy->cpu != 0) + return -ENODEV; + + rdmsrl (MSR_K7_FID_VID_STATUS, fidvidstatus.val); + + /* A K7 with powernow technology is set to max frequency by BIOS */ + fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.MFID]; + if (!fsb) { + printk(KERN_WARNING PFX "can not determine bus frequency\n"); + return -EINVAL; + } + dprintk("FSB: %3d.%03d MHz\n", fsb/1000, fsb%1000); + + if (dmi_check_system(powernow_dmi_table) || acpi_force) { + printk (KERN_INFO PFX "PSB/PST known to be broken. 
Trying ACPI instead\n"); + result = powernow_acpi_init(); + } else { + result = powernow_decode_bios(fidvidstatus.bits.MFID, fidvidstatus.bits.SVID); + if (result) { + printk (KERN_INFO PFX "Trying ACPI perflib\n"); + maximum_speed = 0; + minimum_speed = -1; + latency = 0; + result = powernow_acpi_init(); + if (result) { + printk (KERN_INFO PFX "ACPI and legacy methods failed\n"); + printk (KERN_INFO PFX "See http://www.codemonkey.org.uk/projects/cpufreq/powernow-k7.shtml\n"); + } + } else { + /* SGTC use the bus clock as timer */ + latency = fixup_sgtc(); + printk(KERN_INFO PFX "SGTC: %d\n", latency); + } + } + + if (result) + return result; + + printk (KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", + minimum_speed/1000, maximum_speed/1000); + + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + + policy->cpuinfo.transition_latency = cpufreq_scale(2000000UL, fsb, latency); + + policy->cur = powernow_get(0); + + cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); + + return cpufreq_frequency_table_cpuinfo(policy, powernow_table); +} + +static int powernow_cpu_exit (struct cpufreq_policy *policy) { + cpufreq_frequency_table_put_attr(policy->cpu); + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + if (acpi_processor_perf) { + acpi_processor_unregister_performance(acpi_processor_perf, 0); + kfree(acpi_processor_perf); + } +#endif + + if (powernow_table) + kfree(powernow_table); + + return 0; +} + +static struct freq_attr* powernow_table_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver powernow_driver = { + .verify = powernow_verify, + .target = powernow_target, + .get = powernow_get, + .init = powernow_cpu_init, + .exit = powernow_cpu_exit, + .name = "powernow-k7", + .owner = THIS_MODULE, + .attr = powernow_table_attr, +}; + +static int __init powernow_init (void) +{ + if (check_powernow()==0) + return -ENODEV; + return cpufreq_register_driver(&powernow_driver); +} + + +static void __exit powernow_exit 
(void) +{ + cpufreq_unregister_driver(&powernow_driver); +} + +module_param(acpi_force, int, 0444); +MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); + +MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); +MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); +MODULE_LICENSE ("GPL"); + +late_initcall(powernow_init); +module_exit(powernow_exit); + diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.h b/arch/i386/kernel/cpu/cpufreq/powernow-k7.h new file mode 100644 index 000000000000..f8a63b3664e3 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.h @@ -0,0 +1,44 @@ +/* + * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $ + * (C) 2003 Dave Jones. + * + * Licensed under the terms of the GNU GPL License version 2. + * + * AMD-specific information + * + */ + +union msr_fidvidctl { + struct { + unsigned FID:5, // 4:0 + reserved1:3, // 7:5 + VID:5, // 12:8 + reserved2:3, // 15:13 + FIDC:1, // 16 + VIDC:1, // 17 + reserved3:2, // 19:18 + FIDCHGRATIO:1, // 20 + reserved4:11, // 31-21 + SGTC:20, // 32:51 + reserved5:12; // 63:52 + } bits; + unsigned long long val; +}; + +union msr_fidvidstatus { + struct { + unsigned CFID:5, // 4:0 + reserved1:3, // 7:5 + SFID:5, // 12:8 + reserved2:3, // 15:13 + MFID:5, // 20:16 + reserved3:11, // 31:21 + CVID:5, // 36:32 + reserved4:3, // 39:37 + SVID:5, // 44:40 + reserved5:3, // 47:45 + MVID:5, // 52:48 + reserved6:11; // 63:53 + } bits; + unsigned long long val; +}; diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c new file mode 100644 index 000000000000..a65ff7e32e5d --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -0,0 +1,1135 @@ +/* + * (c) 2003, 2004 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. 
See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Support : paul.devriendt@amd.com + * + * Based on the powernow-k7.c module written by Dave Jones. + * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs + * (C) 2004 Dominik Brodowski <linux@brodo.de> + * (C) 2004 Pavel Machek <pavel@suse.cz> + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by AMD. + * + * Valuable input gratefully received from Dave Jones, Pavel Machek, + * Dominik Brodowski, and others. + * Processor information obtained from Chapter 9 (Power and Thermal Management) + * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" available for download from www.amd.com + * + * Tables for specific CPUs can be infrerred from + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf + */ + +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/slab.h> +#include <linux/string.h> + +#include <asm/msr.h> +#include <asm/io.h> +#include <asm/delay.h> + +#ifdef CONFIG_X86_POWERNOW_K8_ACPI +#include <linux/acpi.h> +#include <acpi/processor.h> +#endif + +#define PFX "powernow-k8: " +#define BFX PFX "BIOS error: " +#define VERSION "version 1.00.09e" +#include "powernow-k8.h" + +/* serialize freq changes */ +static DECLARE_MUTEX(fidvid_sem); + +static struct powernow_k8_data *powernow_data[NR_CPUS]; + +/* Return a frequency in MHz, given an input fid */ +static u32 find_freq_from_fid(u32 fid) +{ + return 800 + (fid * 100); +} + +/* Return a frequency in KHz, given an input fid */ +static u32 find_khz_freq_from_fid(u32 fid) +{ + return 1000 * find_freq_from_fid(fid); +} + +/* Return a voltage in miliVolts, given an input vid */ +static u32 find_millivolts_from_vid(struct powernow_k8_data *data, u32 vid) +{ + return 1550-vid*25; +} + +/* Return the vco fid 
for an input fid + * + * Each "low" fid has corresponding "high" fid, and you can get to "low" fids + * only from corresponding high fids. This returns "high" fid corresponding to + * "low" one. + */ +static u32 convert_fid_to_vco_fid(u32 fid) +{ + if (fid < HI_FID_TABLE_BOTTOM) { + return 8 + (2 * fid); + } else { + return fid; + } +} + +/* + * Return 1 if the pending bit is set. Unless we just instructed the processor + * to transition to a new state, seeing this bit set is really bad news. + */ +static int pending_bit_stuck(void) +{ + u32 lo, hi; + + rdmsr(MSR_FIDVID_STATUS, lo, hi); + return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; +} + +/* + * Update the global current fid / vid values from the status msr. + * Returns 1 on error. + */ +static int query_current_values_with_pending_wait(struct powernow_k8_data *data) +{ + u32 lo, hi; + u32 i = 0; + + lo = MSR_S_LO_CHANGE_PENDING; + while (lo & MSR_S_LO_CHANGE_PENDING) { + if (i++ > 0x1000000) { + printk(KERN_ERR PFX "detected change pending stuck\n"); + return 1; + } + rdmsr(MSR_FIDVID_STATUS, lo, hi); + } + + data->currvid = hi & MSR_S_HI_CURRENT_VID; + data->currfid = lo & MSR_S_LO_CURRENT_FID; + + return 0; +} + +/* the isochronous relief time */ +static void count_off_irt(struct powernow_k8_data *data) +{ + udelay((1 << data->irt) * 10); + return; +} + +/* the voltage stabalization time */ +static void count_off_vst(struct powernow_k8_data *data) +{ + udelay(data->vstable * VST_UNITS_20US); + return; +} + +/* need to init the control msr to a safe value (for each cpu) */ +static void fidvid_msr_init(void) +{ + u32 lo, hi; + u8 fid, vid; + + rdmsr(MSR_FIDVID_STATUS, lo, hi); + vid = hi & MSR_S_HI_CURRENT_VID; + fid = lo & MSR_S_LO_CURRENT_FID; + lo = fid | (vid << MSR_C_LO_VID_SHIFT); + hi = MSR_C_HI_STP_GNT_BENIGN; + dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); + wrmsr(MSR_FIDVID_CTL, lo, hi); +} + + +/* write the new fid value along with the other control fields to the msr */ +static 
int write_new_fid(struct powernow_k8_data *data, u32 fid) +{ + u32 lo; + u32 savevid = data->currvid; + + if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { + printk(KERN_ERR PFX "internal error - overflow on fid write\n"); + return 1; + } + + lo = fid | (data->currvid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; + + dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", + fid, lo, data->plllock * PLL_LOCK_CONVERSION); + + wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); + + if (query_current_values_with_pending_wait(data)) + return 1; + + count_off_irt(data); + + if (savevid != data->currvid) { + printk(KERN_ERR PFX "vid change on fid trans, old 0x%x, new 0x%x\n", + savevid, data->currvid); + return 1; + } + + if (fid != data->currfid) { + printk(KERN_ERR PFX "fid trans failed, fid 0x%x, curr 0x%x\n", fid, + data->currfid); + return 1; + } + + return 0; +} + +/* Write a new vid to the hardware */ +static int write_new_vid(struct powernow_k8_data *data, u32 vid) +{ + u32 lo; + u32 savefid = data->currfid; + + if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { + printk(KERN_ERR PFX "internal error - overflow on vid write\n"); + return 1; + } + + lo = data->currfid | (vid << MSR_C_LO_VID_SHIFT) | MSR_C_LO_INIT_FID_VID; + + dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", + vid, lo, STOP_GRANT_5NS); + + wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (savefid != data->currfid) { + printk(KERN_ERR PFX "fid changed on vid trans, old 0x%x new 0x%x\n", + savefid, data->currfid); + return 1; + } + + if (vid != data->currvid) { + printk(KERN_ERR PFX "vid trans failed, vid 0x%x, curr 0x%x\n", vid, + data->currvid); + return 1; + } + + return 0; +} + +/* + * Reduce the vid by the max of step or reqvid. + * Decreasing vid codes represent increasing voltages: + * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of 0x1f is off. 
+ */ +static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step) +{ + if ((data->currvid - reqvid) > step) + reqvid = data->currvid - step; + + if (write_new_vid(data, reqvid)) + return 1; + + count_off_vst(data); + + return 0; +} + +/* Change the fid and vid, by the 3 phases. */ +static int transition_fid_vid(struct powernow_k8_data *data, u32 reqfid, u32 reqvid) +{ + if (core_voltage_pre_transition(data, reqvid)) + return 1; + + if (core_frequency_transition(data, reqfid)) + return 1; + + if (core_voltage_post_transition(data, reqvid)) + return 1; + + if (query_current_values_with_pending_wait(data)) + return 1; + + if ((reqfid != data->currfid) || (reqvid != data->currvid)) { + printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, curr 0x%x 0x%x\n", + smp_processor_id(), + reqfid, reqvid, data->currfid, data->currvid); + return 1; + } + + dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", + smp_processor_id(), data->currfid, data->currvid); + + return 0; +} + +/* Phase 1 - core voltage transition ... 
setup voltage */ +static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid) +{ + u32 rvosteps = data->rvo; + u32 savefid = data->currfid; + + dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, reqvid 0x%x, rvo 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid, reqvid, data->rvo); + + while (data->currvid > reqvid) { + dprintk("ph1: curr 0x%x, req vid 0x%x\n", + data->currvid, reqvid); + if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) + return 1; + } + + while ((rvosteps > 0) && ((data->rvo + data->currvid) > reqvid)) { + if (data->currvid == 0) { + rvosteps = 0; + } else { + dprintk("ph1: changing vid for rvo, req 0x%x\n", + data->currvid - 1); + if (decrease_vid_code_by_step(data, data->currvid - 1, 1)) + return 1; + rvosteps--; + } + } + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (savefid != data->currfid) { + printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", data->currfid); + return 1; + } + + dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +/* Phase 2 - core frequency transition */ +static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) +{ + u32 vcoreqfid, vcocurrfid, vcofiddiff, savevid = data->currvid; + + if ((reqfid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { + printk(KERN_ERR PFX "ph2: illegal lo-lo transition 0x%x 0x%x\n", + reqfid, data->currfid); + return 1; + } + + if (data->currfid == reqfid) { + printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", data->currfid); + return 0; + } + + dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, reqfid 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid, reqfid); + + vcoreqfid = convert_fid_to_vco_fid(reqfid); + vcocurrfid = convert_fid_to_vco_fid(data->currfid); + vcofiddiff = vcocurrfid > vcoreqfid ? 
vcocurrfid - vcoreqfid + : vcoreqfid - vcocurrfid; + + while (vcofiddiff > 2) { + if (reqfid > data->currfid) { + if (data->currfid > LO_FID_TABLE_TOP) { + if (write_new_fid(data, data->currfid + 2)) { + return 1; + } + } else { + if (write_new_fid + (data, 2 + convert_fid_to_vco_fid(data->currfid))) { + return 1; + } + } + } else { + if (write_new_fid(data, data->currfid - 2)) + return 1; + } + + vcocurrfid = convert_fid_to_vco_fid(data->currfid); + vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid + : vcoreqfid - vcocurrfid; + } + + if (write_new_fid(data, reqfid)) + return 1; + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (data->currfid != reqfid) { + printk(KERN_ERR PFX + "ph2: mismatch, failed fid transition, curr 0x%x, req 0x%x\n", + data->currfid, reqfid); + return 1; + } + + if (savevid != data->currvid) { + printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", + savevid, data->currvid); + return 1; + } + + dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +/* Phase 3 - core voltage transition flow ... jump to the final vid. 
*/
+/*
+ * Requests reqvid through write_new_vid() when it differs from data->currvid,
+ * then re-queries the hardware and checks that currvid equals reqvid and that
+ * currfid still has the value sampled on entry.
+ * Returns 0 on success, 1 on any helper failure or failed check.
+ */
+static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid)
+{
+	u32 savefid = data->currfid;
+	u32 savereqvid = reqvid;
+
+	dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
+		smp_processor_id(),
+		data->currfid, data->currvid);
+
+	if (reqvid != data->currvid) {
+		if (write_new_vid(data, reqvid))
+			return 1;
+
+		if (savefid != data->currfid) {
+			printk(KERN_ERR PFX
+			       "ph3: bad fid change, save 0x%x, curr 0x%x\n",
+			       savefid, data->currfid);
+			return 1;
+		}
+
+		if (data->currvid != reqvid) {
+			/*
+			 * The newline belongs at the end of the record; it used
+			 * to sit in the middle, splitting the message in two and
+			 * leaving the second half unterminated.
+			 */
+			printk(KERN_ERR PFX
+			       "ph3: failed vid transition, req 0x%x, curr 0x%x\n",
+			       reqvid, data->currvid);
+			return 1;
+		}
+	}
+
+	if (query_current_values_with_pending_wait(data))
+		return 1;
+
+	/* same two checks again, this time against the freshly queried values */
+	if (savereqvid != data->currvid) {
+		dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
+		return 1;
+	}
+
+	if (savefid != data->currfid) {
+		dprintk("ph3 failed, currfid changed 0x%x\n",
+			data->currfid);
+		return 1;
+	}
+
+	dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
+		data->currfid, data->currvid);
+
+	return 0;
+}
+
+static int check_supported_cpu(unsigned int cpu)
+{
+	cpumask_t oldmask = CPU_MASK_ALL;
+	u32 eax, ebx, ecx, edx;
+	unsigned int rc = 0;
+
+	oldmask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	schedule();
+
+	if (smp_processor_id() != cpu) {
+		printk(KERN_ERR "limiting to cpu %u failed\n", cpu);
+		goto out;
+	}
+
+	if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		goto out;
+
+	eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+	if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
+	    ((eax & CPUID_XFAM) != CPUID_XFAM_K8) ||
+	    ((eax & CPUID_XMOD) > CPUID_XMOD_REV_E)) {
+		printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax);
+		goto out;
+	}
+
+	eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
+	if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
+		printk(KERN_INFO PFX
+		       "No frequency change capabilities detected\n");
+		goto out;
+	}
+
+	cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, 
&edx); + if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) { + printk(KERN_INFO PFX "Power state transitions not supported\n"); + goto out; + } + + rc = 1; + +out: + set_cpus_allowed(current, oldmask); + schedule(); + return rc; + +} + +static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) +{ + unsigned int j; + u8 lastfid = 0xff; + + for (j = 0; j < data->numps; j++) { + if (pst[j].vid > LEAST_VID) { + printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); + return -EINVAL; + } + if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ + printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j); + return -ENODEV; + } + if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ + printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j); + return -ENODEV; + } + if ((pst[j].fid > MAX_FID) + || (pst[j].fid & 1) + || (j && (pst[j].fid < HI_FID_TABLE_BOTTOM))) { + /* Only first fid is allowed to be in "low" range */ + printk(KERN_ERR PFX "two low fids - %d : 0x%x\n", j, pst[j].fid); + return -EINVAL; + } + if (pst[j].fid < lastfid) + lastfid = pst[j].fid; + } + if (lastfid & 1) { + printk(KERN_ERR PFX "lastfid invalid\n"); + return -EINVAL; + } + if (lastfid > LO_FID_TABLE_TOP) + printk(KERN_INFO PFX "first fid not from lo freq table\n"); + + return 0; +} + +static void print_basics(struct powernow_k8_data *data) +{ + int j; + for (j = 0; j < data->numps; j++) { + if (data->powernow_table[j].frequency != CPUFREQ_ENTRY_INVALID) + printk(KERN_INFO PFX " %d : fid 0x%x (%d MHz), vid 0x%x (%d mV)\n", j, + data->powernow_table[j].index & 0xff, + data->powernow_table[j].frequency/1000, + data->powernow_table[j].index >> 8, + find_millivolts_from_vid(data, data->powernow_table[j].index >> 8)); + } + if (data->batps) + printk(KERN_INFO PFX "Only %d pstates on battery\n", data->batps); +} + +static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, u8 maxvid) +{ + struct cpufreq_frequency_table 
*powernow_table; + unsigned int j; + + if (data->batps) { /* use ACPI support to get full speed on mains power */ + printk(KERN_WARNING PFX "Only %d pstates usable (use ACPI driver for full range\n", data->batps); + data->numps = data->batps; + } + + for ( j=1; j<data->numps; j++ ) { + if (pst[j-1].fid >= pst[j].fid) { + printk(KERN_ERR PFX "PST out of sequence\n"); + return -EINVAL; + } + } + + if (data->numps < 2) { + printk(KERN_ERR PFX "no p states to transition\n"); + return -ENODEV; + } + + if (check_pst_table(data, pst, maxvid)) + return -EINVAL; + + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->numps + 1)), GFP_KERNEL); + if (!powernow_table) { + printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); + return -ENOMEM; + } + + for (j = 0; j < data->numps; j++) { + powernow_table[j].index = pst[j].fid; /* lower 8 bits */ + powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ + powernow_table[j].frequency = find_khz_freq_from_fid(pst[j].fid); + } + powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; + powernow_table[data->numps].index = 0; + + if (query_current_values_with_pending_wait(data)) { + kfree(powernow_table); + return -EIO; + } + + dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); + data->powernow_table = powernow_table; + print_basics(data); + + for (j = 0; j < data->numps; j++) + if ((pst[j].fid==data->currfid) && (pst[j].vid==data->currvid)) + return 0; + + dprintk("currfid/vid do not match PST, ignoring\n"); + return 0; +} + +/* Find and validate the PSB/PST table in BIOS. */ +static int find_psb_table(struct powernow_k8_data *data) +{ + struct psb_s *psb; + unsigned int i; + u32 mvs; + u8 maxvid; + u32 cpst = 0; + u32 thiscpuid; + + for (i = 0xc0000; i < 0xffff0; i += 0x10) { + /* Scan BIOS looking for the signature. */ + /* It can not be at ffff0 - it is too big. 
*/ + + psb = phys_to_virt(i); + if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) + continue; + + dprintk("found PSB header at 0x%p\n", psb); + + dprintk("table vers: 0x%x\n", psb->tableversion); + if (psb->tableversion != PSB_VERSION_1_4) { + printk(KERN_INFO BFX "PSB table is not v1.4\n"); + return -ENODEV; + } + + dprintk("flags: 0x%x\n", psb->flags1); + if (psb->flags1) { + printk(KERN_ERR BFX "unknown flags\n"); + return -ENODEV; + } + + data->vstable = psb->vstable; + dprintk("voltage stabilization time: %d(*20us)\n", data->vstable); + + dprintk("flags2: 0x%x\n", psb->flags2); + data->rvo = psb->flags2 & 3; + data->irt = ((psb->flags2) >> 2) & 3; + mvs = ((psb->flags2) >> 4) & 3; + data->vidmvs = 1 << mvs; + data->batps = ((psb->flags2) >> 6) & 3; + + dprintk("ramp voltage offset: %d\n", data->rvo); + dprintk("isochronous relief time: %d\n", data->irt); + dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); + + dprintk("numpst: 0x%x\n", psb->num_tables); + cpst = psb->num_tables; + if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0) ){ + thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + if ((thiscpuid == 0x00000fc0) || (thiscpuid == 0x00000fe0) ) { + cpst = 1; + } + } + if (cpst != 1) { + printk(KERN_ERR BFX "numpst must be 1\n"); + return -ENODEV; + } + + data->plllock = psb->plllocktime; + dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); + dprintk("maxfid: 0x%x\n", psb->maxfid); + dprintk("maxvid: 0x%x\n", psb->maxvid); + maxvid = psb->maxvid; + + data->numps = psb->numps; + dprintk("numpstates: 0x%x\n", data->numps); + return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); + } + /* + * If you see this message, complain to BIOS manufacturer. If + * he tells you "we do not support Linux" or some similar + * nonsense, remember that Windows 2000 uses the same legacy + * mechanism that the old Linux PSB driver uses. Tell them it + * is broken with Windows 2000. 
+ * + * The reference to the AMD documentation is chapter 9 in the + * BIOS and Kernel Developer's Guide, which is available on + * www.amd.com + */ + printk(KERN_ERR PFX "BIOS error - no PSB\n"); + return -ENODEV; +} + +#ifdef CONFIG_X86_POWERNOW_K8_ACPI +static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) +{ + if (!data->acpi_data.state_count) + return; + + data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; + data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; +} + +static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) +{ + int i; + int cntlofreq = 0; + struct cpufreq_frequency_table *powernow_table; + + if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { + dprintk("register performance failed\n"); + return -EIO; + } + + /* verify the data contained in the ACPI structures */ + if (data->acpi_data.state_count <= 1) { + dprintk("No ACPI P-States\n"); + goto err_out; + } + + if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + dprintk("Invalid control/status registers (%x - %x)\n", + data->acpi_data.control_register.space_id, + data->acpi_data.status_register.space_id); + goto err_out; + } + + /* fill in data->powernow_table */ + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->acpi_data.state_count + 1)), GFP_KERNEL); + if (!powernow_table) { + dprintk("powernow_table memory alloc failure\n"); + goto err_out; + } + + for (i = 0; i < data->acpi_data.state_count; i++) { + u32 fid = data->acpi_data.states[i].control & FID_MASK; + u32 vid = 
(data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; + + dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); + + powernow_table[i].index = fid; /* lower 8 bits */ + powernow_table[i].index |= (vid << 8); /* upper 8 bits */ + powernow_table[i].frequency = find_khz_freq_from_fid(fid); + + /* verify frequency is OK */ + if ((powernow_table[i].frequency > (MAX_FREQ * 1000)) || + (powernow_table[i].frequency < (MIN_FREQ * 1000))) { + dprintk("invalid freq %u kHz, ignoring\n", powernow_table[i].frequency); + powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; + continue; + } + + /* verify voltage is OK - BIOSs are using "off" to indicate invalid */ + if (vid == 0x1f) { + dprintk("invalid vid %u, ignoring\n", vid); + powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; + continue; + } + + if (fid < HI_FID_TABLE_BOTTOM) { + if (cntlofreq) { + /* if both entries are the same, ignore this + * one... + */ + if ((powernow_table[i].frequency != powernow_table[cntlofreq].frequency) || + (powernow_table[i].index != powernow_table[cntlofreq].index)) { + printk(KERN_ERR PFX "Too many lo freq table entries\n"); + goto err_out_mem; + } + + dprintk("double low frequency table entry, ignoring it.\n"); + powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; + continue; + } else + cntlofreq = i; + } + + if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { + printk(KERN_INFO PFX "invalid freq entries %u kHz vs. 
%u kHz\n", + powernow_table[i].frequency, + (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); + powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; + continue; + } + } + + powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; + powernow_table[data->acpi_data.state_count].index = 0; + data->powernow_table = powernow_table; + + /* fill in data */ + data->numps = data->acpi_data.state_count; + print_basics(data); + powernow_k8_acpi_pst_values(data, 0); + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + return 0; + +err_out_mem: + kfree(powernow_table); + +err_out: + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + + /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ + data->acpi_data.state_count = 0; + + return -ENODEV; +} + +static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) +{ + if (data->acpi_data.state_count) + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); +} + +#else +static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } +static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } +static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } +#endif /* CONFIG_X86_POWERNOW_K8_ACPI */ + +/* Take a frequency, and issue the fid/vid transition command */ +static int transition_frequency(struct powernow_k8_data *data, unsigned int index) +{ + u32 fid; + u32 vid; + int res; + struct cpufreq_freqs freqs; + + dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); + + /* fid are the lower 8 bits of the index we stored into + * the cpufreq frequency table in find_psb_table, vid are + * the upper 8 bits. 
+ */ + + fid = data->powernow_table[index].index & 0xFF; + vid = (data->powernow_table[index].index & 0xFF00) >> 8; + + dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); + + if (query_current_values_with_pending_wait(data)) + return 1; + + if ((data->currvid == vid) && (data->currfid == fid)) { + dprintk("target matches current values (fid 0x%x, vid 0x%x)\n", + fid, vid); + return 0; + } + + if ((fid < HI_FID_TABLE_BOTTOM) && (data->currfid < HI_FID_TABLE_BOTTOM)) { + printk("ignoring illegal change in lo freq table-%x to 0x%x\n", + data->currfid, fid); + return 1; + } + + dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", + smp_processor_id(), fid, vid); + + freqs.cpu = data->cpu; + + freqs.old = find_khz_freq_from_fid(data->currfid); + freqs.new = find_khz_freq_from_fid(fid); + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + down(&fidvid_sem); + res = transition_fid_vid(data, fid, vid); + up(&fidvid_sem); + + freqs.new = find_khz_freq_from_fid(data->currfid); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return res; +} + +/* Driver entry point to switch to the target frequency */ +static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) +{ + cpumask_t oldmask = CPU_MASK_ALL; + struct powernow_k8_data *data = powernow_data[pol->cpu]; + u32 checkfid = data->currfid; + u32 checkvid = data->currvid; + unsigned int newstate; + int ret = -EIO; + + /* only run on specific CPU from here on */ + oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); + schedule(); + + if (smp_processor_id() != pol->cpu) { + printk(KERN_ERR "limiting to cpu %u failed\n", pol->cpu); + goto err_out; + } + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing targ, change pending bit set\n"); + goto err_out; + } + + dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", + pol->cpu, targfreq, pol->min, pol->max, relation); + + if 
(query_current_values_with_pending_wait(data)) { + ret = -EIO; + goto err_out; + } + + dprintk("targ: curr fid 0x%x, vid 0x%x\n", + data->currfid, data->currvid); + + if ((checkvid != data->currvid) || (checkfid != data->currfid)) { + printk(KERN_ERR PFX + "error - out of sync, fid 0x%x 0x%x, vid 0x%x 0x%x\n", + checkfid, data->currfid, checkvid, data->currvid); + } + + if (cpufreq_frequency_table_target(pol, data->powernow_table, targfreq, relation, &newstate)) + goto err_out; + + powernow_k8_acpi_pst_values(data, newstate); + + if (transition_frequency(data, newstate)) { + printk(KERN_ERR PFX "transition frequency failed\n"); + ret = 1; + goto err_out; + } + + pol->cur = find_khz_freq_from_fid(data->currfid); + ret = 0; + +err_out: + set_cpus_allowed(current, oldmask); + schedule(); + + return ret; +} + +/* Driver entry point to verify the policy and range of frequencies */ +static int powernowk8_verify(struct cpufreq_policy *pol) +{ + struct powernow_k8_data *data = powernow_data[pol->cpu]; + + return cpufreq_frequency_table_verify(pol, data->powernow_table); +} + +/* per CPU init entry point to the driver */ +static int __init powernowk8_cpu_init(struct cpufreq_policy *pol) +{ + struct powernow_k8_data *data; + cpumask_t oldmask = CPU_MASK_ALL; + int rc; + + if (!check_supported_cpu(pol->cpu)) + return -ENODEV; + + data = kmalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); + if (!data) { + printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); + return -ENOMEM; + } + memset(data,0,sizeof(struct powernow_k8_data)); + + data->cpu = pol->cpu; + + if (powernow_k8_cpu_init_acpi(data)) { + /* + * Use the PSB BIOS structure. This is only availabe on + * an UP version, and is deprecated by AMD. 
+ */ + + if ((num_online_cpus() != 1) || (num_possible_cpus() != 1)) { + printk(KERN_INFO PFX "MP systems not supported by PSB BIOS structure\n"); + kfree(data); + return -ENODEV; + } + if (pol->cpu != 0) { + printk(KERN_ERR PFX "init not cpu 0\n"); + kfree(data); + return -ENODEV; + } + rc = find_psb_table(data); + if (rc) { + kfree(data); + return -ENODEV; + } + } + + /* only run on specific CPU from here on */ + oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(pol->cpu)); + schedule(); + + if (smp_processor_id() != pol->cpu) { + printk(KERN_ERR "limiting to cpu %u failed\n", pol->cpu); + goto err_out; + } + + if (pending_bit_stuck()) { + printk(KERN_ERR PFX "failing init, change pending bit set\n"); + goto err_out; + } + + if (query_current_values_with_pending_wait(data)) + goto err_out; + + fidvid_msr_init(); + + /* run on any CPU again */ + set_cpus_allowed(current, oldmask); + schedule(); + + pol->governor = CPUFREQ_DEFAULT_GOVERNOR; + + /* Take a crude guess here. 
+	 * That guess was in microseconds, so multiply with 1000 */
+	pol->cpuinfo.transition_latency = (((data->rvo + 8) * data->vstable * VST_UNITS_20US)
+	    + (3 * (1 << data->irt) * 10)) * 1000;
+
+	pol->cur = find_khz_freq_from_fid(data->currfid);
+	dprintk("policy current frequency %d kHz\n", pol->cur);
+
+	/* min/max the cpu is capable of */
+	if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
+		printk(KERN_ERR PFX "invalid powernow_table\n");
+		powernow_k8_cpu_exit_acpi(data);
+		kfree(data->powernow_table);
+		kfree(data);
+		return -EINVAL;
+	}
+
+	cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
+
+	printk("cpu_init done, current fid 0x%x, vid 0x%x\n",
+	       data->currfid, data->currvid);
+
+	powernow_data[pol->cpu] = data;
+
+	return 0;
+
+err_out:
+	set_cpus_allowed(current, oldmask);
+	schedule();
+	powernow_k8_cpu_exit_acpi(data);
+
+	kfree(data);
+	return -ENODEV;
+}
+
+/*
+ * Per-CPU teardown: releases the ACPI registration (if any), the sysfs
+ * frequency-table attribute, the frequency table and the per-CPU data.
+ * Returns 0 on success, -EINVAL if no data was registered for pol->cpu.
+ */
+static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
+{
+	struct powernow_k8_data *data = powernow_data[pol->cpu];
+
+	if (!data)
+		return -EINVAL;
+
+	powernow_k8_cpu_exit_acpi(data);
+
+	cpufreq_frequency_table_put_attr(pol->cpu);
+
+	kfree(data->powernow_table);
+	kfree(data);
+	/* drop the stale pointer so later lookups see "no data" instead of freed memory */
+	powernow_data[pol->cpu] = NULL;
+
+	return 0;
+}
+
+/*
+ * Returns the current frequency of @cpu in kHz, or 0 if it cannot be
+ * determined (no driver data for the CPU, the task could not be bound to
+ * the CPU, or the hardware query failed).
+ */
+static unsigned int powernowk8_get (unsigned int cpu)
+{
+	struct powernow_k8_data *data = powernow_data[cpu];
+	cpumask_t oldmask = current->cpus_allowed;
+	unsigned int khz = 0;
+
+	/* mirror the check in powernowk8_cpu_exit(): this slot may never have been set up */
+	if (!data)
+		return 0;
+
+	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	if (smp_processor_id() != cpu) {
+		/* cpu is unsigned, so %u - same as the other "limiting to cpu" messages */
+		printk(KERN_ERR PFX "limiting to CPU %u failed in powernowk8_get\n", cpu);
+		set_cpus_allowed(current, oldmask);
+		return 0;
+	}
+	preempt_disable();
+
+	if (query_current_values_with_pending_wait(data))
+		goto out;
+
+	khz = find_khz_freq_from_fid(data->currfid);
+
+ out:
+	preempt_enable_no_resched();
+	set_cpus_allowed(current, oldmask);
+
+	return khz;
+}
+
+/* sysfs attributes exported by the driver; NULL-terminated */
+static struct freq_attr* powernow_k8_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+static struct 
cpufreq_driver cpufreq_amd64_driver = { + .verify = powernowk8_verify, + .target = powernowk8_target, + .init = powernowk8_cpu_init, + .exit = __devexit_p(powernowk8_cpu_exit), + .get = powernowk8_get, + .name = "powernow-k8", + .owner = THIS_MODULE, + .attr = powernow_k8_attr, +}; + +/* driver entry point for init */ +static int __init powernowk8_init(void) +{ + unsigned int i, supported_cpus = 0; + + for (i=0; i<NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + if (check_supported_cpu(i)) + supported_cpus++; + } + + if (supported_cpus == num_online_cpus()) { + printk(KERN_INFO PFX "Found %d AMD Athlon 64 / Opteron processors (" VERSION ")\n", + supported_cpus); + return cpufreq_register_driver(&cpufreq_amd64_driver); + } + + return -ENODEV; +} + +/* driver entry point for term */ +static void __exit powernowk8_exit(void) +{ + dprintk("exit\n"); + + cpufreq_unregister_driver(&cpufreq_amd64_driver); +} + +MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com>"); +MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); +MODULE_LICENSE("GPL"); + +late_initcall(powernowk8_init); +module_exit(powernowk8_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h new file mode 100644 index 000000000000..63ebc8470f52 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h @@ -0,0 +1,176 @@ +/* + * (c) 2003, 2004 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. 
See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + */ + +struct powernow_k8_data { + unsigned int cpu; + + u32 numps; /* number of p-states */ + u32 batps; /* number of p-states supported on battery */ + + /* these values are constant when the PSB is used to determine + * vid/fid pairings, but are modified during the ->target() call + * when ACPI is used */ + u32 rvo; /* ramp voltage offset */ + u32 irt; /* isochronous relief time */ + u32 vidmvs; /* usable value calculated from mvs */ + u32 vstable; /* voltage stabilization time, units 20 us */ + u32 plllock; /* pll lock time, units 1 us */ + + /* keep track of the current fid / vid */ + u32 currvid, currfid; + + /* the powernow_table includes all frequency and vid/fid pairings: + * fid are the lower 8 bits of the index, vid are the upper 8 bits. + * frequency is in kHz */ + struct cpufreq_frequency_table *powernow_table; + +#ifdef CONFIG_X86_POWERNOW_K8_ACPI + /* the acpi table needs to be kept. it's only available if ACPI was + * used to determine valid frequency/vid/fid states */ + struct acpi_processor_performance acpi_data; +#endif +}; + + +/* processor's cpuid instruction support */ +#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ +#define CPUID_XFAM 0x0ff00000 /* extended family */ +#define CPUID_XFAM_K8 0 +#define CPUID_XMOD 0x000f0000 /* extended model */ +#define CPUID_XMOD_REV_E 0x00020000 +#define CPUID_USE_XFAM_XMOD 0x00000f00 +#define CPUID_GET_MAX_CAPABILITIES 0x80000000 +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 +#define P_STATE_TRANSITION_CAPABLE 6 + +/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ +/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ +/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ +/* the register number is placed in ecx, and the data is returned in edx:eax. 
*/
+
+#define MSR_FIDVID_CTL      0xc0010041
+#define MSR_FIDVID_STATUS   0xc0010042
+
+/* Field definitions within the FID VID Low Control MSR : */
+#define MSR_C_LO_INIT_FID_VID     0x00010000
+#define MSR_C_LO_NEW_VID          0x00001f00
+/*
+ * The fid field is 6 bits wide: the status mask MSR_S_LO_CURRENT_FID and
+ * FID_MASK are both 0x3f, and INVALID_FID_MASK (0xffffffc1) treats bits 1-5
+ * as valid fid bits.  The previous value 0x2f left out bit 4, so an
+ * otherwise valid fid such as 0x10 would have been masked to 0.
+ */
+#define MSR_C_LO_NEW_FID          0x0000003f
+#define MSR_C_LO_VID_SHIFT        8
+
+/* Field definitions within the FID VID High Control MSR : */
+#define MSR_C_HI_STP_GNT_TO       0x000fffff
+
+/* Field definitions within the FID VID Low Status MSR : */
+#define MSR_S_LO_CHANGE_PENDING   0x80000000	/* cleared when completed */
+#define MSR_S_LO_MAX_RAMP_VID     0x1f000000
+#define MSR_S_LO_MAX_FID          0x003f0000
+#define MSR_S_LO_START_FID        0x00003f00
+#define MSR_S_LO_CURRENT_FID      0x0000003f
+
+/* Field definitions within the FID VID High Status MSR : */
+#define MSR_S_HI_MAX_WORKING_VID  0x001f0000
+#define MSR_S_HI_START_VID        0x00001f00
+#define MSR_S_HI_CURRENT_VID      0x0000001f
+/* NOTE(review): named like a control-register (C_HI) field although it is listed here - confirm */
+#define MSR_C_HI_STP_GNT_BENIGN   0x00000001
+
+/*
+ * There are restrictions frequencies have to follow:
+ * - only 1 entry in the low fid table ( <=1.4GHz )
+ * - lowest entry in the high fid table must be >= 2 * the entry in the
+ *   low fid table
+ * - lowest entry in the high fid table must be <= 200MHz + 2 * the entry
+ *   in the low fid table
+ * - the parts can only step at 200 MHz intervals, so 1.9 GHz is never valid
+ * - lowest frequency must be >= interprocessor hypertransport link speed
+ *   (only applies to MP systems obviously)
+ */
+
+/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
+#define LO_FID_TABLE_TOP     6	/* fid values marking the boundary    */
+#define HI_FID_TABLE_BOTTOM  8	/* between the low and high tables    */
+
+#define LO_VCOFREQ_TABLE_TOP    1400	/* corresponding vco frequency values */
+#define HI_VCOFREQ_TABLE_BOTTOM 1600
+
+#define MIN_FREQ_RESOLUTION  200 /* fids jump by 2 matching freq jumps by 200 */
+
+#define MAX_FID 0x2a	/* Spec only gives FID values as far as 5 GHz */
+#define LEAST_VID 0x1e	/* Lowest (numerically highest) useful vid value */
+
+#define 
MIN_FREQ 800	/* Min and max freqs, per spec */
+#define MAX_FREQ 5000
+
+#define INVALID_FID_MASK 0xffffffc1  /* not a valid fid if these bits are set */
+#define INVALID_VID_MASK 0xffffffe0  /* not a valid vid if these bits are set */
+
+#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
+
+#define PLL_LOCK_CONVERSION (1000/5) /* us to ns (plllock is kept in 1 us units), then divide by clock period */
+
+#define MAXIMUM_VID_STEPS 1  /* Current cpus only allow a single step of 25mV */
+#define VST_UNITS_20US 20   /* Voltage Stabilization Time is in units of 20us */
+
+/*
+ * Most values of interest are encoded in a single field of the _PSS
+ * entries: the "control" value.
+ * Each field is extracted as (control >> x_SHIFT) & x_MASK; the fid sits in
+ * the lowest bits and is taken with FID_MASK alone.
+ */
+
+#define IRT_SHIFT      30
+#define RVO_SHIFT      28
+#define PLL_L_SHIFT    20
+#define MVS_SHIFT      18
+#define VST_SHIFT      11
+#define VID_SHIFT       6
+#define IRT_MASK        3
+#define RVO_MASK        3
+#define PLL_L_MASK   0x7f
+#define MVS_MASK        3
+#define VST_MASK     0x7f
+#define VID_MASK     0x1f
+#define FID_MASK     0x3f
+
+
+/*
+ * Version 1.4 of the PSB table. This table is constructed by BIOS and is
+ * to tell the OS's power management driver which VIDs and FIDs are
+ * supported by this particular processor.
+ * If the data in the PSB / PST is wrong, then this driver will program the
+ * wrong values into hardware, which is very likely to lead to a crash.
+ */
+
+#define PSB_ID_STRING      "AMDK7PNOW!"
+#define PSB_ID_STRING_LEN  10
+
+#define PSB_VERSION_1_4  0x14
+
+struct psb_s {
+	u8 signature[10];	/* compared against PSB_ID_STRING when scanning the BIOS area */
+	u8 tableversion;	/* must equal PSB_VERSION_1_4 */
+	u8 flags1;		/* any non-zero value is rejected as "unknown flags" */
+	u16 vstable;		/* voltage stabilization time, units of 20 us */
+	u8 flags2;		/* bits 0-1 rvo, 2-3 irt, 4-5 mvs, 6-7 batps */
+	u8 num_tables;		/* number of PSTs; the driver requires exactly 1 */
+	u32 cpuid;
+	u8 plllocktime;		/* pll lock time, units of 1 us */
+	u8 maxfid;
+	u8 maxvid;
+	u8 numps;		/* number of fid/vid pairs that follow the header */
+};
+
+/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
+struct pst_s {
+	u8 fid;
+	u8 vid;
+};
+
+#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) + +static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); +static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); + +static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c new file mode 100644 index 000000000000..07d5612dc00f --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c @@ -0,0 +1,715 @@ +/* + * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium + * M (part of the Centrino chipset). + * + * Despite the "SpeedStep" in the name, this is almost entirely unlike + * traditional SpeedStep. + * + * Modelled on speedstep.c + * + * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org> + * + * WARNING WARNING WARNING + * + * This driver manipulates the PERF_CTL MSR, which is only somewhat + * documented. While it seems to work on my laptop, it has not been + * tested anywhere else, and it may not work for you, do strange + * things or simply crash. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/config.h> +#include <linux/delay.h> +#include <linux/compiler.h> + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI +#include <linux/acpi.h> +#include <acpi/processor.h> +#endif + +#include <asm/msr.h> +#include <asm/processor.h> +#include <asm/cpufeature.h> + +#include "speedstep-est-common.h" + +#define PFX "speedstep-centrino: " +#define MAINTAINER "Jeremy Fitzhardinge <jeremy@goop.org>" + +#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) + + +struct cpu_id +{ + __u8 x86; /* CPU family */ + __u8 x86_model; /* model */ + __u8 x86_mask; /* stepping */ +}; + +enum { + CPU_BANIAS, + CPU_DOTHAN_A1, + CPU_DOTHAN_A2, + CPU_DOTHAN_B0, +}; + +static const struct cpu_id cpu_ids[] = { + [CPU_BANIAS] = { 6, 9, 5 }, + [CPU_DOTHAN_A1] = { 6, 13, 1 }, + [CPU_DOTHAN_A2] = { 6, 13, 2 }, + [CPU_DOTHAN_B0] = { 6, 13, 6 }, +}; +#define N_IDS (sizeof(cpu_ids)/sizeof(cpu_ids[0])) + +struct cpu_model +{ + const struct cpu_id *cpu_id; + const char *model_name; + unsigned max_freq; /* max clock in kHz */ + + struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ +}; +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); + +/* Operating points for current CPU */ +static struct cpu_model *centrino_model[NR_CPUS]; +static const struct cpu_id *centrino_cpu[NR_CPUS]; + +static struct cpufreq_driver centrino_driver; + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE + +/* Computes the correct form for IA32_PERF_CTL MSR for a particular + frequency/voltage operating point; frequency in MHz, volts in mV. + This is stored as "index" in the structure. */ +#define OP(mhz, mv) \ + { \ + .frequency = (mhz) * 1000, \ + .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ + } + +/* + * These voltage tables were derived from the Intel Pentium M + * datasheet, document 25261202.pdf, Table 5. I have verified they + * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium + * M. 
+ */ + +/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ +static struct cpufreq_frequency_table banias_900[] = +{ + OP(600, 844), + OP(800, 988), + OP(900, 1004), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ +static struct cpufreq_frequency_table banias_1000[] = +{ + OP(600, 844), + OP(800, 972), + OP(900, 988), + OP(1000, 1004), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ +static struct cpufreq_frequency_table banias_1100[] = +{ + OP( 600, 956), + OP( 800, 1020), + OP( 900, 1100), + OP(1000, 1164), + OP(1100, 1180), + { .frequency = CPUFREQ_TABLE_END } +}; + + +/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ +static struct cpufreq_frequency_table banias_1200[] = +{ + OP( 600, 956), + OP( 800, 1004), + OP( 900, 1020), + OP(1000, 1100), + OP(1100, 1164), + OP(1200, 1180), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.30GHz (Banias) */ +static struct cpufreq_frequency_table banias_1300[] = +{ + OP( 600, 956), + OP( 800, 1260), + OP(1000, 1292), + OP(1200, 1356), + OP(1300, 1388), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.40GHz (Banias) */ +static struct cpufreq_frequency_table banias_1400[] = +{ + OP( 600, 956), + OP( 800, 1180), + OP(1000, 1308), + OP(1200, 1436), + OP(1400, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.50GHz (Banias) */ +static struct cpufreq_frequency_table banias_1500[] = +{ + OP( 600, 956), + OP( 800, 1116), + OP(1000, 1228), + OP(1200, 1356), + OP(1400, 1452), + OP(1500, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.60GHz (Banias) */ +static struct cpufreq_frequency_table banias_1600[] = +{ + OP( 600, 956), + OP( 800, 1036), + OP(1000, 1164), + OP(1200, 1276), + OP(1400, 1420), + OP(1600, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M 
processor 1.70GHz (Banias) */ +static struct cpufreq_frequency_table banias_1700[] = +{ + OP( 600, 956), + OP( 800, 1004), + OP(1000, 1116), + OP(1200, 1228), + OP(1400, 1308), + OP(1700, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; +#undef OP + +#define _BANIAS(cpuid, max, name) \ +{ .cpu_id = cpuid, \ + .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ + .max_freq = (max)*1000, \ + .op_points = banias_##max, \ +} +#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) + +/* CPU models, their operating frequency range, and freq/voltage + operating points */ +static struct cpu_model models[] = +{ + _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), + BANIAS(1000), + BANIAS(1100), + BANIAS(1200), + BANIAS(1300), + BANIAS(1400), + BANIAS(1500), + BANIAS(1600), + BANIAS(1700), + + /* NULL model_name is a wildcard */ + { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, + { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, + { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, + + { NULL, } +}; +#undef _BANIAS +#undef BANIAS + +static int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu]; + struct cpu_model *model; + + for(model = models; model->cpu_id != NULL; model++) + if (centrino_verify_cpu_id(cpu, model->cpu_id) && + (model->model_name == NULL || + strcmp(cpu->x86_model_id, model->model_name) == 0)) + break; + + if (model->cpu_id == NULL) { + /* No match at all */ + dprintk(KERN_INFO PFX "no support for CPU model \"%s\": " + "send /proc/cpuinfo to " MAINTAINER "\n", + cpu->x86_model_id); + return -ENOENT; + } + + if (model->op_points == NULL) { + /* Matched a non-match */ + dprintk(KERN_INFO PFX "no table support for CPU model \"%s\": \n", + cpu->x86_model_id); +#ifndef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI + dprintk(KERN_INFO PFX "try compiling with CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI enabled\n"); +#endif + return -ENOENT; + } + + centrino_model[policy->cpu] = model; + + dprintk("found \"%s\": max frequency: %dkHz\n", 
+ model->model_name, model->max_freq); + + return 0; +} + +#else +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } +#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ + +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) +{ + if ((c->x86 == x->x86) && + (c->x86_model == x->x86_model) && + (c->x86_mask == x->x86_mask)) + return 1; + return 0; +} + +/* To be called only after centrino_model is initialized */ +static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) +{ + int i; + + /* + * Extract clock in kHz from PERF_CTL value + * for centrino, as some DSDTs are buggy. + * Ideally, this can be done using the acpi_data structure. + */ + if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || + (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || + (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { + msr = (msr >> 8) & 0xff; + return msr * 100000; + } + + if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) + return 0; + + msr &= 0xffff; + for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { + if (msr == centrino_model[cpu]->op_points[i].index) + return centrino_model[cpu]->op_points[i].frequency; + } + if (failsafe) + return centrino_model[cpu]->op_points[i-1].frequency; + else + return 0; +} + +/* Return the current CPU frequency in kHz */ +static unsigned int get_cur_freq(unsigned int cpu) +{ + unsigned l, h; + unsigned clock_freq; + cpumask_t saved_mask; + + saved_mask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + if (smp_processor_id() != cpu) + return 0; + + rdmsr(MSR_IA32_PERF_STATUS, l, h); + clock_freq = extract_clock(l, cpu, 0); + + if (unlikely(clock_freq == 0)) { + /* + * On some CPUs, we can see transient MSR values (which are + * not present in _PSS), while CPU is doing some automatic + * P-state transition (like TM2). Get the last freq set + * in PERF_CTL. 
+ */ + rdmsr(MSR_IA32_PERF_CTL, l, h); + clock_freq = extract_clock(l, cpu, 1); + } + + set_cpus_allowed(current, saved_mask); + return clock_freq; +} + + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI + +static struct acpi_processor_performance p; + +/* + * centrino_cpu_init_acpi - register with ACPI P-States library + * + * Register with the ACPI P-States library (part of drivers/acpi/processor.c) + * in order to determine correct frequency and voltage pairings by reading + * the _PSS of the ACPI DSDT or SSDT tables. + */ +static int centrino_cpu_init_acpi(struct cpufreq_policy *policy) +{ + union acpi_object arg0 = {ACPI_TYPE_BUFFER}; + u32 arg0_buf[3]; + struct acpi_object_list arg_list = {1, &arg0}; + unsigned long cur_freq; + int result = 0, i; + unsigned int cpu = policy->cpu; + + /* _PDC settings */ + arg0.buffer.length = 12; + arg0.buffer.pointer = (u8 *) arg0_buf; + arg0_buf[0] = ACPI_PDC_REVISION_ID; + arg0_buf[1] = 1; + arg0_buf[2] = ACPI_PDC_EST_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_MSR; + + p.pdc = &arg_list; + + /* register with ACPI core */ + if (acpi_processor_register_performance(&p, cpu)) { + dprintk(KERN_INFO PFX "obtaining ACPI data failed\n"); + return -EIO; + } + + /* verify the acpi_data */ + if (p.state_count <= 1) { + dprintk("No P-States\n"); + result = -ENODEV; + goto err_unreg; + } + + if ((p.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (p.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + dprintk("Invalid control/status registers (%x - %x)\n", + p.control_register.space_id, p.status_register.space_id); + result = -EIO; + goto err_unreg; + } + + for (i=0; i<p.state_count; i++) { + if (p.states[i].control != p.states[i].status) { + dprintk("Different control (%x) and status values (%x)\n", + p.states[i].control, p.states[i].status); + result = -EINVAL; + goto err_unreg; + } + + if (!p.states[i].core_frequency) { + dprintk("Zero core frequency for state %u\n", i); + result = -EINVAL; + goto err_unreg; + } + 
+ if (p.states[i].core_frequency > p.states[0].core_frequency) { + dprintk("P%u has larger frequency (%u) than P0 (%u), skipping\n", i, + p.states[i].core_frequency, p.states[0].core_frequency); + p.states[i].core_frequency = 0; + continue; + } + } + + centrino_model[cpu] = kmalloc(sizeof(struct cpu_model), GFP_KERNEL); + if (!centrino_model[cpu]) { + result = -ENOMEM; + goto err_unreg; + } + memset(centrino_model[cpu], 0, sizeof(struct cpu_model)); + + centrino_model[cpu]->model_name=NULL; + centrino_model[cpu]->max_freq = p.states[0].core_frequency * 1000; + centrino_model[cpu]->op_points = kmalloc(sizeof(struct cpufreq_frequency_table) * + (p.state_count + 1), GFP_KERNEL); + if (!centrino_model[cpu]->op_points) { + result = -ENOMEM; + goto err_kfree; + } + + for (i=0; i<p.state_count; i++) { + centrino_model[cpu]->op_points[i].index = p.states[i].control; + centrino_model[cpu]->op_points[i].frequency = p.states[i].core_frequency * 1000; + dprintk("adding state %i with frequency %u and control value %04x\n", + i, centrino_model[cpu]->op_points[i].frequency, centrino_model[cpu]->op_points[i].index); + } + centrino_model[cpu]->op_points[p.state_count].frequency = CPUFREQ_TABLE_END; + + cur_freq = get_cur_freq(cpu); + + for (i=0; i<p.state_count; i++) { + if (!p.states[i].core_frequency) { + dprintk("skipping state %u\n", i); + centrino_model[cpu]->op_points[i].frequency = CPUFREQ_ENTRY_INVALID; + continue; + } + + if (extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0) != + (centrino_model[cpu]->op_points[i].frequency)) { + dprintk("Invalid encoded frequency (%u vs. 
%u)\n", + extract_clock(centrino_model[cpu]->op_points[i].index, cpu, 0), + centrino_model[cpu]->op_points[i].frequency); + result = -EINVAL; + goto err_kfree_all; + } + + if (cur_freq == centrino_model[cpu]->op_points[i].frequency) + p.state = i; + } + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + return 0; + + err_kfree_all: + kfree(centrino_model[cpu]->op_points); + err_kfree: + kfree(centrino_model[cpu]); + err_unreg: + acpi_processor_unregister_performance(&p, cpu); + dprintk(KERN_INFO PFX "invalid ACPI data\n"); + return (result); +} +#else +static inline int centrino_cpu_init_acpi(struct cpufreq_policy *policy) { return -ENODEV; } +#endif + +static int centrino_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu]; + unsigned freq; + unsigned l, h; + int ret; + int i; + + /* Only Intel makes Enhanced Speedstep-capable CPUs */ + if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) + return -ENODEV; + + for (i = 0; i < N_IDS; i++) + if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) + break; + + if (i != N_IDS) + centrino_cpu[policy->cpu] = &cpu_ids[i]; + + if (is_const_loops_cpu(policy->cpu)) { + centrino_driver.flags |= CPUFREQ_CONST_LOOPS; + } + + if (centrino_cpu_init_acpi(policy)) { + if (policy->cpu != 0) + return -ENODEV; + + if (!centrino_cpu[policy->cpu]) { + dprintk(KERN_INFO PFX "found unsupported CPU with " + "Enhanced SpeedStep: send /proc/cpuinfo to " + MAINTAINER "\n"); + return -ENODEV; + } + + if (centrino_cpu_init_table(policy)) { + return -ENODEV; + } + } + + /* Check to see if Enhanced SpeedStep is enabled, and try to + enable it if not. 
*/ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + + if (!(l & (1<<16))) { + l |= (1<<16); + dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); + wrmsr(MSR_IA32_MISC_ENABLE, l, h); + + /* check to see if it stuck */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + if (!(l & (1<<16))) { + printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); + return -ENODEV; + } + } + + freq = get_cur_freq(policy->cpu); + + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ + policy->cur = freq; + + dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); + + ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); + if (ret) + return (ret); + + cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); + + return 0; +} + +static int centrino_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + + if (!centrino_model[cpu]) + return -ENODEV; + + cpufreq_frequency_table_put_attr(cpu); + +#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI + if (!centrino_model[cpu]->model_name) { + dprintk("unregistering and freeing ACPI data\n"); + acpi_processor_unregister_performance(&p, cpu); + kfree(centrino_model[cpu]->op_points); + kfree(centrino_model[cpu]); + } +#endif + + centrino_model[cpu] = NULL; + + return 0; +} + +/** + * centrino_verify - verifies a new CPUFreq policy + * @policy: new policy + * + * Limit must be within this model's frequency range at least one + * border included. + */ +static int centrino_verify (struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); +} + +/** + * centrino_setpolicy - set a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * Sets a new CPUFreq policy. 
+ */ +static int centrino_target (struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + unsigned int msr, oldmsr, h, cpu = policy->cpu; + struct cpufreq_freqs freqs; + cpumask_t saved_mask; + int retval; + + if (centrino_model[cpu] == NULL) + return -ENODEV; + + /* + * Support for SMP systems. + * Make sure we are running on the CPU that wants to change frequency + */ + saved_mask = current->cpus_allowed; + set_cpus_allowed(current, policy->cpus); + if (!cpu_isset(smp_processor_id(), policy->cpus)) { + dprintk("couldn't limit to CPUs in this domain\n"); + return(-EAGAIN); + } + + if (cpufreq_frequency_table_target(policy, centrino_model[cpu]->op_points, target_freq, + relation, &newstate)) { + retval = -EINVAL; + goto migrate_end; + } + + msr = centrino_model[cpu]->op_points[newstate].index; + rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); + + if (msr == (oldmsr & 0xffff)) { + retval = 0; + dprintk("no change needed - msr was and needs to be %x\n", oldmsr); + goto migrate_end; + } + + freqs.cpu = cpu; + freqs.old = extract_clock(oldmsr, cpu, 0); + freqs.new = extract_clock(msr, cpu, 0); + + dprintk("target=%dkHz old=%d new=%d msr=%04x\n", + target_freq, freqs.old, freqs.new, msr); + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* all but 16 LSB are "reserved", so treat them with + care */ + oldmsr &= ~0xffff; + msr &= 0xffff; + oldmsr |= msr; + + wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + retval = 0; +migrate_end: + set_cpus_allowed(current, saved_mask); + return (retval); +} + +static struct freq_attr* centrino_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver centrino_driver = { + .name = "centrino", /* should be speedstep-centrino, + but there's a 16 char limit */ + .init = centrino_cpu_init, + .exit = centrino_cpu_exit, + .verify = centrino_verify, + .target = centrino_target, + .get = 
get_cur_freq, + .attr = centrino_attr, + .owner = THIS_MODULE, +}; + + +/** + * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver + * + * Initializes the Enhanced SpeedStep support. Returns -ENODEV on + * unsupported devices, -ENOENT if there's no voltage table for this + * particular CPU model, -EINVAL on problems during initiatization, + * and zero on success. + * + * This is quite picky. Not only does the CPU have to advertise the + * "est" flag in the cpuid capability flags, we look for a specific + * CPU model and stepping, and we need to have the exact model name in + * our voltage tables. That is, be paranoid about not releasing + * someone's valuable magic smoke. + */ +static int __init centrino_init(void) +{ + struct cpuinfo_x86 *cpu = cpu_data; + + if (!cpu_has(cpu, X86_FEATURE_EST)) + return -ENODEV; + + return cpufreq_register_driver(¢rino_driver); +} + +static void __exit centrino_exit(void) +{ + cpufreq_unregister_driver(¢rino_driver); +} + +MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>"); +MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); +MODULE_LICENSE ("GPL"); + +late_initcall(centrino_init); +module_exit(centrino_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-est-common.h b/arch/i386/kernel/cpu/cpufreq/speedstep-est-common.h new file mode 100644 index 000000000000..5ce995c9d866 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-est-common.h @@ -0,0 +1,25 @@ +/* + * Routines common for drivers handling Enhanced Speedstep Technology + * Copyright (C) 2004 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * + * Licensed under the terms of the GNU GPL License version 2 -- see + * COPYING for details. 
+ */ + +static inline int is_const_loops_cpu(unsigned int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + + if (c->x86_vendor != X86_VENDOR_INTEL || !cpu_has(c, X86_FEATURE_EST)) + return 0; + + /* + * on P-4s, the TSC runs with constant frequency independent of cpu freq + * when we use EST + */ + if (c->x86 == 0xf) + return 1; + + return 0; +} + diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c new file mode 100644 index 000000000000..5b7d18a06afa --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-ich.c @@ -0,0 +1,424 @@ +/* + * (C) 2001 Dave Jones, Arjan van de ven. + * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon reverse engineered information, and on Intel documentation + * for chipsets ICH2-M and ICH3-M. + * + * Many thanks to Ducrot Bruno for finding and fixing the last + * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler + * for extensive testing. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + + +/********************************************************************* + * SPEEDSTEP - DEFINITIONS * + *********************************************************************/ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/pci.h> +#include <linux/slab.h> + +#include "speedstep-lib.h" + + +/* speedstep_chipset: + * It is necessary to know which chipset is used. As accesses to + * this device occur at various places in this module, we need a + * static struct pci_dev * pointing to that device. + */ +static struct pci_dev *speedstep_chipset_dev; + + +/* speedstep_processor + */ +static unsigned int speedstep_processor = 0; + + +/* + * There are only two frequency states for each processor. Values + * are in kHz for the time being. 
+ */ +static struct cpufreq_frequency_table speedstep_freqs[] = { + {SPEEDSTEP_HIGH, 0}, + {SPEEDSTEP_LOW, 0}, + {0, CPUFREQ_TABLE_END}, +}; + + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-ich", msg) + + +/** + * speedstep_set_state - set the SpeedStep state + * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) + * + * Tries to change the SpeedStep state. + */ +static void speedstep_set_state (unsigned int state) +{ + u32 pmbase; + u8 pm2_blk; + u8 value; + unsigned long flags; + + if (!speedstep_chipset_dev || (state > 0x1)) + return; + + /* get PMBASE */ + pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase); + if (!(pmbase & 0x01)) { + printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); + return; + } + + pmbase &= 0xFFFFFFFE; + if (!pmbase) { + printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); + return; + } + + /* Disable IRQs */ + local_irq_save(flags); + + /* read state */ + value = inb(pmbase + 0x50); + + dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + + /* write new state */ + value &= 0xFE; + value |= state; + + dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); + + /* Disable bus master arbitration */ + pm2_blk = inb(pmbase + 0x20); + pm2_blk |= 0x01; + outb(pm2_blk, (pmbase + 0x20)); + + /* Actual transition */ + outb(value, (pmbase + 0x50)); + + /* Restore bus master arbitration */ + pm2_blk &= 0xfe; + outb(pm2_blk, (pmbase + 0x20)); + + /* check if transition was successful */ + value = inb(pmbase + 0x50); + + /* Enable IRQs */ + local_irq_restore(flags); + + dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + + if (state == (value & 0x1)) { + dprintk("change to %u MHz succeeded\n", (speedstep_get_processor_frequency(speedstep_processor) / 1000)); + } else { + printk (KERN_ERR "cpufreq: change failed - I/O error\n"); + } + + return; +} + + +/** + * speedstep_activate - activate SpeedStep control 
in the chipset + * + * Tries to activate the SpeedStep status and control registers. + * Returns -EINVAL on an unsupported chipset, and zero on success. + */ +static int speedstep_activate (void) +{ + u16 value = 0; + + if (!speedstep_chipset_dev) + return -EINVAL; + + pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); + if (!(value & 0x08)) { + value |= 0x08; + dprintk("activating SpeedStep (TM) registers\n"); + pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); + } + + return 0; +} + + +/** + * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic + * + * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to + * the LPC bridge / PM module which contains all power-management + * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected + * chipset, or zero on failure. + */ +static unsigned int speedstep_detect_chipset (void) +{ + speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82801DB_12, + PCI_ANY_ID, + PCI_ANY_ID, + NULL); + if (speedstep_chipset_dev) + return 4; /* 4-M */ + + speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82801CA_12, + PCI_ANY_ID, + PCI_ANY_ID, + NULL); + if (speedstep_chipset_dev) + return 3; /* 3-M */ + + + speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82801BA_10, + PCI_ANY_ID, + PCI_ANY_ID, + NULL); + if (speedstep_chipset_dev) { + /* speedstep.c causes lockups on Dell Inspirons 8000 and + * 8100 which use a pretty old revision of the 82815 + * host brige. Abort on these systems. 
+ */ + static struct pci_dev *hostbridge; + u8 rev = 0; + + hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82815_MC, + PCI_ANY_ID, + PCI_ANY_ID, + NULL); + + if (!hostbridge) + return 2; /* 2-M */ + + pci_read_config_byte(hostbridge, PCI_REVISION_ID, &rev); + if (rev < 5) { + dprintk("hostbridge does not support speedstep\n"); + speedstep_chipset_dev = NULL; + pci_dev_put(hostbridge); + return 0; + } + + pci_dev_put(hostbridge); + return 2; /* 2-M */ + } + + return 0; +} + +static unsigned int _speedstep_get(cpumask_t cpus) +{ + unsigned int speed; + cpumask_t cpus_allowed; + + cpus_allowed = current->cpus_allowed; + set_cpus_allowed(current, cpus); + speed = speedstep_get_processor_frequency(speedstep_processor); + set_cpus_allowed(current, cpus_allowed); + dprintk("detected %u kHz as current frequency\n", speed); + return speed; +} + +static unsigned int speedstep_get(unsigned int cpu) +{ + return _speedstep_get(cpumask_of_cpu(cpu)); +} + +/** + * speedstep_target - set a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * Sets a new CPUFreq policy. 
+ */ +static int speedstep_target (struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + struct cpufreq_freqs freqs; + cpumask_t cpus_allowed; + int i; + + if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) + return -EINVAL; + + freqs.old = _speedstep_get(policy->cpus); + freqs.new = speedstep_freqs[newstate].frequency; + freqs.cpu = policy->cpu; + + dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new); + + /* no transition necessary */ + if (freqs.old == freqs.new) + return 0; + + cpus_allowed = current->cpus_allowed; + + for_each_cpu_mask(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + /* switch to physical CPU where state is to be changed */ + set_cpus_allowed(current, policy->cpus); + + speedstep_set_state(newstate); + + /* allow to be run on all CPUs */ + set_cpus_allowed(current, cpus_allowed); + + for_each_cpu_mask(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + + return 0; +} + + +/** + * speedstep_verify - verifies a new CPUFreq policy + * @policy: new policy + * + * Limit must be within speedstep_low_freq and speedstep_high_freq, with + * at least one border included. 
+ */ +static int speedstep_verify (struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); +} + + +static int speedstep_cpu_init(struct cpufreq_policy *policy) +{ + int result = 0; + unsigned int speed; + cpumask_t cpus_allowed; + + /* only run on CPU to be set, or on its sibling */ +#ifdef CONFIG_SMP + policy->cpus = cpu_sibling_map[policy->cpu]; +#endif + + cpus_allowed = current->cpus_allowed; + set_cpus_allowed(current, policy->cpus); + + /* detect low and high frequency */ + result = speedstep_get_freqs(speedstep_processor, + &speedstep_freqs[SPEEDSTEP_LOW].frequency, + &speedstep_freqs[SPEEDSTEP_HIGH].frequency, + &speedstep_set_state); + set_cpus_allowed(current, cpus_allowed); + if (result) + return result; + + /* get current speed setting */ + speed = _speedstep_get(policy->cpus); + if (!speed) + return -EIO; + + dprintk("currently at %s speed setting - %i MHz\n", + (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", + (speed / 1000)); + + /* cpuinfo and default policy values */ + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = speed; + + result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); + if (result) + return (result); + + cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); + + return 0; +} + + +static int speedstep_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static struct freq_attr* speedstep_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + + +static struct cpufreq_driver speedstep_driver = { + .name = "speedstep-ich", + .verify = speedstep_verify, + .target = speedstep_target, + .init = speedstep_cpu_init, + .exit = speedstep_cpu_exit, + .get = speedstep_get, + .owner = THIS_MODULE, + .attr = speedstep_attr, +}; + + +/** + * speedstep_init - initializes the SpeedStep CPUFreq driver + * + * Initializes the 
SpeedStep support. Returns -ENODEV on unsupported + * devices, -EINVAL on problems during initiatization, and zero on + * success. + */ +static int __init speedstep_init(void) +{ + /* detect processor */ + speedstep_processor = speedstep_detect_processor(); + if (!speedstep_processor) { + dprintk("Intel(R) SpeedStep(TM) capable processor not found\n"); + return -ENODEV; + } + + /* detect chipset */ + if (!speedstep_detect_chipset()) { + dprintk("Intel(R) SpeedStep(TM) for this chipset not (yet) available.\n"); + return -ENODEV; + } + + /* activate speedstep support */ + if (speedstep_activate()) { + pci_dev_put(speedstep_chipset_dev); + return -EINVAL; + } + + return cpufreq_register_driver(&speedstep_driver); +} + + +/** + * speedstep_exit - unregisters SpeedStep support + * + * Unregisters SpeedStep support. + */ +static void __exit speedstep_exit(void) +{ + pci_dev_put(speedstep_chipset_dev); + cpufreq_unregister_driver(&speedstep_driver); +} + + +MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); +MODULE_LICENSE ("GPL"); + +module_init(speedstep_init); +module_exit(speedstep_exit); diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c new file mode 100644 index 000000000000..8ba430a9c3a2 --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c @@ -0,0 +1,385 @@ +/* + * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> + * + * Licensed under the terms of the GNU GPL License version 2. + * + * Library for common functions for Intel SpeedStep v.1 and v.2 support + * + * BIG FAT DISCLAIMER: Work in progress code. 
Possibly *dangerous* + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/pci.h> +#include <linux/slab.h> + +#include <asm/msr.h> +#include "speedstep-lib.h" + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-lib", msg) + +#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK +static int relaxed_check = 0; +#else +#define relaxed_check 0 +#endif + +/********************************************************************* + * GET PROCESSOR CORE SPEED IN KHZ * + *********************************************************************/ + +static unsigned int pentium3_get_frequency (unsigned int processor) +{ + /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ + struct { + unsigned int ratio; /* Frequency Multiplier (x10) */ + u8 bitmap; /* power on configuration bits + [27, 25:22] (in MSR 0x2a) */ + } msr_decode_mult [] = { + { 30, 0x01 }, + { 35, 0x05 }, + { 40, 0x02 }, + { 45, 0x06 }, + { 50, 0x00 }, + { 55, 0x04 }, + { 60, 0x0b }, + { 65, 0x0f }, + { 70, 0x09 }, + { 75, 0x0d }, + { 80, 0x0a }, + { 85, 0x26 }, + { 90, 0x20 }, + { 100, 0x2b }, + { 0, 0xff } /* error or unknown value */ + }; + + /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ + struct { + unsigned int value; /* Front Side Bus speed in MHz */ + u8 bitmap; /* power on configuration bits [18: 19] + (in MSR 0x2a) */ + } msr_decode_fsb [] = { + { 66, 0x0 }, + { 100, 0x2 }, + { 133, 0x1 }, + { 0, 0xff} + }; + + u32 msr_lo, msr_tmp; + int i = 0, j = 0; + + /* read MSR 0x2a - we only need the low 32 bits */ + rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); + dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + msr_tmp = msr_lo; + + /* decode the FSB */ + msr_tmp &= 0x00c0000; + msr_tmp >>= 18; + while (msr_tmp != msr_decode_fsb[i].bitmap) { + if (msr_decode_fsb[i].bitmap == 0xff) + return 0; + i++; + } + + /* decode the multiplier */ + if 
(processor == SPEEDSTEP_PROCESSOR_PIII_C_EARLY) { + dprintk("workaround for early PIIIs\n"); + msr_lo &= 0x03c00000; + } else + msr_lo &= 0x0bc00000; + msr_lo >>= 22; + while (msr_lo != msr_decode_mult[j].bitmap) { + if (msr_decode_mult[j].bitmap == 0xff) + return 0; + j++; + } + + dprintk("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); + + return (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100); +} + + +static unsigned int pentiumM_get_frequency(void) +{ + u32 msr_lo, msr_tmp; + + rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); + dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + + /* see table B-2 of 24547212.pdf */ + if (msr_lo & 0x00040000) { + printk(KERN_DEBUG "speedstep-lib: PM - invalid FSB: 0x%x 0x%x\n", msr_lo, msr_tmp); + return 0; + } + + msr_tmp = (msr_lo >> 22) & 0x1f; + dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000)); + + return (msr_tmp * 100 * 1000); +} + + +static unsigned int pentium4_get_frequency(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 msr_lo, msr_hi, mult; + unsigned int fsb = 0; + + rdmsr(0x2c, msr_lo, msr_hi); + + dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); + + /* decode the FSB: see IA-32 Intel (C) Architecture Software + * Developer's Manual, Volume 3: System Prgramming Guide, + * revision #12 in Table B-1: MSRs in the Pentium 4 and + * Intel Xeon Processors, on page B-4 and B-5. + */ + if (c->x86_model < 2) + fsb = 100 * 1000; + else { + u8 fsb_code = (msr_lo >> 16) & 0x7; + switch (fsb_code) { + case 0: + fsb = 100 * 1000; + break; + case 1: + fsb = 13333 * 10; + break; + case 2: + fsb = 200 * 1000; + break; + } + } + + if (!fsb) + printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n"); + + /* Multiplier. 
*/ + if (c->x86_model < 2) + mult = msr_lo >> 27; + else + mult = msr_lo >> 24; + + dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); + + return (fsb * mult); +} + + +unsigned int speedstep_get_processor_frequency(unsigned int processor) +{ + switch (processor) { + case SPEEDSTEP_PROCESSOR_PM: + return pentiumM_get_frequency(); + case SPEEDSTEP_PROCESSOR_P4D: + case SPEEDSTEP_PROCESSOR_P4M: + return pentium4_get_frequency(); + case SPEEDSTEP_PROCESSOR_PIII_T: + case SPEEDSTEP_PROCESSOR_PIII_C: + case SPEEDSTEP_PROCESSOR_PIII_C_EARLY: + return pentium3_get_frequency(processor); + default: + return 0; + }; + return 0; +} +EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency); + + +/********************************************************************* + * DETECT SPEEDSTEP-CAPABLE PROCESSOR * + *********************************************************************/ + +unsigned int speedstep_detect_processor (void) +{ + struct cpuinfo_x86 *c = cpu_data; + u32 ebx, msr_lo, msr_hi; + + dprintk("x86: %x, model: %x\n", c->x86, c->x86_model); + + if ((c->x86_vendor != X86_VENDOR_INTEL) || + ((c->x86 != 6) && (c->x86 != 0xF))) + return 0; + + if (c->x86 == 0xF) { + /* Intel Mobile Pentium 4-M + * or Intel Mobile Pentium 4 with 533 MHz FSB */ + if (c->x86_model != 2) + return 0; + + ebx = cpuid_ebx(0x00000001); + ebx &= 0x000000FF; + + dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); + + switch (c->x86_mask) { + case 4: + /* + * B-stepping [M-P4-M] + * sample has ebx = 0x0f, production has 0x0e. + */ + if ((ebx == 0x0e) || (ebx == 0x0f)) + return SPEEDSTEP_PROCESSOR_P4M; + break; + case 7: + /* + * C-stepping [M-P4-M] + * needs to have ebx=0x0e, else it's a celeron: + * cf. 25130917.pdf / page 7, footnote 5 even + * though 25072120.pdf / page 7 doesn't say + * samples are only of B-stepping... 
+ */ + if (ebx == 0x0e) + return SPEEDSTEP_PROCESSOR_P4M; + break; + case 9: + /* + * D-stepping [M-P4-M or M-P4/533] + * + * this is totally strange: CPUID 0x0F29 is + * used by M-P4-M, M-P4/533 and(!) Celeron CPUs. + * The latter need to be sorted out as they don't + * support speedstep. + * Celerons with CPUID 0x0F29 may have either + * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything + * specific. + * M-P4-Ms may have either ebx=0xe or 0xf [see above] + * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] + * also, M-P4M HTs have ebx=0x8, too + * For now, they are distinguished by the model_id string + */ + if ((ebx == 0x0e) || (strstr(c->x86_model_id,"Mobile Intel(R) Pentium(R) 4") != NULL)) + return SPEEDSTEP_PROCESSOR_P4M; + break; + default: + break; + } + return 0; + } + + switch (c->x86_model) { + case 0x0B: /* Intel PIII [Tualatin] */ + /* cpuid_ebx(1) is 0x04 for desktop PIII, + 0x06 for mobile PIII-M */ + ebx = cpuid_ebx(0x00000001); + dprintk("ebx is %x\n", ebx); + + ebx &= 0x000000FF; + + if (ebx != 0x06) + return 0; + + /* So far all PIII-M processors support SpeedStep. See + * Intel's 24540640.pdf of June 2003 + */ + + return SPEEDSTEP_PROCESSOR_PIII_T; + + case 0x08: /* Intel PIII [Coppermine] */ + + /* all mobile PIII Coppermines have FSB 100 MHz + * ==> sort out a few desktop PIIIs. */ + rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); + dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi); + msr_lo &= 0x00c0000; + if (msr_lo != 0x0080000) + return 0; + + /* + * If the processor is a mobile version, + * platform ID has bit 50 set + * it has SpeedStep technology if either + * bit 56 or 57 is set + */ + rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); + dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi); + if ((msr_hi & (1<<18)) && (relaxed_check ? 
1 : (msr_hi & (3<<24)))) { + if (c->x86_mask == 0x01) { + dprintk("early PIII version\n"); + return SPEEDSTEP_PROCESSOR_PIII_C_EARLY; + } else + return SPEEDSTEP_PROCESSOR_PIII_C; + } + + default: + return 0; + } +} +EXPORT_SYMBOL_GPL(speedstep_detect_processor); + + +/********************************************************************* + * DETECT SPEEDSTEP SPEEDS * + *********************************************************************/ + +unsigned int speedstep_get_freqs(unsigned int processor, + unsigned int *low_speed, + unsigned int *high_speed, + void (*set_state) (unsigned int state)) +{ + unsigned int prev_speed; + unsigned int ret = 0; + unsigned long flags; + + if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) + return -EINVAL; + + dprintk("trying to determine both speeds\n"); + + /* get current speed */ + prev_speed = speedstep_get_processor_frequency(processor); + if (!prev_speed) + return -EIO; + + dprintk("previous seped is %u\n", prev_speed); + + local_irq_save(flags); + + /* switch to low state */ + set_state(SPEEDSTEP_LOW); + *low_speed = speedstep_get_processor_frequency(processor); + if (!*low_speed) { + ret = -EIO; + goto out; + } + + dprintk("low seped is %u\n", *low_speed); + + /* switch to high state */ + set_state(SPEEDSTEP_HIGH); + *high_speed = speedstep_get_processor_frequency(processor); + if (!*high_speed) { + ret = -EIO; + goto out; + } + + dprintk("high seped is %u\n", *high_speed); + + if (*low_speed == *high_speed) { + ret = -ENODEV; + goto out; + } + + /* switch to previous state, if necessary */ + if (*high_speed != prev_speed) + set_state(SPEEDSTEP_LOW); + + out: + local_irq_restore(flags); + return (ret); +} +EXPORT_SYMBOL_GPL(speedstep_get_freqs); + +#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK +module_param(relaxed_check, int, 0444); +MODULE_PARM_DESC(relaxed_check, "Don't do all checks for speedstep capability."); +#endif + +MODULE_AUTHOR ("Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION ("Library 
for Intel SpeedStep 1 or 2 cpufreq drivers."); +MODULE_LICENSE ("GPL"); diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h new file mode 100644 index 000000000000..261a2c9b7f6b --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h @@ -0,0 +1,47 @@ +/* + * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> + * + * Licensed under the terms of the GNU GPL License version 2. + * + * Library for common functions for Intel SpeedStep v.1 and v.2 support + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + + + +/* processors */ + +#define SPEEDSTEP_PROCESSOR_PIII_C_EARLY 0x00000001 /* Coppermine core */ +#define SPEEDSTEP_PROCESSOR_PIII_C 0x00000002 /* Coppermine core */ +#define SPEEDSTEP_PROCESSOR_PIII_T 0x00000003 /* Tualatin core */ +#define SPEEDSTEP_PROCESSOR_P4M 0x00000004 /* P4-M */ + +/* the following processors are not speedstep-capable and are not auto-detected + * in speedstep_detect_processor(). However, their speed can be detected using + * the speedstep_get_processor_frequency() call. */ +#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */ +#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */ + +/* speedstep states -- only two of them */ + +#define SPEEDSTEP_HIGH 0x00000000 +#define SPEEDSTEP_LOW 0x00000001 + + +/* detect a speedstep-capable processor */ +extern unsigned int speedstep_detect_processor (void); + +/* detect the current speed (in khz) of the processor */ +extern unsigned int speedstep_get_processor_frequency(unsigned int processor); + + +/* detect the low and high speeds of the processor. The callback + * set_state"'s first argument is either SPEEDSTEP_HIGH or + * SPEEDSTEP_LOW; the second argument is zero so that no + * cpufreq_notify_transition calls are initiated. 
+ */ +extern unsigned int speedstep_get_freqs(unsigned int processor, + unsigned int *low_speed, + unsigned int *high_speed, + void (*set_state) (unsigned int state)); diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c b/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c new file mode 100644 index 000000000000..79440b3f087e --- /dev/null +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c @@ -0,0 +1,424 @@ +/* + * Intel SpeedStep SMI driver. + * + * (C) 2003 Hiroshi Miura <miura@da-cha.org> + * + * Licensed under the terms of the GNU GPL License version 2. + * + */ + + +/********************************************************************* + * SPEEDSTEP - DEFINITIONS * + *********************************************************************/ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <asm/ist.h> + +#include "speedstep-lib.h" + +/* speedstep system management interface port/command. + * + * These parameters are got from IST-SMI BIOS call. + * If user gives it, these are used. + * + */ +static int smi_port = 0; +static int smi_cmd = 0; +static unsigned int smi_sig = 0; + +/* info about the processor */ +static unsigned int speedstep_processor = 0; + +/* + * There are only two frequency states for each processor. Values + * are in kHz for the time being. + */ +static struct cpufreq_frequency_table speedstep_freqs[] = { + {SPEEDSTEP_HIGH, 0}, + {SPEEDSTEP_LOW, 0}, + {0, CPUFREQ_TABLE_END}, +}; + +#define GET_SPEEDSTEP_OWNER 0 +#define GET_SPEEDSTEP_STATE 1 +#define SET_SPEEDSTEP_STATE 2 +#define GET_SPEEDSTEP_FREQS 4 + +/* how often shall the SMI call be tried if it failed, e.g. because + * of DMA activity going on? */ +#define SMI_TRIES 5 + +#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-smi", msg) + +/** + * speedstep_smi_ownership + */ +static int speedstep_smi_ownership (void) +{ + u32 command, result, magic; + u32 function = GET_SPEEDSTEP_OWNER; + unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + magic = virt_to_phys(magic_data); + + dprintk("trying to obtain ownership with command %x at port %x\n", command, smi_port); + + __asm__ __volatile__( + "out %%al, (%%dx)\n" + : "=D" (result) + : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic) + ); + + dprintk("result is %x\n", result); + + return result; +} + +/** + * speedstep_smi_get_freqs - get SpeedStep preferred & current freq. + * @low: the low frequency value is placed here + * @high: the high frequency value is placed here + * + * Only available on later SpeedStep-enabled systems, returns false results or + * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing + * shows that the latter occurs if !(ist_info.event & 0xFFFF). + */ +static int speedstep_smi_get_freqs (unsigned int *low, unsigned int *high) +{ + u32 command, result = 0, edi, high_mhz, low_mhz; + u32 state=0; + u32 function = GET_SPEEDSTEP_FREQS; + + if (!(ist_info.event & 0xFFFF)) { + dprintk("bug #1422 -- can't read freqs from BIOS\n", result); + return -ENODEV; + } + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + + dprintk("trying to determine frequencies with command %x at port %x\n", command, smi_port); + + __asm__ __volatile__("movl $0, %%edi\n" + "out %%al, (%%dx)\n" + : "=a" (result), "=b" (high_mhz), "=c" (low_mhz), "=d" (state), "=D" (edi) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0) + ); + + dprintk("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); + + /* abort if results are obviously incorrect... 
*/ + if ((high_mhz + low_mhz) < 600) + return -EINVAL; + + *high = high_mhz * 1000; + *low = low_mhz * 1000; + + return result; +} + +/** + * speedstep_get_state - set the SpeedStep state + * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) + * + */ +static int speedstep_get_state (void) +{ + u32 function=GET_SPEEDSTEP_STATE; + u32 result, state, edi, command; + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + + dprintk("trying to determine current setting with command %x at port %x\n", command, smi_port); + + __asm__ __volatile__("movl $0, %%edi\n" + "out %%al, (%%dx)\n" + : "=a" (result), "=b" (state), "=D" (edi) + : "a" (command), "b" (function), "c" (0), "d" (smi_port), "S" (0) + ); + + dprintk("state is %x, result is %x\n", state, result); + + return (state & 1); +} + + +/** + * speedstep_set_state - set the SpeedStep state + * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) + * + */ +static void speedstep_set_state (unsigned int state) +{ + unsigned int result = 0, command, new_state; + unsigned long flags; + unsigned int function=SET_SPEEDSTEP_STATE; + unsigned int retry = 0; + + if (state > 0x1) + return; + + /* Disable IRQs */ + local_irq_save(flags); + + command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); + + dprintk("trying to set frequency to state %u with command %x at port %x\n", state, command, smi_port); + + do { + if (retry) { + dprintk("retry %u, previous result %u, waiting...\n", retry, result); + mdelay(retry * 50); + } + retry++; + __asm__ __volatile__( + "movl $0, %%edi\n" + "out %%al, (%%dx)\n" + : "=b" (new_state), "=D" (result) + : "a" (command), "b" (function), "c" (state), "d" (smi_port), "S" (0) + ); + } while ((new_state != state) && (retry <= SMI_TRIES)); + + /* enable IRQs */ + local_irq_restore(flags); + + if (new_state == state) { + dprintk("change to %u MHz succeeded after %u tries with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); + } else { + 
printk(KERN_ERR "cpufreq: change failed with new_state %u and result %u\n", new_state, result); + } + + return; +} + + +/** + * speedstep_target - set a new CPUFreq policy + * @policy: new policy + * @target_freq: new freq + * @relation: + * + * Sets a new CPUFreq policy/freq. + */ +static int speedstep_target (struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + unsigned int newstate = 0; + struct cpufreq_freqs freqs; + + if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) + return -EINVAL; + + freqs.old = speedstep_freqs[speedstep_get_state()].frequency; + freqs.new = speedstep_freqs[newstate].frequency; + freqs.cpu = 0; /* speedstep.c is UP only driver */ + + if (freqs.old == freqs.new) + return 0; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + speedstep_set_state(newstate); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return 0; +} + + +/** + * speedstep_verify - verifies a new CPUFreq policy + * @policy: new policy + * + * Limit must be within speedstep_low_freq and speedstep_high_freq, with + * at least one border included. 
+ */ +static int speedstep_verify (struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); +} + + +static int speedstep_cpu_init(struct cpufreq_policy *policy) +{ + int result; + unsigned int speed,state; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + result = speedstep_smi_ownership(); + if (result) { + dprintk("fails in aquiring ownership of a SMI interface.\n"); + return -EINVAL; + } + + /* detect low and high frequency */ + result = speedstep_smi_get_freqs(&speedstep_freqs[SPEEDSTEP_LOW].frequency, + &speedstep_freqs[SPEEDSTEP_HIGH].frequency); + if (result) { + /* fall back to speedstep_lib.c dection mechanism: try both states out */ + dprintk("could not detect low and high frequencies by SMI call.\n"); + result = speedstep_get_freqs(speedstep_processor, + &speedstep_freqs[SPEEDSTEP_LOW].frequency, + &speedstep_freqs[SPEEDSTEP_HIGH].frequency, + &speedstep_set_state); + + if (result) { + dprintk("could not detect two different speeds -- aborting.\n"); + return result; + } else + dprintk("workaround worked.\n"); + } + + /* get current speed setting */ + state = speedstep_get_state(); + speed = speedstep_freqs[state].frequency; + + dprintk("currently at %s speed setting - %i MHz\n", + (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? 
"low" : "high", + (speed / 1000)); + + /* cpuinfo and default policy values */ + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = speed; + + result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); + if (result) + return (result); + + cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); + + return 0; +} + +static int speedstep_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int speedstep_get(unsigned int cpu) +{ + if (cpu) + return -ENODEV; + return speedstep_get_processor_frequency(speedstep_processor); +} + + +static int speedstep_resume(struct cpufreq_policy *policy) +{ + int result = speedstep_smi_ownership(); + + if (result) + dprintk("fails in re-aquiring ownership of a SMI interface.\n"); + + return result; +} + +static struct freq_attr* speedstep_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver speedstep_driver = { + .name = "speedstep-smi", + .verify = speedstep_verify, + .target = speedstep_target, + .init = speedstep_cpu_init, + .exit = speedstep_cpu_exit, + .get = speedstep_get, + .resume = speedstep_resume, + .owner = THIS_MODULE, + .attr = speedstep_attr, +}; + +/** + * speedstep_init - initializes the SpeedStep CPUFreq driver + * + * Initializes the SpeedStep support. Returns -ENODEV on unsupported + * BIOS, -EINVAL on problems during initiatization, and zero on + * success. 
+ */ +static int __init speedstep_init(void) +{ + speedstep_processor = speedstep_detect_processor(); + + switch (speedstep_processor) { + case SPEEDSTEP_PROCESSOR_PIII_T: + case SPEEDSTEP_PROCESSOR_PIII_C: + case SPEEDSTEP_PROCESSOR_PIII_C_EARLY: + break; + default: + speedstep_processor = 0; + } + + if (!speedstep_processor) { + dprintk ("No supported Intel CPU detected.\n"); + return -ENODEV; + } + + dprintk("signature:0x%.8lx, command:0x%.8lx, event:0x%.8lx, perf_level:0x%.8lx.\n", + ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level); + + + /* Error if no IST-SMI BIOS or no PARM + sig= 'ISGE' aka 'Intel Speedstep Gate E' */ + if ((ist_info.signature != 0x47534943) && ( + (smi_port == 0) || (smi_cmd == 0))) + return -ENODEV; + + if (smi_sig == 1) + smi_sig = 0x47534943; + else + smi_sig = ist_info.signature; + + /* setup smi_port from MODLULE_PARM or BIOS */ + if ((smi_port > 0xff) || (smi_port < 0)) { + return -EINVAL; + } else if (smi_port == 0) { + smi_port = ist_info.command & 0xff; + } + + if ((smi_cmd > 0xff) || (smi_cmd < 0)) { + return -EINVAL; + } else if (smi_cmd == 0) { + smi_cmd = (ist_info.command >> 16) & 0xff; + } + + return cpufreq_register_driver(&speedstep_driver); +} + + +/** + * speedstep_exit - unregisters SpeedStep support + * + * Unregisters SpeedStep support. 
+ */ +static void __exit speedstep_exit(void) +{ + cpufreq_unregister_driver(&speedstep_driver); +} + +module_param(smi_port, int, 0444); +module_param(smi_cmd, int, 0444); +module_param(smi_sig, uint, 0444); + +MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value -- Intel's default setting is 0xb2"); +MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value -- Intel's default setting is 0x82"); +MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the SMI interface."); + +MODULE_AUTHOR ("Hiroshi Miura"); +MODULE_DESCRIPTION ("Speedstep driver for IST applet SMI interface."); +MODULE_LICENSE ("GPL"); + +module_init(speedstep_init); +module_exit(speedstep_exit); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c new file mode 100644 index 000000000000..ba4b01138c8f --- /dev/null +++ b/arch/i386/kernel/cpu/cyrix.c @@ -0,0 +1,439 @@ +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/delay.h> +#include <linux/pci.h> +#include <asm/dma.h> +#include <asm/io.h> +#include <asm/processor.h> +#include <asm/timer.h> + +#include "cpu.h" + +/* + * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU + */ +static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned char ccr2, ccr3; + unsigned long flags; + + /* we test for DEVID by checking whether CCR3 is writable */ + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, ccr3 ^ 0x80); + getCx86(0xc0); /* dummy to change bus */ + + if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. 
*/ + ccr2 = getCx86(CX86_CCR2); + setCx86(CX86_CCR2, ccr2 ^ 0x04); + getCx86(0xc0); /* dummy */ + + if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */ + *dir0 = 0xfd; + else { /* Cx486S A step */ + setCx86(CX86_CCR2, ccr2); + *dir0 = 0xfe; + } + } + else { + setCx86(CX86_CCR3, ccr3); /* restore CCR3 */ + + /* read DIR0 and DIR1 CPU registers */ + *dir0 = getCx86(CX86_DIR0); + *dir1 = getCx86(CX86_DIR1); + } + local_irq_restore(flags); +} + +/* + * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in + * order to identify the Cyrix CPU model after we're out of setup.c + * + * Actually since bugs.h doesn't even reference this perhaps someone should + * fix the documentation ??? + */ +static unsigned char Cx86_dir0_msb __initdata = 0; + +static char Cx86_model[][9] __initdata = { + "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", + "M II ", "Unknown" +}; +static char Cx486_name[][5] __initdata = { + "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", + "SRx2", "DRx2" +}; +static char Cx486S_name[][4] __initdata = { + "S", "S2", "Se", "S2e" +}; +static char Cx486D_name[][4] __initdata = { + "DX", "DX2", "?", "?", "?", "DX4" +}; +static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock"; +static char cyrix_model_mult1[] __initdata = "12??43"; +static char cyrix_model_mult2[] __initdata = "12233445"; + +/* + * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old + * BIOSes for compatibility with DOS games. This makes the udelay loop + * work correctly, and improves performance. + * + * FIXME: our newer udelay uses the tsc. 
We don't need to frob with SLOP + */ + +extern void calibrate_delay(void) __init; + +static void __init check_cx686_slop(struct cpuinfo_x86 *c) +{ + unsigned long flags; + + if (Cx86_dir0_msb == 3) { + unsigned char ccr3, ccr5; + + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ccr5 = getCx86(CX86_CCR5); + if (ccr5 & 2) + setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + local_irq_restore(flags); + + if (ccr5 & 2) { /* possible wrong calibration done */ + printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n"); + calibrate_delay(); + c->loops_per_jiffy = loops_per_jiffy; + } + } +} + + +static void __init set_cx86_reorder(void) +{ + u8 ccr3; + + printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n"); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + + /* Load/Store Serialize to mem access disable (=reorder it) */ + setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); + /* set load/store serialize from 1GB to 4GB */ + ccr3 |= 0xe0; + setCx86(CX86_CCR3, ccr3); +} + +static void __init set_cx86_memwb(void) +{ + u32 cr0; + + printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + + /* CCR2 bit 2: unlock NW bit */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); + /* set 'Not Write-through' */ + cr0 = 0x20000000; + __asm__("movl %%cr0,%%eax\n\t" + "orl %0,%%eax\n\t" + "movl %%eax,%%cr0\n" + : : "r" (cr0) + :"ax"); + /* CCR2 bit 2: lock NW bit and set WT1 */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); +} + +static void __init set_cx86_inc(void) +{ + unsigned char ccr3; + + printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n"); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + /* PCR1 -- Performance Control */ + /* Incrementor on, whatever that is */ + setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); + 
/* PCR0 -- Performance Control */ + /* Incrementor Margin 10 */ + setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ +} + +/* + * Configure later MediaGX and/or Geode processor. + */ + +static void __init geode_configure(void) +{ + unsigned long flags; + u8 ccr3, ccr4; + local_irq_save(flags); + + /* Suspend on halt power saving and enable #SUSP pin */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */ + + ccr4 = getCx86(CX86_CCR4); + ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */ + + setCx86(CX86_CCR3, ccr3); + + set_cx86_memwb(); + set_cx86_reorder(); + set_cx86_inc(); + + local_irq_restore(flags); +} + + +#ifdef CONFIG_PCI +static struct pci_device_id cyrix_55x0[] = { + { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) }, + { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) }, + { }, +}; +#endif + +static void __init init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; + char *buf = c->x86_model_id; + const char *p = NULL; + + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; + 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + clear_bit(0*32+31, c->x86_capability); + + /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */ + if ( test_bit(1*32+24, c->x86_capability) ) { + clear_bit(1*32+24, c->x86_capability); + set_bit(X86_FEATURE_CXMMX, c->x86_capability); + } + + do_cyrix_devid(&dir0, &dir1); + + check_cx686_slop(c); + + Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + dir0_lsn = dir0 & 0xf; /* model or clock multiplier */ + + /* common case step number/rev -- exceptions handled below */ + c->x86_model = (dir1 >> 4) + 1; + c->x86_mask = dir1 & 0xf; + + /* Now cook; the original recipe is by Channing Corn, from Cyrix. + * We do the same thing for each generation: we work out + * the model, multiplier and stepping. 
Black magic included, + * to make the silicon step/rev numbers match the printed ones. + */ + + switch (dir0_msn) { + unsigned char tmp; + + case 0: /* Cx486SLC/DLC/SRx/DRx */ + p = Cx486_name[dir0_lsn & 7]; + break; + + case 1: /* Cx486S/DX/DX2/DX4 */ + p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5] + : Cx486S_name[dir0_lsn & 3]; + break; + + case 2: /* 5x86 */ + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + p = Cx86_cb+2; + break; + + case 3: /* 6x86/6x86L */ + Cx86_cb[1] = ' '; + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + if (dir1 > 0x21) { /* 686L */ + Cx86_cb[0] = 'L'; + p = Cx86_cb; + (c->x86_model)++; + } else /* 686 */ + p = Cx86_cb+1; + /* Emulate MTRRs using Cyrix's ARRs. */ + set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability); + /* 6x86's contain this bug */ + c->coma_bug = 1; + break; + + case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ +#ifdef CONFIG_PCI + /* It isn't really a PCI quirk directly, but the cure is the + same. The MediaGX has deep magic SMM stuff that handles the + SB emulation. It thows away the fifo on disable_dma() which + is wrong and ruins the audio. + + Bug2: VSA1 has a wrap bug so that using maximum sized DMA + causes bad things. According to NatSemi VSA2 has another + bug to do with 'hlt'. I've not seen any boards using VSA2 + and X doesn't seem to support it either so who cares 8). + VSA1 we work around however. + */ + + printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); + isa_dma_bridge_buggy = 2; +#endif + c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ + + /* + * The 5510/5520 companion chips have a funky PIT. 
+ */ + if (pci_dev_present(cyrix_55x0)) + pit_latch_buggy = 1; + + /* GXm supports extended cpuid levels 'ala' AMD */ + if (c->cpuid_level == 2) { + /* Enable cxMMX extensions (GX1 Datasheet 54) */ + setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + + /* GXlv/GXm/GX1 */ + if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63) + geode_configure(); + get_model_name(c); /* get CPU marketing name */ + return; + } + else { /* MediaGX */ + Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; + p = Cx86_cb+2; + c->x86_model = (dir1 & 0x20) ? 1 : 2; + } + break; + + case 5: /* 6x86MX/M II */ + if (dir1 > 7) + { + dir0_msn++; /* M II */ + /* Enable MMX extensions (App note 108) */ + setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + } + else + { + c->coma_bug = 1; /* 6x86MX, it has the bug. */ + } + tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0; + Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7]; + p = Cx86_cb+tmp; + if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20)) + (c->x86_model)++; + /* Emulate MTRRs using Cyrix's ARRs. */ + set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability); + break; + + case 0xf: /* Cyrix 486 without DEVID registers */ + switch (dir0_lsn) { + case 0xd: /* either a 486SLC or DLC w/o DEVID */ + dir0_msn = 0; + p = Cx486_name[(c->hard_math) ? 1 : 0]; + break; + + case 0xe: /* a 486S A step */ + dir0_msn = 0; + p = Cx486S_name[0]; + break; + } + break; + + default: /* unknown (shouldn't happen, we know everyone ;-) */ + dir0_msn = 7; + break; + } + strcpy(buf, Cx86_model[dir0_msn & 7]); + if (p) strcat(buf, p); + return; +} + +/* + * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected + * by the fact that they preserve the flags across the division of 5/2. + * PII and PPro exhibit this behavior too, but they have cpuid available. + */ + +/* + * Perform the Cyrix 5/2 test. A Cyrix won't change + * the flags, while other 486 chips will. 
+ */ +static inline int test_cyrix_52div(void) +{ + unsigned int test; + + __asm__ __volatile__( + "sahf\n\t" /* clear flags (%eax = 0x0005) */ + "div %b2\n\t" /* divide 5 by 2 */ + "lahf" /* store flags into %ah */ + : "=a" (test) + : "0" (5), "q" (2) + : "cc"); + + /* AH is 0x02 on Cyrix after the divide.. */ + return (unsigned char) (test >> 8) == 0x02; +} + +static void cyrix_identify(struct cpuinfo_x86 * c) +{ + /* Detect Cyrix with disabled CPUID */ + if ( c->x86 == 4 && test_cyrix_52div() ) { + unsigned char dir0, dir1; + + strcpy(c->x86_vendor_id, "CyrixInstead"); + c->x86_vendor = X86_VENDOR_CYRIX; + + /* Actually enable cpuid on the older cyrix */ + + /* Retrieve CPU revisions */ + + do_cyrix_devid(&dir0, &dir1); + + dir0>>=4; + + /* Check it is an affected model */ + + if (dir0 == 5 || dir0 == 3) + { + unsigned char ccr3, ccr4; + unsigned long flags; + printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ccr4 = getCx86(CX86_CCR4); + setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + local_irq_restore(flags); + } + } + generic_identify(c); +} + +static struct cpu_dev cyrix_cpu_dev __initdata = { + .c_vendor = "Cyrix", + .c_ident = { "CyrixInstead" }, + .c_init = init_cyrix, + .c_identify = cyrix_identify, +}; + +int __init cyrix_init_cpu(void) +{ + cpu_devs[X86_VENDOR_CYRIX] = &cyrix_cpu_dev; + return 0; +} + +//early_arch_initcall(cyrix_init_cpu); + +static struct cpu_dev nsc_cpu_dev __initdata = { + .c_vendor = "NSC", + .c_ident = { "Geode by NSC" }, + .c_init = init_cyrix, + .c_identify = generic_identify, +}; + +int __init nsc_init_cpu(void) +{ + cpu_devs[X86_VENDOR_NSC] = &nsc_cpu_dev; + return 0; +} + +//early_arch_initcall(nsc_init_cpu); diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c new file mode 100644 index 000000000000..b8d847b850dc --- /dev/null 
+++ b/arch/i386/kernel/cpu/intel.c @@ -0,0 +1,248 @@ +#include <linux/config.h> +#include <linux/init.h> +#include <linux/kernel.h> + +#include <linux/string.h> +#include <linux/bitops.h> +#include <linux/smp.h> +#include <linux/thread_info.h> + +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/uaccess.h> + +#include "cpu.h" + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <mach_apic.h> +#endif + +extern int trap_init_f00f_bug(void); + +#ifdef CONFIG_X86_INTEL_USERCOPY +/* + * Alignment at which movsl is preferred for bulk memory copies. + */ +struct movsl_mask movsl_mask; +#endif + +void __init early_intel_workaround(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_INTEL) + return; + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +} + +/* + * Early probe support logic for ppro memory erratum #50 + * + * This is called before we do cpu ident work + */ + +int __init ppro_with_ram_bug(void) +{ + /* Uses data from early_cpu_detect now */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_mask < 8) { + printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + return 1; + } + return 0; +} + + +/* + * P4 Xeon errata 037 workaround. + * Hardware prefetcher may cause stale data to be loaded into the cache. 
+ */ +static void __init Intel_errata_workarounds(struct cpuinfo_x86 *c) +{ + unsigned long lo, hi; + + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { + rdmsr (MSR_IA32_MISC_ENABLE, lo, hi); + if ((lo & (1<<9)) == 0) { + printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); + printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); + lo |= (1<<9); /* Disable hw prefetching */ + wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + } + } +} + + +static void __init init_intel(struct cpuinfo_x86 *c) +{ + unsigned int l2 = 0; + char *p = NULL; + +#ifdef CONFIG_X86_F00F_BUG + /* + * All current models of Pentium and Pentium with MMX technology CPUs + * have the F0 0F bug, which lets nonprivileged users lock up the system. + * Note that the workaround only should be initialized once... + */ + c->f00f_bug = 0; + if ( c->x86 == 5 ) { + static int f00f_workaround_enabled = 0; + + c->f00f_bug = 1; + if ( !f00f_workaround_enabled ) { + trap_init_f00f_bug(); + printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + f00f_workaround_enabled = 1; + } + } +#endif + + select_idle_routine(c); + l2 = init_intel_cacheinfo(c); + + /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ + if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + clear_bit(X86_FEATURE_SEP, c->x86_capability); + + /* Names for the Pentium II/Celeron processors + detectable only by also checking the cache size. + Dixon is NOT a Celeron. 
*/ + if (c->x86 == 6) { + switch (c->x86_model) { + case 5: + if (c->x86_mask == 0) { + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; + } + break; + + case 6: + if (l2 == 128) + p = "Celeron (Mendocino)"; + else if (c->x86_mask == 0 || c->x86_mask == 5) + p = "Celeron-A"; + break; + + case 8: + if (l2 == 128) + p = "Celeron (Coppermine)"; + break; + } + } + + if ( p ) + strcpy(c->x86_model_id, p); + + detect_ht(c); + + /* Work around errata */ + Intel_errata_workarounds(c); + +#ifdef CONFIG_X86_INTEL_USERCOPY + /* + * Set up the preferred alignment for movsl bulk memory moves + */ + switch (c->x86) { + case 4: /* 486: untested */ + break; + case 5: /* Old Pentia: untested */ + break; + case 6: /* PII/PIII only like movsl with 8-byte alignment */ + movsl_mask.mask = 7; + break; + case 15: /* P4 is OK down to 8-byte alignment */ + movsl_mask.mask = 7; + break; + } +#endif + + if (c->x86 == 15) + set_bit(X86_FEATURE_P4, c->x86_capability); + if (c->x86 == 6) + set_bit(X86_FEATURE_P3, c->x86_capability); +} + + +static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) +{ + /* Intel PIII Tualatin. This comes in two flavours. + * One has 256kb of cache, the other 512. We have no way + * to determine which, so we use a boottime override + * for the 512kb model, and assume 256 otherwise. 
+ */ + if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + size = 256; + return size; +} + +static struct cpu_dev intel_cpu_dev __initdata = { + .c_vendor = "Intel", + .c_ident = { "GenuineIntel" }, + .c_models = { + { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = + { + [0] = "486 DX-25/33", + [1] = "486 DX-50", + [2] = "486 SX", + [3] = "486 DX/2", + [4] = "486 SL", + [5] = "486 SX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB" + } + }, + { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names = + { + [0] = "Pentium 60/66 A-step", + [1] = "Pentium 60/66", + [2] = "Pentium 75 - 200", + [3] = "OverDrive PODP5V83", + [4] = "Pentium MMX", + [7] = "Mobile Pentium 75 - 200", + [8] = "Mobile Pentium MMX" + } + }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = + { + [0] = "Pentium Pro A-step", + [1] = "Pentium Pro", + [3] = "Pentium II (Klamath)", + [4] = "Pentium II (Deschutes)", + [5] = "Pentium II (Deschutes)", + [6] = "Mobile Pentium II", + [7] = "Pentium III (Katmai)", + [8] = "Pentium III (Coppermine)", + [10] = "Pentium III (Cascades)", + [11] = "Pentium III (Tualatin)", + } + }, + { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names = + { + [0] = "Pentium 4 (Unknown)", + [1] = "Pentium 4 (Willamette)", + [2] = "Pentium 4 (Northwood)", + [4] = "Pentium 4 (Foster)", + [5] = "Pentium 4 (Foster)", + } + }, + }, + .c_init = init_intel, + .c_identify = generic_identify, + .c_size_cache = intel_size_cache, +}; + +__init int intel_cpu_init(void) +{ + cpu_devs[X86_VENDOR_INTEL] = &intel_cpu_dev; + return 0; +} + +// arch_initcall(intel_cpu_init); + diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c new file mode 100644 index 000000000000..aeb5b4ef8c8b --- /dev/null +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -0,0 +1,598 @@ +/* + * Routines to indentify caches on Intel CPU. 
+ * + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/device.h> +#include <linux/compiler.h> +#include <linux/cpu.h> + +#include <asm/processor.h> +#include <asm/smp.h> + +#define LVL_1_INST 1 +#define LVL_1_DATA 2 +#define LVL_2 3 +#define LVL_3 4 +#define LVL_TRACE 5 + +struct _cache_table +{ + unsigned char descriptor; + char cache_type; + short size; +}; + +/* all the cache descriptor types we care about (no TLB or trace cache entries) */ +static struct _cache_table cache_table[] __initdata = +{ + { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ + { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ + { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ + { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ + { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ + { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ + { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ + { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line 
size */ + { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ + { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ + { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ + { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ + { 0x00, 0, 0} +}; + + +enum _cache_type +{ + CACHE_TYPE_NULL = 0, + CACHE_TYPE_DATA = 1, + CACHE_TYPE_INST = 2, + CACHE_TYPE_UNIFIED = 3 +}; + +union _cpuid4_leaf_eax { + struct { + enum _cache_type type:5; + unsigned int level:3; + unsigned int is_self_initializing:1; + unsigned int is_fully_associative:1; + unsigned int reserved:4; + unsigned int num_threads_sharing:12; + unsigned int num_cores_on_die:6; + } split; + u32 full; +}; + +union _cpuid4_leaf_ebx { + struct { + unsigned int coherency_line_size:12; + unsigned int physical_line_partition:10; + unsigned int ways_of_associativity:10; + } split; + u32 full; +}; + +union _cpuid4_leaf_ecx { + struct { + 
unsigned int number_of_sets:32; + } split; + u32 full; +}; + +struct _cpuid4_info { + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned long size; + cpumask_t shared_cpu_map; +}; + +#define MAX_CACHE_LEAVES 4 +static unsigned short __devinitdata num_cache_leaves; + +static int __devinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) +{ + unsigned int eax, ebx, ecx, edx; + union _cpuid4_leaf_eax cache_eax; + + cpuid_count(4, index, &eax, &ebx, &ecx, &edx); + cache_eax.full = eax; + if (cache_eax.split.type == CACHE_TYPE_NULL) + return -1; + + this_leaf->eax.full = eax; + this_leaf->ebx.full = ebx; + this_leaf->ecx.full = ecx; + this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) * + (this_leaf->ebx.split.coherency_line_size + 1) * + (this_leaf->ebx.split.physical_line_partition + 1) * + (this_leaf->ebx.split.ways_of_associativity + 1); + return 0; +} + +static int __init find_num_cache_leaves(void) +{ + unsigned int eax, ebx, ecx, edx; + union _cpuid4_leaf_eax cache_eax; + int i; + int retval; + + retval = MAX_CACHE_LEAVES; + /* Do cpuid(4) loop to find out num_cache_leaves */ + for (i = 0; i < MAX_CACHE_LEAVES; i++) { + cpuid_count(4, i, &eax, &ebx, &ecx, &edx); + cache_eax.full = eax; + if (cache_eax.split.type == CACHE_TYPE_NULL) { + retval = i; + break; + } + } + return retval; +} + +unsigned int __init init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ + unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ + unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ + + if (c->cpuid_level > 4) { + static int is_initialized; + + if (is_initialized == 0) { + /* Init num_cache_leaves from boot CPU */ + num_cache_leaves = find_num_cache_leaves(); + is_initialized++; + } + + /* + * Whenever possible use cpuid(4), deterministic cache + * parameters cpuid leaf to find the cache details + */ + 
for (i = 0; i < num_cache_leaves; i++) { + struct _cpuid4_info this_leaf; + + int retval; + + retval = cpuid4_cache_lookup(i, &this_leaf); + if (retval >= 0) { + switch(this_leaf.eax.split.level) { + case 1: + if (this_leaf.eax.split.type == + CACHE_TYPE_DATA) + new_l1d = this_leaf.size/1024; + else if (this_leaf.eax.split.type == + CACHE_TYPE_INST) + new_l1i = this_leaf.size/1024; + break; + case 2: + new_l2 = this_leaf.size/1024; + break; + case 3: + new_l3 = this_leaf.size/1024; + break; + default: + break; + } + } + } + } + if (c->cpuid_level > 1) { + /* supports eax=2 call */ + int i, j, n; + int regs[4]; + unsigned char *dp = (unsigned char *)regs; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for ( i = 0 ; i < n ; i++ ) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for ( j = 0 ; j < 3 ; j++ ) { + if ( regs[j] < 0 ) regs[j] = 0; + } + + /* Byte 0 is level count, not a descriptor */ + for ( j = 1 ; j < 16 ; j++ ) { + unsigned char des = dp[j]; + unsigned char k = 0; + + /* look up this descriptor in the table */ + while (cache_table[k].descriptor != 0) + { + if (cache_table[k].descriptor == des) { + switch (cache_table[k].cache_type) { + case LVL_1_INST: + l1i += cache_table[k].size; + break; + case LVL_1_DATA: + l1d += cache_table[k].size; + break; + case LVL_2: + l2 += cache_table[k].size; + break; + case LVL_3: + l3 += cache_table[k].size; + break; + case LVL_TRACE: + trace += cache_table[k].size; + break; + } + + break; + } + + k++; + } + } + } + + if (new_l1d) + l1d = new_l1d; + + if (new_l1i) + l1i = new_l1i; + + if (new_l2) + l2 = new_l2; + + if (new_l3) + l3 = new_l3; + + if ( trace ) + printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); + else if ( l1i ) + printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); + if ( l1d ) + printk(", L1 D cache: %dK\n", l1d); + else + printk("\n"); + if ( l2 ) + printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); + if ( l3 ) + printk(KERN_INFO "CPU: L3 
cache: %dK\n", l3); + + /* + * This assumes the L3 cache is shared; it typically lives in + * the northbridge. The L1 caches are included by the L2 + * cache, and so should not be included for the purpose of + * SMP switching weights. + */ + c->x86_cache_size = l2 ? l2 : (l1i+l1d); + } + + return l2; +} + +/* pointer to _cpuid4_info array (for each cache leaf) */ +static struct _cpuid4_info *cpuid4_info[NR_CPUS]; +#define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y])) + +#ifdef CONFIG_SMP +static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf; + unsigned long num_threads_sharing; + + this_leaf = CPUID4_INFO_IDX(cpu, index); + num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; + + if (num_threads_sharing == 1) + cpu_set(cpu, this_leaf->shared_cpu_map); +#ifdef CONFIG_X86_HT + else if (num_threads_sharing == smp_num_siblings) + this_leaf->shared_cpu_map = cpu_sibling_map[cpu]; +#endif + else + printk(KERN_INFO "Number of CPUs sharing cache didn't match " + "any known set of CPUs\n"); +} +#else +static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {} +#endif + +static void free_cache_attributes(unsigned int cpu) +{ + kfree(cpuid4_info[cpu]); + cpuid4_info[cpu] = NULL; +} + +static int __devinit detect_cache_attributes(unsigned int cpu) +{ + struct _cpuid4_info *this_leaf; + unsigned long j; + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; + + cpuid4_info[cpu] = kmalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (unlikely(cpuid4_info[cpu] == NULL)) + return -ENOMEM; + memset(cpuid4_info[cpu], 0, + sizeof(struct _cpuid4_info) * num_cache_leaves); + + /* Do cpuid and store the results */ + for (j = 0; j < num_cache_leaves; j++) { + this_leaf = CPUID4_INFO_IDX(cpu, j); + retval = cpuid4_cache_lookup(j, this_leaf); + if (unlikely(retval < 0)) + goto err_out; + cache_shared_cpu_map_setup(cpu, j); + } + return 0; + +err_out: + 
free_cache_attributes(cpu); + return -ENOMEM; +} + +#ifdef CONFIG_SYSFS + +#include <linux/kobject.h> +#include <linux/sysfs.h> + +extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ + +/* pointer to kobject for cpuX/cache */ +static struct kobject * cache_kobject[NR_CPUS]; + +struct _index_kobject { + struct kobject kobj; + unsigned int cpu; + unsigned short index; +}; + +/* pointer to array of kobjects for cpuX/cache/indexY */ +static struct _index_kobject *index_kobject[NR_CPUS]; +#define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y])) + +#define show_one_plus(file_name, object, val) \ +static ssize_t show_##file_name \ + (struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ +} + +show_one_plus(level, eax.split.level, 0); +show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1); +show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); +show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); +show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); + +static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) +{ + return sprintf (buf, "%luK\n", this_leaf->size / 1024); +} + +static ssize_t show_shared_cpu_map(struct _cpuid4_info *this_leaf, char *buf) +{ + char mask_str[NR_CPUS]; + cpumask_scnprintf(mask_str, NR_CPUS, this_leaf->shared_cpu_map); + return sprintf(buf, "%s\n", mask_str); +} + +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { + switch(this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: + return sprintf(buf, "Data\n"); + break; + case CACHE_TYPE_INST: + return sprintf(buf, "Instruction\n"); + break; + case CACHE_TYPE_UNIFIED: + return sprintf(buf, "Unified\n"); + break; + default: + return sprintf(buf, "Unknown\n"); + break; + } +} + +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *); + ssize_t (*store)(struct _cpuid4_info *, 
const char *, size_t count); +}; + +#define define_one_ro(_name) \ +static struct _cache_attr _name = \ + __ATTR(_name, 0444, show_##_name, NULL) + +define_one_ro(level); +define_one_ro(type); +define_one_ro(coherency_line_size); +define_one_ro(physical_line_partition); +define_one_ro(ways_of_associativity); +define_one_ro(number_of_sets); +define_one_ro(size); +define_one_ro(shared_cpu_map); + +static struct attribute * default_attrs[] = { + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + NULL +}; + +#define to_object(k) container_of(k, struct _index_kobject, kobj) +#define to_attr(a) container_of(a, struct _cache_attr, attr) + +static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) +{ + struct _cache_attr *fattr = to_attr(attr); + struct _index_kobject *this_leaf = to_object(kobj); + ssize_t ret; + + ret = fattr->show ? + fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), + buf) : + 0; + return ret; +} + +static ssize_t store(struct kobject * kobj, struct attribute * attr, + const char * buf, size_t count) +{ + return 0; +} + +static struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_cache = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +static struct kobj_type ktype_percpu_entry = { + .sysfs_ops = &sysfs_ops, +}; + +static void cpuid4_cache_sysfs_exit(unsigned int cpu) +{ + kfree(cache_kobject[cpu]); + kfree(index_kobject[cpu]); + cache_kobject[cpu] = NULL; + index_kobject[cpu] = NULL; + free_cache_attributes(cpu); +} + +static int __devinit cpuid4_cache_sysfs_init(unsigned int cpu) +{ + + if (num_cache_leaves == 0) + return -ENOENT; + + detect_cache_attributes(cpu); + if (cpuid4_info[cpu] == NULL) + return -ENOENT; + + /* Allocate all required memory */ + cache_kobject[cpu] = kmalloc(sizeof(struct kobject), 
GFP_KERNEL); + if (unlikely(cache_kobject[cpu] == NULL)) + goto err_out; + memset(cache_kobject[cpu], 0, sizeof(struct kobject)); + + index_kobject[cpu] = kmalloc( + sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); + if (unlikely(index_kobject[cpu] == NULL)) + goto err_out; + memset(index_kobject[cpu], 0, + sizeof(struct _index_kobject) * num_cache_leaves); + + return 0; + +err_out: + cpuid4_cache_sysfs_exit(cpu); + return -ENOMEM; +} + +/* Add/Remove cache interface for CPU device */ +static int __devinit cache_add_dev(struct sys_device * sys_dev) +{ + unsigned int cpu = sys_dev->id; + unsigned long i, j; + struct _index_kobject *this_object; + int retval = 0; + + retval = cpuid4_cache_sysfs_init(cpu); + if (unlikely(retval < 0)) + return retval; + + cache_kobject[cpu]->parent = &sys_dev->kobj; + kobject_set_name(cache_kobject[cpu], "%s", "cache"); + cache_kobject[cpu]->ktype = &ktype_percpu_entry; + retval = kobject_register(cache_kobject[cpu]); + + for (i = 0; i < num_cache_leaves; i++) { + this_object = INDEX_KOBJECT_PTR(cpu,i); + this_object->cpu = cpu; + this_object->index = i; + this_object->kobj.parent = cache_kobject[cpu]; + kobject_set_name(&(this_object->kobj), "index%1lu", i); + this_object->kobj.ktype = &ktype_cache; + retval = kobject_register(&(this_object->kobj)); + if (unlikely(retval)) { + for (j = 0; j < i; j++) { + kobject_unregister( + &(INDEX_KOBJECT_PTR(cpu,j)->kobj)); + } + kobject_unregister(cache_kobject[cpu]); + cpuid4_cache_sysfs_exit(cpu); + break; + } + } + return retval; +} + +static int __devexit cache_remove_dev(struct sys_device * sys_dev) +{ + unsigned int cpu = sys_dev->id; + unsigned long i; + + for (i = 0; i < num_cache_leaves; i++) + kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_unregister(cache_kobject[cpu]); + cpuid4_cache_sysfs_exit(cpu); + return 0; +} + +static struct sysdev_driver cache_sysdev_driver = { + .add = cache_add_dev, + .remove = __devexit_p(cache_remove_dev), +}; + +/* 
Register/Unregister the cpu_cache driver */ +static int __devinit cache_register_driver(void) +{ + if (num_cache_leaves == 0) + return 0; + + return sysdev_driver_register(&cpu_sysdev_class,&cache_sysdev_driver); +} + +device_initcall(cache_register_driver); + +#endif + diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile new file mode 100644 index 000000000000..30808f3d6715 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/Makefile @@ -0,0 +1,2 @@ +obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o +obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c new file mode 100644 index 000000000000..8df52e86c4d2 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -0,0 +1,97 @@ +/* + * Athlon/Hammer specific Machine Check Exception Reporting + * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/config.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine Check Handler For AMD Athlon/Duron */ +static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) +{ + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? 
*/ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + for (i=1; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + if (high&(1<<31)) { + if (high & (1<<29)) + recover |= 1; + if (high & (1<<25)) + recover |= 2; + printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); + high &= ~(1<<31); + if (high & (1<<27)) { + rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); + printk ("[%08x%08x]", ahigh, alow); + } + if (high & (1<<26)) { + rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + printk (" at %08x%08x", ahigh, alow); + } + printk ("\n"); + /* Clear it */ + wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } + + if (recover&2) + panic ("CPU context corrupt"); + if (recover&1) + panic ("Unable to continue"); + printk (KERN_EMERG "Attempting to continue.\n"); + mcgstl &= ~(1<<2); + wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); +} + + +/* AMD K7 machine check is Intel like */ +void __init amd_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + int i; + + machine_check_vector = k7_machine_check; + wmb(); + + printk (KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<8)) /* Control register present ? */ + wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + nr_mce_banks = l & 0xff; + + /* Clear status for MC index 0 separately, we don't touch CTL, + * as some Athlons cause spurious MCEs when its enabled. 
*/ + wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); + for (i=1; i<nr_mce_banks; i++) { + wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + } + + set_in_cr4 (X86_CR4_MCE); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c new file mode 100644 index 000000000000..bf6d1aefafc0 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -0,0 +1,77 @@ +/* + * mce.c - x86 Machine Check Exception Reporting + * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/thread_info.h> + +#include <asm/processor.h> +#include <asm/system.h> + +#include "mce.h" + +int mce_disabled __initdata = 0; +int nr_mce_banks; + +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. 
*/ +void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; + +/* This has to be run for each processor */ +void __init mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled==1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (c->x86==6 || c->x86==15) + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86==5) + intel_p5_mcheck_init(c); + if (c->x86==6) + intel_p6_mcheck_init(c); + if (c->x86==15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86==5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 0; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 0; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h new file mode 100644 index 000000000000..dc2416dfef15 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/mce.h @@ -0,0 +1,14 @@ +#include <linux/init.h> + +void amd_mcheck_init(struct cpuinfo_x86 *c); +void intel_p4_mcheck_init(struct cpuinfo_x86 *c); +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void intel_p6_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); + +/* Call the installed machine check handler for this CPU setup. */ +extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); + +extern int mce_disabled __initdata; +extern int nr_mce_banks; + diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c new file mode 100644 index 000000000000..7864ddfccf07 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c @@ -0,0 +1,93 @@ +/* + * Non Fatal Machine Check Exception Reporting + * + * (C) Copyright 2002 Dave Jones. 
<davej@codemonkey.org.uk> + * + * This file contains routines to check for non-fatal MCEs every 15s + * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/config.h> +#include <linux/irq.h> +#include <linux/workqueue.h> +#include <linux/interrupt.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +static int firstbank; + +#define MCE_RATE 15*HZ /* timer rate is 15s */ + +static void mce_checkregs (void *info) +{ + u32 low, high; + int i; + + for (i=firstbank; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high); + + if (high & (1<<31)) { + printk(KERN_INFO "MCE: The hardware reports a non " + "fatal, correctable incident occurred on " + "CPU %d.\n", + smp_processor_id()); + printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low); + + /* Scrub the error so we don't pick it up in MCE_RATE seconds time. 
*/ + wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } +} + +static void mce_work_fn(void *data); +static DECLARE_WORK(mce_work, mce_work_fn, NULL); + +static void mce_work_fn(void *data) +{ + on_each_cpu(mce_checkregs, NULL, 1, 1); + schedule_delayed_work(&mce_work, MCE_RATE); +} + +static int __init init_nonfatal_mce_checker(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* Check for MCE support */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return -ENODEV; + + /* Check for PPro style MCA */ + if (!cpu_has(c, X86_FEATURE_MCA)) + return -ENODEV; + + /* Some Athlons misbehave when we frob bank 0 */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 6) + firstbank = 1; + else + firstbank = 0; + + /* + * Check for non-fatal errors every MCE_RATE s + */ + schedule_delayed_work(&mce_work, MCE_RATE); + printk(KERN_INFO "Machine check exception polling timer started.\n"); + return 0; +} +module_init(init_nonfatal_mce_checker); + +MODULE_LICENSE("GPL"); diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c new file mode 100644 index 000000000000..8b16ceb929b4 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -0,0 +1,271 @@ +/* + * P4 specific Machine Check Exception Reporting + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/config.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> +#include <asm/apic.h> + +#include "mce.h" + +/* as supported by the P4/Xeon family */ +struct intel_mce_extended_msrs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + u32 edi; + u32 ebp; + u32 esp; + u32 eflags; + u32 eip; + /* u32 *reserved[]; */ +}; + +static int mce_num_extended_msrs = 0; + + +#ifdef CONFIG_X86_MCE_P4THERMAL +static void unexpected_thermal_interrupt(struct pt_regs *regs) +{ + 
printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* P4/Xeon Thermal transition interrupt handler */ +static void intel_thermal_interrupt(struct pt_regs *regs) +{ + u32 l, h; + unsigned int cpu = smp_processor_id(); + static unsigned long next[NR_CPUS]; + + ack_APIC_irq(); + + if (time_after(next[cpu], jiffies)) + return; + + next[cpu] = jiffies + HZ*5; + rdmsr(MSR_IA32_THERM_STATUS, l, h); + if (l & 0x1) { + printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); + printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", + cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + } +} + +/* Thermal interrupt handler for this CPU setup */ +static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; + +fastcall void smp_thermal_interrupt(struct pt_regs *regs) +{ + irq_enter(); + vendor_thermal_interrupt(regs); + irq_exit(); +} + +/* P4/Xeon Thermal regulation detect and init */ +static void __init intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + unsigned int cpu = smp_processor_id(); + + /* Thermal monitoring */ + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; /* -ENODEV */ + + /* Clock modulation */ + if (!cpu_has(c, X86_FEATURE_ACC)) + return; /* -ENODEV */ + + /* first check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already -zwanem. + */ + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1<<3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", + cpu); + return; /* -EBUSY */ + } + + /* check whether a vector already exists, temporarily masked? 
*/ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " + "installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; /* -EBUSY */ + } + + /* The temperature transition interrupt handler setup */ + h = THERMAL_APIC_VECTOR; /* our delivery vector */ + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ + apic_write_around(APIC_LVTTHMR, h); + + rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + + /* ok we're good to go... */ + vendor_thermal_interrupt = intel_thermal_interrupt; + + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); + + l = apic_read (APIC_LVTTHMR); + apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + return; +} +#endif /* CONFIG_X86_MCE_P4THERMAL */ + + +/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ +static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +{ + u32 h; + + if (mce_num_extended_msrs == 0) + goto done; + + rdmsr (MSR_IA32_MCG_EAX, r->eax, h); + rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); + rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); + rdmsr (MSR_IA32_MCG_EDX, r->edx, h); + rdmsr (MSR_IA32_MCG_ESI, r->esi, h); + rdmsr (MSR_IA32_MCG_EDI, r->edi, h); + rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); + rdmsr (MSR_IA32_MCG_ESP, r->esp, h); + rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); + rdmsr (MSR_IA32_MCG_EIP, r->eip, h); + + /* can we rely on kmalloc to do a dynamic + * allocation for the reserved registers? + */ +done: + return mce_num_extended_msrs; +} + +static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +{ + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + struct intel_mce_extended_msrs dbg; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? 
*/ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + if (intel_get_extended_msrs(&dbg)) { + printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", + smp_processor_id(), dbg.eip, dbg.eflags); + printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", + dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); + printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + dbg.esi, dbg.edi, dbg.ebp, dbg.esp); + } + + for (i=0; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + if (high & (1<<31)) { + if (high & (1<<29)) + recover |= 1; + if (high & (1<<25)) + recover |= 2; + printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); + high &= ~(1<<31); + if (high & (1<<27)) { + rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); + printk ("[%08x%08x]", ahigh, alow); + } + if (high & (1<<26)) { + rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + printk (" at %08x%08x", ahigh, alow); + } + printk ("\n"); + } + } + + if (recover & 2) + panic ("CPU context corrupt"); + if (recover & 1) + panic ("Unable to continue"); + + printk(KERN_EMERG "Attempting to continue.\n"); + /* + * Do not clear the MSR_IA32_MCi_STATUS if the error is not + * recoverable/continuable.This will allow BIOS to look at the MSRs + * for errors if the OS could not log the error. + */ + for (i=0; i<nr_mce_banks; i++) { + u32 msr; + msr = MSR_IA32_MC0_STATUS+i*4; + rdmsr (msr, low, high); + if (high&(1<<31)) { + /* Clear it */ + wrmsr(msr, 0UL, 0UL); + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } + mcgstl &= ~(1<<2); + wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); +} + + +void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + int i; + + machine_check_vector = intel_machine_check; + wmb(); + + printk (KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<8)) /* Control register present ? 
*/ + wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + nr_mce_banks = l & 0xff; + + for (i=0; i<nr_mce_banks; i++) { + wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + } + + set_in_cr4 (X86_CR4_MCE); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); + + /* Check for P4/Xeon extended MCE MSRs */ + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<9)) {/* MCG_EXT_P */ + mce_num_extended_msrs = (l >> 16) & 0xff; + printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" + " available\n", + smp_processor_id(), mce_num_extended_msrs); + +#ifdef CONFIG_X86_MCE_P4THERMAL + /* Check for P4/Xeon Thermal monitor */ + intel_init_thermal(c); +#endif + } +} diff --git a/arch/i386/kernel/cpu/mcheck/p5.c b/arch/i386/kernel/cpu/mcheck/p5.c new file mode 100644 index 000000000000..c45a1b485c80 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/p5.c @@ -0,0 +1,54 @@ +/* + * P5 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@redhat.com> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine check handler for Pentium class Intel */ +static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) +{ + u32 loaddr, hi, lotype; + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); + rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); + printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); + if(lotype&(1<<5)) + printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* Set up machine check reporting for processors with Intel style MCE */ +void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + 
/*Check for MCE support */ + if( !cpu_has(c, X86_FEATURE_MCE) ) + return; + + /* Default P5 to off as its often misconnected */ + if(mce_disabled != -1) + return; + machine_check_vector = pentium_machine_check; + wmb(); + + /* Read registers before enabling */ + rdmsr(MSR_IA32_P5_MC_ADDR, l, h); + rdmsr(MSR_IA32_P5_MC_TYPE, l, h); + printk(KERN_INFO "Intel old style machine check architecture supported.\n"); + + /* Enable MCE */ + set_in_cr4(X86_CR4_MCE); + printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); +} diff --git a/arch/i386/kernel/cpu/mcheck/p6.c b/arch/i386/kernel/cpu/mcheck/p6.c new file mode 100644 index 000000000000..46640f8c2494 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/p6.c @@ -0,0 +1,115 @@ +/* + * P6 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@redhat.com> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine Check Handler For PII/PIII */ +static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +{ + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? 
*/ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + for (i=0; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + if (high & (1<<31)) { + if (high & (1<<29)) + recover |= 1; + if (high & (1<<25)) + recover |= 2; + printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); + high &= ~(1<<31); + if (high & (1<<27)) { + rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); + printk ("[%08x%08x]", ahigh, alow); + } + if (high & (1<<26)) { + rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + printk (" at %08x%08x", ahigh, alow); + } + printk ("\n"); + } + } + + if (recover & 2) + panic ("CPU context corrupt"); + if (recover & 1) + panic ("Unable to continue"); + + printk (KERN_EMERG "Attempting to continue.\n"); + /* + * Do not clear the MSR_IA32_MCi_STATUS if the error is not + * recoverable/continuable.This will allow BIOS to look at the MSRs + * for errors if the OS could not log the error. + */ + for (i=0; i<nr_mce_banks; i++) { + unsigned int msr; + msr = MSR_IA32_MC0_STATUS+i*4; + rdmsr (msr,low, high); + if (high & (1<<31)) { + /* Clear it */ + wrmsr (msr, 0UL, 0UL); + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } + mcgstl &= ~(1<<2); + wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); +} + +/* Set up machine check reporting for processors with Intel style MCE */ +void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + int i; + + /* Check for MCE support */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + + /* Check for PPro style MCA */ + if (!cpu_has(c, X86_FEATURE_MCA)) + return; + + /* Ok machine check is available */ + machine_check_vector = intel_machine_check; + wmb(); + + printk (KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<8)) /* Control register present ? 
*/ + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + nr_mce_banks = l & 0xff; + + /* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */ + for (i=1; i<nr_mce_banks; i++) { + wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + } + + set_in_cr4 (X86_CR4_MCE); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/i386/kernel/cpu/mcheck/winchip.c b/arch/i386/kernel/cpu/mcheck/winchip.c new file mode 100644 index 000000000000..753fa7acb984 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/winchip.c @@ -0,0 +1,37 @@ +/* + * IDT Winchip specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@redhat.com> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/interrupt.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine check handler for WinChip C6 */ +static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) +{ + printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); + add_taint(TAINT_MACHINE_CHECK); +} + +/* Set up machine check reporting on the Winchip C6 series */ +void __init winchip_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + machine_check_vector = winchip_machine_check; + wmb(); + rdmsr(MSR_IDT_FCR1, lo, hi); + lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */ + lo&= ~(1<<4); /* Enable MCE */ + wrmsr(MSR_IDT_FCR1, lo, hi); + set_in_cr4(X86_CR4_MCE); + printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); +} diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile new file mode 100644 index 000000000000..a25b701ab84e --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/Makefile @@ -0,0 +1,5 @@ +obj-y := main.o if.o generic.o state.o +obj-y += amd.o +obj-y += cyrix.o +obj-y += centaur.o + diff --git 
a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c new file mode 100644 index 000000000000..1a1e04b6fd00 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/amd.c @@ -0,0 +1,121 @@ +#include <linux/init.h> +#include <linux/mm.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + +#include "mtrr.h" + +static void +amd_get_mtrr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + unsigned long low, high; + + rdmsr(MSR_K6_UWCCR, low, high); + /* Upper dword is region 1, lower is region 0 */ + if (reg == 1) + low = high; + /* The base masks off on the right alignment */ + *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *type = 0; + if (low & 1) + *type = MTRR_TYPE_UNCACHABLE; + if (low & 2) + *type = MTRR_TYPE_WRCOMB; + if (!(low & 3)) { + *size = 0; + return; + } + /* + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits + * + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks + * + * eg 111 1111 1111 1100 is 512K + * + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... + */ + low = (~low) & 0x1FFFC; + *size = (low + 4) << (15 - PAGE_SHIFT); + return; +} + +static void amd_set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +/* [SUMMARY] Set variable MTRR register on the local CPU. + <reg> The register to set (0 or 1; indexes the two halves of MSR_K6_UWCCR). + <base> The base address of the region, in pages (shifted by PAGE_SHIFT below). + <size> The size of the region. If this is 0 the region is disabled. + <type> The type of the region. + [RETURNS] Nothing. 
+*/ +{ + u32 regs[2]; + + /* + * Low is MTRR0 , High MTRR 1 + */ + rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); + /* + * Blank to disable + */ + if (size == 0) + regs[reg] = 0; + else + /* Set the register to the base, the type (off by one) and an + inverted bitmask of the size The size is the only odd + bit. We are fed say 512K We invert this and we get 111 1111 + 1111 1011 but if you subtract one and invert you get the + desired 111 1111 1111 1100 mask + + But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ + regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) + | (base << PAGE_SHIFT) | (type + 1); + + /* + * The writeback rule is quite specific. See the manual. Its + * disable local interrupts, write back the cache, set the mtrr + */ + wbinvd(); + wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); +} + +static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* Apply the K6 block alignment and size rules + In order + o Uncached or gathering only + o 128K or bigger block + o Power of 2 block + o base suitably aligned to the power + */ + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + || (size & ~(size - 1)) - size || (base & (size - 1))) + return -EINVAL; + return 0; +} + +static struct mtrr_ops amd_mtrr_ops = { + .vendor = X86_VENDOR_AMD, + .set = amd_set_mtrr, + .get = amd_get_mtrr, + .get_free_region = generic_get_free_region, + .validate_add_page = amd_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init amd_init_mtrr(void) +{ + set_mtrr_ops(&amd_mtrr_ops); + return 0; +} + +//arch_initcall(amd_mtrr_init); diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c new file mode 100644 index 000000000000..33f00ac314ef --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/centaur.c @@ -0,0 +1,223 @@ +#include <linux/init.h> +#include <linux/mm.h> +#include <asm/mtrr.h> +#include <asm/msr.h> +#include "mtrr.h" + +static struct { + unsigned long high; + unsigned long low; +} 
centaur_mcr[8]; + +static u8 centaur_mcr_reserved; +static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ + +/* + * Report boot time MCR setups + */ + +static int +centaur_get_free_region(unsigned long base, unsigned long size) +/* [SUMMARY] Get a free MTRR. + <base> The starting (base) address of the region (not consulted here). + <size> The size (in bytes) of the region (not consulted here). + [RETURNS] The index of the first MCR that is not marked in + centaur_mcr_reserved and whose size reads back as 0, + else -ENOSPC when no such MCR exists. +*/ +{ + int i, max; + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; + + max = num_var_ranges; + for (i = 0; i < max; ++i) { + /* Skip MCRs flagged as reserved in the centaur_mcr_reserved bitmask */ + if (centaur_mcr_reserved & (1 << i)) + continue; + mtrr_if->get(i, &lbase, &lsize, &ltype); + if (lsize == 0) + return i; + } + return -ENOSPC; +} + +void +mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +{ + centaur_mcr[mcr].low = lo; + centaur_mcr[mcr].high = hi; +} + +static void +centaur_get_mcr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + *base = centaur_mcr[reg].high >> PAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) + *type = MTRR_TYPE_UNCACHABLE; + if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) + *type = MTRR_TYPE_WRBACK; + if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) + *type = MTRR_TYPE_WRBACK; + +} + +static void centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long low, high; + + if (size == 0) { + /* Disable */ + high = low = 0; + } else { + high = base << PAGE_SHIFT; + if (centaur_mcr_type == 0) + low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... 
*/ + else { + if (type == MTRR_TYPE_UNCACHABLE) + low = -size << PAGE_SHIFT | 0x02; /* NC */ + else + low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + } + } + centaur_mcr[reg].high = high; + centaur_mcr[reg].low = low; + wrmsr(MSR_IDT_MCR0 + reg, low, high); +} + +#if 0 +/* + * Initialise the later (saner) Winchip MCR variant. In this version + * the BIOS can pass us the registers it has used (but not their values) + * and the control register is read/write + */ + +static void __init +centaur_mcr1_init(void) +{ + unsigned i; + u32 lo, hi; + + /* Unfortunately, MCR's are read-only, so there is no way to + * find out what the bios might have done. + */ + + rdmsr(MSR_IDT_MCR_CTRL, lo, hi); + if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ + lo &= ~0x1C0; /* clear key */ + lo |= 0x040; /* set key to 1 */ + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ + } + + centaur_mcr_type = 1; + + /* + * Clear any unconfigured MCR's. + */ + + for (i = 0; i < 8; ++i) { + if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { + if (!(lo & (1 << (9 + i)))) + wrmsr(MSR_IDT_MCR0 + i, 0, 0); + else + /* + * If the BIOS set up an MCR we cannot see it + * but we don't wish to obliterate it + */ + centaur_mcr_reserved |= (1 << i); + } + } + /* + * Throw the main write-combining switch... + * However if OOSTORE is enabled then people have already done far + * cleverer things and we should behave. + */ + + lo |= 15; /* Write combine enables */ + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); +} + +/* + * Initialise the original winchip with read only MCR registers + * no used bitmask for the BIOS to pass on and write only control + */ + +static void __init +centaur_mcr0_init(void) +{ + unsigned i; + + /* Unfortunately, MCR's are read-only, so there is no way to + * find out what the bios might have done. + */ + + /* Clear any unconfigured MCR's. + * This way we are sure that the centaur_mcr array contains the actual + * values. The disadvantage is that any BIOS tweaks are thus undone. 
+ * + */ + for (i = 0; i < 8; ++i) { + if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) + wrmsr(MSR_IDT_MCR0 + i, 0, 0); + } + + wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ +} + +/* + * Initialise Winchip series MCR registers + */ + +static void __init +centaur_mcr_init(void) +{ + struct set_mtrr_context ctxt; + + set_mtrr_prepare_save(&ctxt); + set_mtrr_cache_disable(&ctxt); + + if (boot_cpu_data.x86_model == 4) + centaur_mcr0_init(); + else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) + centaur_mcr1_init(); + + set_mtrr_done(&ctxt); +} +#endif + +static int centaur_validate_add_page(unsigned long base, + unsigned long size, unsigned int type) +{ + /* + * FIXME: Winchip2 supports uncached + */ + if (type != MTRR_TYPE_WRCOMB && + (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { + printk(KERN_WARNING + "mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" + : " is"); + return -EINVAL; + } + return 0; +} + +static struct mtrr_ops centaur_mtrr_ops = { + .vendor = X86_VENDOR_CENTAUR, +// .init = centaur_mcr_init, + .set = centaur_set_mcr, + .get = centaur_get_mcr, + .get_free_region = centaur_get_free_region, + .validate_add_page = centaur_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +/* Pass the Centaur operations table to set_mtrr_ops(); always returns 0. */ +int __init centaur_init_mtrr(void) +{ + set_mtrr_ops(&centaur_mtrr_ops); + return 0; +} + +//arch_initcall(centaur_init_mtrr); diff --git a/arch/i386/kernel/cpu/mtrr/changelog b/arch/i386/kernel/cpu/mtrr/changelog new file mode 100644 index 000000000000..af1368535955 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/changelog @@ -0,0 +1,229 @@ + ChangeLog + + Prehistory Martin Tischhäuser <martin@ikcbarka.fzk.de> + Initial register-setting code (from proform-1.0). + 19971216 Richard Gooch <rgooch@atnf.csiro.au> + Original version for /proc/mtrr interface, SMP-safe. 
+ v1.0 + 19971217 Richard Gooch <rgooch@atnf.csiro.au> + Bug fix for ioctls()'s. 
+ Added sample code in Documentation/mtrr.txt + v1.1 + 19971218 Richard Gooch <rgooch@atnf.csiro.au> + Disallow overlapping regions. + 19971219 Jens Maurer <jmaurer@menuett.rhein-main.de> + Register-setting fixups. + v1.2 + 19971222 Richard Gooch <rgooch@atnf.csiro.au> + Fixups for kernel 2.1.75. + v1.3 + 19971229 David Wragg <dpw@doc.ic.ac.uk> + Register-setting fixups and conformity with Intel conventions. + 19971229 Richard Gooch <rgooch@atnf.csiro.au> + Cosmetic changes and wrote this ChangeLog ;-) + 19980106 Richard Gooch <rgooch@atnf.csiro.au> + Fixups for kernel 2.1.78. + v1.4 + 19980119 David Wragg <dpw@doc.ic.ac.uk> + Included passive-release enable code (elsewhere in PCI setup). + v1.5 + 19980131 Richard Gooch <rgooch@atnf.csiro.au> + Replaced global kernel lock with private spinlock. + v1.6 + 19980201 Richard Gooch <rgooch@atnf.csiro.au> + Added wait for other CPUs to complete changes. + v1.7 + 19980202 Richard Gooch <rgooch@atnf.csiro.au> + Bug fix in definition of <set_mtrr> for UP. + v1.8 + 19980319 Richard Gooch <rgooch@atnf.csiro.au> + Fixups for kernel 2.1.90. + 19980323 Richard Gooch <rgooch@atnf.csiro.au> + Move SMP BIOS fixup before secondary CPUs call <calibrate_delay> + v1.9 + 19980325 Richard Gooch <rgooch@atnf.csiro.au> + Fixed test for overlapping regions: confused by adjacent regions + 19980326 Richard Gooch <rgooch@atnf.csiro.au> + Added wbinvd in <set_mtrr_prepare>. + 19980401 Richard Gooch <rgooch@atnf.csiro.au> + Bug fix for non-SMP compilation. + 19980418 David Wragg <dpw@doc.ic.ac.uk> + Fixed-MTRR synchronisation for SMP and use atomic operations + instead of spinlocks. + 19980418 Richard Gooch <rgooch@atnf.csiro.au> + Differentiate different MTRR register classes for BIOS fixup. + v1.10 + 19980419 David Wragg <dpw@doc.ic.ac.uk> + Bug fix in variable MTRR synchronisation. + v1.11 + 19980419 Richard Gooch <rgooch@atnf.csiro.au> + Fixups for kernel 2.1.97. 
+ v1.12 + 19980421 Richard Gooch <rgooch@atnf.csiro.au> + Safer synchronisation across CPUs when changing MTRRs. + v1.13 + 19980423 Richard Gooch <rgooch@atnf.csiro.au> + Bugfix for SMP systems without MTRR support. + v1.14 + 19980427 Richard Gooch <rgooch@atnf.csiro.au> + Trap calls to <mtrr_add> and <mtrr_del> on non-MTRR machines. + v1.15 + 19980427 Richard Gooch <rgooch@atnf.csiro.au> + Use atomic bitops for setting SMP change mask. + v1.16 + 19980428 Richard Gooch <rgooch@atnf.csiro.au> + Removed spurious diagnostic message. + v1.17 + 19980429 Richard Gooch <rgooch@atnf.csiro.au> + Moved register-setting macros into this file. + Moved setup code from init/main.c to i386-specific areas. + v1.18 + 19980502 Richard Gooch <rgooch@atnf.csiro.au> + Moved MTRR detection outside conditionals in <mtrr_init>. + v1.19 + 19980502 Richard Gooch <rgooch@atnf.csiro.au> + Documentation improvement: mention Pentium II and AGP. + v1.20 + 19980521 Richard Gooch <rgooch@atnf.csiro.au> + Only manipulate interrupt enable flag on local CPU. + Allow enclosed uncachable regions. + v1.21 + 19980611 Richard Gooch <rgooch@atnf.csiro.au> + Always define <main_lock>. + v1.22 + 19980901 Richard Gooch <rgooch@atnf.csiro.au> + Removed module support in order to tidy up code. + Added sanity check for <mtrr_add>/<mtrr_del> before <mtrr_init>. + Created addition queue for prior to SMP commence. + v1.23 + 19980902 Richard Gooch <rgooch@atnf.csiro.au> + Ported patch to kernel 2.1.120-pre3. + v1.24 + 19980910 Richard Gooch <rgooch@atnf.csiro.au> + Removed sanity checks and addition queue: Linus prefers an OOPS. + v1.25 + 19981001 Richard Gooch <rgooch@atnf.csiro.au> + Fixed harmless compiler warning in include/asm-i386/mtrr.h + Fixed version numbering and history for v1.23 -> v1.24. + v1.26 + 19990118 Richard Gooch <rgooch@atnf.csiro.au> + Added devfs support. + v1.27 + 19990123 Richard Gooch <rgooch@atnf.csiro.au> + Changed locking to spin with reschedule. + Made use of new <smp_call_function>. 
+ v1.28 + 19990201 Zoltán Böszörményi <zboszor@mail.externet.hu> + Extended the driver to be able to use Cyrix style ARRs. + 19990204 Richard Gooch <rgooch@atnf.csiro.au> + Restructured Cyrix support. + v1.29 + 19990204 Zoltán Böszörményi <zboszor@mail.externet.hu> + Refined ARR support: enable MAPEN in set_mtrr_prepare() + and disable MAPEN in set_mtrr_done(). + 19990205 Richard Gooch <rgooch@atnf.csiro.au> + Minor cleanups. + v1.30 + 19990208 Zoltán Böszörményi <zboszor@mail.externet.hu> + Protect plain 6x86s (and other processors without the + Page Global Enable feature) against accessing CR4 in + set_mtrr_prepare() and set_mtrr_done(). + 19990210 Richard Gooch <rgooch@atnf.csiro.au> + Turned <set_mtrr_up> and <get_mtrr> into function pointers. + v1.31 + 19990212 Zoltán Böszörményi <zboszor@mail.externet.hu> + Major rewrite of cyrix_arr_init(): do not touch ARRs, + leave them as the BIOS have set them up. + Enable usage of all 8 ARRs. + Avoid multiplications by 3 everywhere and other + code clean ups/speed ups. + 19990213 Zoltán Böszörményi <zboszor@mail.externet.hu> + Set up other Cyrix processors identical to the boot cpu. + Since Cyrix don't support Intel APIC, this is l'art pour l'art. + Weigh ARRs by size: + If size <= 32M is given, set up ARR# we were given. + If size > 32M is given, set up ARR7 only if it is free, + fail otherwise. + 19990214 Zoltán Böszörményi <zboszor@mail.externet.hu> + Also check for size >= 256K if we are to set up ARR7, + mtrr_add() returns the value it gets from set_mtrr() + 19990218 Zoltán Böszörményi <zboszor@mail.externet.hu> + Remove Cyrix "coma bug" workaround from here. + Moved to linux/arch/i386/kernel/setup.c and + linux/include/asm-i386/bugs.h + 19990228 Richard Gooch <rgooch@atnf.csiro.au> + Added MTRRIOC_KILL_ENTRY ioctl(2) + Trap for counter underflow in <mtrr_file_del>. + Trap for 4 MiB aligned regions for PPro, stepping <= 7. + 19990301 Richard Gooch <rgooch@atnf.csiro.au> + Created <get_free_region> hook. 
+ 19990305 Richard Gooch <rgooch@atnf.csiro.au> + Temporarily disable AMD support now MTRR capability flag is set. + v1.32 + 19990308 Zoltán Böszörményi <zboszor@mail.externet.hu> + Adjust my changes (19990212-19990218) to Richard Gooch's + latest changes. (19990228-19990305) + v1.33 + 19990309 Richard Gooch <rgooch@atnf.csiro.au> + Fixed typo in <printk> message. + 19990310 Richard Gooch <rgooch@atnf.csiro.au> + Support K6-II/III based on Alan Cox's <alan@redhat.com> patches. + v1.34 + 19990511 Bart Hartgers <bart@etpmod.phys.tue.nl> + Support Centaur C6 MCR's. + 19990512 Richard Gooch <rgooch@atnf.csiro.au> + Minor cleanups. + v1.35 + 19990707 Zoltán Böszörményi <zboszor@mail.externet.hu> + Check whether ARR3 is protected in cyrix_get_free_region() + and mtrr_del(). The code won't attempt to delete or change it + from now on if the BIOS protected ARR3. It silently skips ARR3 + in cyrix_get_free_region() or returns with an error code from + mtrr_del(). + 19990711 Zoltán Böszörményi <zboszor@mail.externet.hu> + Reset some bits in the CCRs in cyrix_arr_init() to disable SMM + if ARR3 isn't protected. This is needed because if SMM is active + and ARR3 isn't protected then deleting and setting ARR3 again + may lock up the processor. With SMM entirely disabled, it does + not happen. + 19990812 Zoltán Böszörményi <zboszor@mail.externet.hu> + Rearrange switch() statements so the driver accomodates to + the fact that the AMD Athlon handles its MTRRs the same way + as Intel does. + 19990814 Zoltán Böszörményi <zboszor@mail.externet.hu> + Double check for Intel in mtrr_add()'s big switch() because + that revision check is only valid for Intel CPUs. + 19990819 Alan Cox <alan@redhat.com> + Tested Zoltan's changes on a pre production Athlon - 100% + success. + 19991008 Manfred Spraul <manfreds@colorfullife.com> + replaced spin_lock_reschedule() with a normal semaphore. + v1.36 + 20000221 Richard Gooch <rgooch@atnf.csiro.au> + Compile fix if procfs and devfs not enabled. 
+ Formatting changes. + v1.37 + 20001109 H. Peter Anvin <hpa@zytor.com> + Use the new centralized CPU feature detects. + + v1.38 + 20010309 Dave Jones <davej@suse.de> + Add support for Cyrix III. + + v1.39 + 20010312 Dave Jones <davej@suse.de> + Ugh, I broke AMD support. + Reworked fix by Troels Walsted Hansen <troels@thule.no> + + v1.40 + 20010327 Dave Jones <davej@suse.de> + Adapted Cyrix III support to include VIA C3. + + v2.0 + 20020306 Patrick Mochel <mochel@osdl.org> + Split mtrr.c -> mtrr/*.c + Converted to Linux Kernel Coding Style + Fixed several minor nits in form + Moved some SMP-only functions out, so they can be used + for power management in the future. + TODO: Fix user interface cruft. diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c new file mode 100644 index 000000000000..933b0dd62f48 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -0,0 +1,364 @@ +#include <linux/init.h> +#include <linux/mm.h> +#include <asm/mtrr.h> +#include <asm/msr.h> +#include <asm/io.h> +#include "mtrr.h" + +int arr3_protected; + +static void +cyrix_get_arr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + unsigned long flags; + unsigned char arr, ccr3, rcr, shift; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + /* Save flags and disable interrupts */ + local_irq_save(flags); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ((unsigned char *) base)[3] = getCx86(arr); + ((unsigned char *) base)[2] = getCx86(arr + 1); + ((unsigned char *) base)[1] = getCx86(arr + 2); + rcr = getCx86(CX86_RCR_BASE + reg); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + /* Enable interrupts if it was enabled previously */ + local_irq_restore(flags); + shift = ((unsigned char *) base)[1] & 0x0f; + *base >>= PAGE_SHIFT; + + /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + * Note: shift==0xf means 4G, this is unsupported. 
+ */ + if (shift) + *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1); + else + *size = 0; + + /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */ + if (reg < 7) { + switch (rcr) { + case 1: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRBACK; + break; + case 9: + *type = MTRR_TYPE_WRCOMB; + break; + case 24: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } else { + switch (rcr) { + case 0: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRCOMB; + break; + case 9: + *type = MTRR_TYPE_WRBACK; + break; + case 25: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } +} + +static int +cyrix_get_free_region(unsigned long base, unsigned long size) +/* [SUMMARY] Get a free ARR. + <base> The starting (base) address of the region (not consulted here). + <size> The size of the region, in pages: the thresholds below treat + 0x2000 as 32M and 0x40 as 256K. + [RETURNS] The index of the region on success, else -ENOSPC when no + suitable ARR is free. +*/ +{ + int i; + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; + + /* If we are to set up a region >32M then look at ARR7 immediately */ + if (size > 0x2000) { + cyrix_get_arr(7, &lbase, &lsize, &ltype); + if (lsize == 0) + return 7; + /* Else try ARR0-ARR6 first */ + } else { + for (i = 0; i < 7; i++) { + cyrix_get_arr(i, &lbase, &lsize, &ltype); + if ((i == 3) && arr3_protected) + continue; + if (lsize == 0) + return i; + } + /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ + cyrix_get_arr(i, &lbase, &lsize, &ltype); + if ((lsize == 0) && (size >= 0x40)) + return i; + } + return -ENOSPC; +} + +static u32 cr4 = 0; +static u32 ccr3; + +/* + * Save CR4 and CCR3 in the file-scope statics above, clear CR4 Page Global + * Enable, disable and flush the caches and enable MAPEN. post_set() undoes + * this. Because the saved values live in statics, calls must not be nested. 
+ */ +static void prepare_set(void) +{ + u32 cr0; + + /* + * Save value of CR4 and clear Page Global Enable (bit 7). The mask has + * to be full width: the former (unsigned char) cast reduced it to 0x7f, + * which also cleared every CR4 bit above bit 7 until post_set(). + */ + if ( cpu_has_pge ) { + cr4 = read_cr4(); + write_cr4(cr4 & ~(1U << 7)); + } + + /* Disable and flush caches. 
Note that wbinvd flushes the TLBs as + a side-effect */ + cr0 = read_cr0() | 0x40000000; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Cyrix ARRs - everything else were excluded at the top */ + ccr3 = getCx86(CX86_CCR3); + + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + +} + +static void post_set(void) +{ + /* Flush caches and TLBs */ + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(cr4); +} + +static void cyrix_set_arr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned char arr, arr_type, arr_size; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */ + if (reg >= 7) + size >>= 6; + + size &= 0x7fff; /* make sure arr_size <= 14 */ + for (arr_size = 0; size; arr_size++, size >>= 1) ; + + if (reg < 7) { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 1; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 9; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 24; + break; + default: + arr_type = 8; + break; + } + } else { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 0; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 8; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 25; + break; + default: + arr_type = 9; + break; + } + } + + prepare_set(); + + base <<= PAGE_SHIFT; + setCx86(arr, ((unsigned char *) &base)[3]); + setCx86(arr + 1, ((unsigned char *) &base)[2]); + setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); + setCx86(CX86_RCR_BASE + reg, arr_type); + + post_set(); +} + +typedef struct { + unsigned long base; + unsigned int size; + mtrr_type type; +} arr_state_t; + +static arr_state_t arr_state[8] __initdata = { + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 
0UL}, + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL} +}; + +static unsigned char ccr_state[7] __initdata = { 0, 0, 0, 0, 0, 0, 0 }; + +static void cyrix_set_all(void) +{ + int i; + + prepare_set(); + + /* the CCRs are not contiguous */ + for (i = 0; i < 4; i++) + setCx86(CX86_CCR0 + i, ccr_state[i]); + for (; i < 7; i++) + setCx86(CX86_CCR4 + i, ccr_state[i]); + for (i = 0; i < 8; i++) + cyrix_set_arr(i, arr_state[i].base, + arr_state[i].size, arr_state[i].type); + + post_set(); +} + +#if 0 +/* + * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection + * with the SMM (System Management Mode) mode. So we need the following: + * Check whether SMI_LOCK (CCR3 bit 0) is set + * if it is set, write a warning message: ARR3 cannot be changed! + * (it cannot be changed until the next processor reset) + * if it is reset, then we can change it, set all the needed bits: + * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset) + * - disable access to SMM memory (CCR1 bit 2 reset) + * - disable SMM mode (CCR1 bit 1 reset) + * - disable write protection of ARR3 (CCR6 bit 1 reset) + * - (maybe) disable ARR3 + * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set) + */ +static void __init +cyrix_arr_init(void) +{ + struct set_mtrr_context ctxt; + unsigned char ccr[7]; + int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 }; +#ifdef CONFIG_SMP + int i; +#endif + + /* flush cache and enable MAPEN */ + set_mtrr_prepare_save(&ctxt); + set_mtrr_cache_disable(&ctxt); + + /* Save all CCRs locally */ + ccr[0] = getCx86(CX86_CCR0); + ccr[1] = getCx86(CX86_CCR1); + ccr[2] = getCx86(CX86_CCR2); + ccr[3] = ctxt.ccr3; + ccr[4] = getCx86(CX86_CCR4); + ccr[5] = getCx86(CX86_CCR5); + ccr[6] = getCx86(CX86_CCR6); + + if (ccr[3] & 1) { + ccrc[3] = 1; + arr3_protected = 1; + } else { + /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and + * access to SMM memory through ARR3 (bit 7). 
+ */ + if (ccr[1] & 0x80) { + ccr[1] &= 0x7f; + ccrc[1] |= 0x80; + } + if (ccr[1] & 0x04) { + ccr[1] &= 0xfb; + ccrc[1] |= 0x04; + } + if (ccr[1] & 0x02) { + ccr[1] &= 0xfd; + ccrc[1] |= 0x02; + } + arr3_protected = 0; + if (ccr[6] & 0x02) { + ccr[6] &= 0xfd; + ccrc[6] = 1; /* Disable write protection of ARR3 */ + setCx86(CX86_CCR6, ccr[6]); + } + /* Disable ARR3. This is safe now that we disabled SMM. */ + /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */ + } + /* If we changed CCR1 in memory, change it in the processor, too. */ + if (ccrc[1]) + setCx86(CX86_CCR1, ccr[1]); + + /* Enable ARR usage by the processor */ + if (!(ccr[5] & 0x20)) { + ccr[5] |= 0x20; + ccrc[5] = 1; + setCx86(CX86_CCR5, ccr[5]); + } +#ifdef CONFIG_SMP + for (i = 0; i < 7; i++) + ccr_state[i] = ccr[i]; + for (i = 0; i < 8; i++) + cyrix_get_arr(i, + &arr_state[i].base, &arr_state[i].size, + &arr_state[i].type); +#endif + + set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */ + + if (ccrc[5]) + printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n"); + if (ccrc[3]) + printk(KERN_INFO "mtrr: ARR3 cannot be changed\n"); +/* + if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n"); + if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n"); + if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n"); +*/ + if (ccrc[6]) + printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n"); +} +#endif + +static struct mtrr_ops cyrix_mtrr_ops = { + .vendor = X86_VENDOR_CYRIX, +// .init = cyrix_arr_init, + .set_all = cyrix_set_all, + .set = cyrix_set_arr, + .get = cyrix_get_arr, + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init cyrix_init_mtrr(void) +{ + set_mtrr_ops(&cyrix_mtrr_ops); + return 0; +} + +//arch_initcall(cyrix_init_mtrr); diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c new file mode 100644 index 
000000000000..a4cce454d09b --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -0,0 +1,417 @@ +/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + because MTRRs can span upto 40 bits (36bits on most modern x86) */ +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <asm/io.h> +#include <asm/mtrr.h> +#include <asm/msr.h> +#include <asm/system.h> +#include <asm/cpufeature.h> +#include <asm/tlbflush.h> +#include "mtrr.h" + +struct mtrr_state { + struct mtrr_var_range *var_ranges; + mtrr_type fixed_ranges[NUM_FIXED_RANGES]; + unsigned char enabled; + mtrr_type def_type; +}; + +static unsigned long smp_changes_mask; +static struct mtrr_state mtrr_state = {}; + +/* Get the MSR pair relating to a var range */ +static void __init +get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) +{ + rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); +} + +static void __init +get_fixed_ranges(mtrr_type * frs) +{ + unsigned int *p = (unsigned int *) frs; + int i; + + rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + + for (i = 0; i < 2; i++) + rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + for (i = 0; i < 8; i++) + rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); +} + +/* Grab all of the MTRR state for this CPU into *state */ +void __init get_mtrr_state(void) +{ + unsigned int i; + struct mtrr_var_range *vrs; + unsigned lo, dummy; + + if (!mtrr_state.var_ranges) { + mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range), + GFP_KERNEL); + if (!mtrr_state.var_ranges) + return; + } + vrs = mtrr_state.var_ranges; + + for (i = 0; i < num_var_ranges; i++) + get_mtrr_var_range(i, &vrs[i]); + get_fixed_ranges(mtrr_state.fixed_ranges); + + rdmsr(MTRRdefType_MSR, lo, dummy); + mtrr_state.def_type = (lo & 0xff); + mtrr_state.enabled = (lo & 0xc00) >> 10; +} + +/* Free resources associated with a struct mtrr_state */ 
+void __init finalize_mtrr_state(void) +{ + if (mtrr_state.var_ranges) + kfree(mtrr_state.var_ranges); + mtrr_state.var_ranges = NULL; +} + +/* Some BIOS's are fucked and don't set all MTRRs the same! */ +void __init mtrr_state_warn(void) +{ + unsigned long mask = smp_changes_mask; + + if (!mask) + return; + if (mask & MTRR_CHANGE_MASK_FIXED) + printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_VARIABLE) + printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_DEFTYPE) + printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); + printk(KERN_INFO "mtrr: corrected configuration.\n"); +} + +/* Doesn't attempt to pass an error out to MTRR users + because it's quite complicated in some cases and probably not + worth it because the best error handling is to ignore it. */ +void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) +{ + if (wrmsr_safe(msr, a, b) < 0) + printk(KERN_ERR + "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + smp_processor_id(), msr, a, b); +} + +int generic_get_free_region(unsigned long base, unsigned long size) +/* [SUMMARY] Get a free MTRR. + <base> The starting (base) address of the region. + <size> The size (in bytes) of the region. + [RETURNS] The index of the region on success, else -1 on error. +*/ +{ + int i, max; + mtrr_type ltype; + unsigned long lbase; + unsigned lsize; + + max = num_var_ranges; + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + return -ENOSPC; +} + +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + unsigned int mask_lo, mask_hi, base_lo, base_hi; + + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); + if ((mask_lo & 0x800) == 0) { + /* Invalid (i.e. 
free) range */ + *base = 0; + *size = 0; + *type = 0; + return; + } + + rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); + + /* Work out the shifted address mask. */ + mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) + | mask_lo >> PAGE_SHIFT; + + /* This works correctly if size is a power of two, i.e. a + contiguous range. */ + *size = -mask_lo; + *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *type = base_lo & 0xff; +} + +static int set_fixed_ranges(mtrr_type * frs) +{ + unsigned int *p = (unsigned int *) frs; + int changed = FALSE; + int i; + unsigned int lo, hi; + + rdmsr(MTRRfix64K_00000_MSR, lo, hi); + if (p[0] != lo || p[1] != hi) { + mtrr_wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + changed = TRUE; + } + + for (i = 0; i < 2; i++) { + rdmsr(MTRRfix16K_80000_MSR + i, lo, hi); + if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) { + mtrr_wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], + p[3 + i * 2]); + changed = TRUE; + } + } + + for (i = 0; i < 8; i++) { + rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi); + if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) { + mtrr_wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], + p[7 + i * 2]); + changed = TRUE; + } + } + return changed; +} + +/* Set the MSR pair relating to a var range. 
Returns TRUE if + changes are made */ +static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +{ + unsigned int lo, hi; + int changed = FALSE; + + rdmsr(MTRRphysBase_MSR(index), lo, hi); + if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) + || (vr->base_hi & 0xfUL) != (hi & 0xfUL)) { + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + changed = TRUE; + } + + rdmsr(MTRRphysMask_MSR(index), lo, hi); + + if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) + || (vr->mask_hi & 0xfUL) != (hi & 0xfUL)) { + mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); + changed = TRUE; + } + return changed; +} + +static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) +/* [SUMMARY] Set the MTRR state for this CPU. + <state> The MTRR state information to read. + <ctxt> Some relevant CPU context. + [NOTE] The CPU must already be in a safe state for MTRR changes. + [RETURNS] 0 if no changes made, else a mask indication what was changed. +*/ +{ + unsigned int i; + unsigned long change_mask = 0; + + for (i = 0; i < num_var_ranges; i++) + if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) + change_mask |= MTRR_CHANGE_MASK_VARIABLE; + + if (set_fixed_ranges(mtrr_state.fixed_ranges)) + change_mask |= MTRR_CHANGE_MASK_FIXED; + + /* Set_mtrr_restore restores the old value of MTRRdefType, + so to set it we fiddle with the saved value */ + if ((deftype_lo & 0xff) != mtrr_state.def_type + || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { + deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); + change_mask |= MTRR_CHANGE_MASK_DEFTYPE; + } + + return change_mask; +} + + +static unsigned long cr4 = 0; +static u32 deftype_lo, deftype_hi; +static DEFINE_SPINLOCK(set_atomicity_lock); + +/* + * Since we are disabling the cache don't allow any interrupts - they + * would run extremely slow and would only increase the pain. 
The caller must + * ensure that local interrupts are disabled and are reenabled after post_set() + * has been called. + */ + +static void prepare_set(void) +{ + unsigned long cr0; + + /* Note that this is not ideal, since the cache is only flushed/disabled + for this CPU while the MTRRs are changed, but changing this requires + more invasive changes to the way the kernel boots */ + + spin_lock(&set_atomicity_lock); + + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + cr0 = read_cr0() | 0x40000000; /* set CD flag */ + write_cr0(cr0); + wbinvd(); + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if ( cpu_has_pge ) { + cr4 = read_cr4(); + write_cr4(cr4 & ~X86_CR4_PGE); + } + + /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + __flush_tlb(); + + /* Save MTRR state */ + rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + + /* Disable MTRRs, and set the default type to uncached */ + mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); +} + +static void post_set(void) +{ + /* Flush TLBs (no need to flush caches - they are disabled) */ + __flush_tlb(); + + /* Intel (P6) standard MTRRs */ + mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(cr4); + spin_unlock(&set_atomicity_lock); +} + +static void generic_set_all(void) +{ + unsigned long mask, count; + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + /* Actually set the state */ + mask = set_mtrr_state(deftype_lo,deftype_hi); + + post_set(); + local_irq_restore(flags); + + /* Use the atomic bitops to update the global mask */ + for (count = 0; count < sizeof mask * 8; ++count) { + if (mask & 0x01) + set_bit(count, &smp_changes_mask); + mask >>= 1; + } + +} + +static void generic_set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +/* [SUMMARY] Set variable MTRR register on the local CPU. 
+ <reg> The register to set. + <base> The base address of the region. + <size> The size of the region. If this is 0 the region is disabled. + <type> The type of the region. + <do_safe> If TRUE, do the change safely. If FALSE, safety measures should + be done externally. + [RETURNS] Nothing. +*/ +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + if (size == 0) { + /* The invalid bit is kept in the mask, so we simply clear the + relevant mask register to disable a range. */ + mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); + } else { + mtrr_wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type, + (base & size_and_mask) >> (32 - PAGE_SHIFT)); + mtrr_wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800, + (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + } + + post_set(); + local_irq_restore(flags); +} + +int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + unsigned long lbase, last; + + /* For Intel PPro stepping <= 7, must be 4 MiB aligned + and not touch 0x70000000->0x7003FFFF */ + if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_mask <= 7) { + if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + return -EINVAL; + } + if (!(base + size < 0x70000000 || base > 0x7003FFFF) && + (type == MTRR_TYPE_WRCOMB + || type == MTRR_TYPE_WRBACK)) { + printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + return -EINVAL; + } + } + + if (base + size < 0x100) { + printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", + base, size); + return -EINVAL; + } + /* Check upper bits of base and last are equal and lower bits are 0 + for base and 1 for last */ + last = base + size - 1; + for (lbase = base; !(lbase & 1) && (last & 1); + lbase = lbase >> 1, last = last >> 1) ; + if (lbase != last) { + printk(KERN_WARNING "mtrr: base(0x%lx000) is 
not aligned on a size(0x%lx000) boundary\n", + base, size); + return -EINVAL; + } + return 0; +} + + +static int generic_have_wrcomb(void) +{ + unsigned long config, dummy; + rdmsr(MTRRcap_MSR, config, dummy); + return (config & (1 << 10)); +} + +int positive_have_wrcomb(void) +{ + return 1; +} + +/* generic structure... + */ +struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, +}; diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c new file mode 100644 index 000000000000..1923e0aed26a --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -0,0 +1,374 @@ +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> + +#define LINE_SIZE 80 + +#include <asm/mtrr.h> +#include "mtrr.h" + +/* RED-PEN: this is accessed without any locking */ +extern unsigned int *usage_table; + + +#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) + +static char *mtrr_strings[MTRR_NUM_TYPES] = +{ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ +}; + +char *mtrr_attrib_to_str(int x) +{ + return (x <= 6) ? 
mtrr_strings[x] : "?"; +} + +#ifdef CONFIG_PROC_FS + +static int +mtrr_file_add(unsigned long base, unsigned long size, + unsigned int type, char increment, struct file *file, int page) +{ + int reg, max; + unsigned int *fcount = FILE_FCOUNT(file); + + max = num_var_ranges; + if (fcount == NULL) { + fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); + if (!fcount) + return -ENOMEM; + memset(fcount, 0, max * sizeof *fcount); + FILE_FCOUNT(file) = fcount; + } + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_add_page(base, size, type, 1); + if (reg >= 0) + ++fcount[reg]; + return reg; +} + +static int +mtrr_file_del(unsigned long base, unsigned long size, + struct file *file, int page) +{ + int reg; + unsigned int *fcount = FILE_FCOUNT(file); + + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_del_page(-1, base, size); + if (reg < 0) + return reg; + if (fcount == NULL) + return reg; + if (fcount[reg] < 1) + return -EINVAL; + --fcount[reg]; + return reg; +} + +/* RED-PEN: seq_file can seek now. this is ignored. 
*/ +static ssize_t +mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) +/* Format of control line: + "base=%Lx size=%Lx type=%s" OR: + "disable=%d" +*/ +{ + int i, err; + unsigned long reg; + unsigned long long base, size; + char *ptr; + char line[LINE_SIZE]; + size_t linelen; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!len) + return -EINVAL; + memset(line, 0, LINE_SIZE); + if (len > LINE_SIZE) + len = LINE_SIZE; + if (copy_from_user(line, buf, len - 1)) + return -EFAULT; + linelen = strlen(line); + ptr = line + linelen - 1; + if (linelen && *ptr == '\n') + *ptr = '\0'; + if (!strncmp(line, "disable=", 8)) { + reg = simple_strtoul(line + 8, &ptr, 0); + err = mtrr_del_page(reg, 0, 0); + if (err < 0) + return err; + return len; + } + if (strncmp(line, "base=", 5)) + return -EINVAL; + base = simple_strtoull(line + 5, &ptr, 0); + for (; isspace(*ptr); ++ptr) ; + if (strncmp(ptr, "size=", 5)) + return -EINVAL; + size = simple_strtoull(ptr + 5, &ptr, 0); + if ((base & 0xfff) || (size & 0xfff)) + return -EINVAL; + for (; isspace(*ptr); ++ptr) ; + if (strncmp(ptr, "type=", 5)) + return -EINVAL; + ptr += 5; + for (; isspace(*ptr); ++ptr) ; + for (i = 0; i < MTRR_NUM_TYPES; ++i) { + if (strcmp(ptr, mtrr_strings[i])) + continue; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + err = + mtrr_add_page((unsigned long) base, (unsigned long) size, i, + 1); + if (err < 0) + return err; + return len; + } + return -EINVAL; +} + +static int +mtrr_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long __arg) +{ + int err; + mtrr_type type; + struct mtrr_sentry sentry; + struct mtrr_gentry gentry; + void __user *arg = (void __user *) __arg; + + switch (cmd) { + default: + return -ENOTTY; + case MTRRIOC_ADD_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + file, 0); + if (err < 0) + 
return err; + break; + case MTRRIOC_SET_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); + if (err < 0) + return err; + break; + case MTRRIOC_DEL_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = mtrr_file_del(sentry.base, sentry.size, file, 0); + if (err < 0) + return err; + break; + case MTRRIOC_KILL_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = mtrr_del(-1, sentry.base, sentry.size); + if (err < 0) + return err; + break; + case MTRRIOC_GET_ENTRY: + if (copy_from_user(&gentry, arg, sizeof gentry)) + return -EFAULT; + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); + + /* Hide entries that go above 4GB */ + if (gentry.base + gentry.size > 0x100000 + || gentry.size == 0x100000) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base <<= PAGE_SHIFT; + gentry.size <<= PAGE_SHIFT; + gentry.type = type; + } + + if (copy_to_user(arg, &gentry, sizeof gentry)) + return -EFAULT; + break; + case MTRRIOC_ADD_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + file, 1); + if (err < 0) + return err; + break; + case MTRRIOC_SET_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); + if (err < 0) + return err; + break; + case MTRRIOC_DEL_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = mtrr_file_del(sentry.base, sentry.size, file, 
1); + if (err < 0) + return err; + break; + case MTRRIOC_KILL_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + err = mtrr_del_page(-1, sentry.base, sentry.size); + if (err < 0) + return err; + break; + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_from_user(&gentry, arg, sizeof gentry)) + return -EFAULT; + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); + gentry.type = type; + + if (copy_to_user(arg, &gentry, sizeof gentry)) + return -EFAULT; + break; + } + return 0; +} + +static int +mtrr_close(struct inode *ino, struct file *file) +{ + int i, max; + unsigned int *fcount = FILE_FCOUNT(file); + + if (fcount != NULL) { + max = num_var_ranges; + for (i = 0; i < max; ++i) { + while (fcount[i] > 0) { + mtrr_del(i, 0, 0); + --fcount[i]; + } + } + kfree(fcount); + FILE_FCOUNT(file) = NULL; + } + return single_release(ino, file); +} + +static int mtrr_seq_show(struct seq_file *seq, void *offset); + +static int mtrr_open(struct inode *inode, struct file *file) +{ + if (!mtrr_if) + return -EIO; + if (!mtrr_if->get) + return -ENXIO; + return single_open(file, mtrr_seq_show, NULL); +} + +static struct file_operations mtrr_fops = { + .owner = THIS_MODULE, + .open = mtrr_open, + .read = seq_read, + .llseek = seq_lseek, + .write = mtrr_write, + .ioctl = mtrr_ioctl, + .release = mtrr_close, +}; + + +static struct proc_dir_entry *proc_root_mtrr; + + +static int mtrr_seq_show(struct seq_file *seq, void *offset) +{ + char factor; + int i, max, len; + mtrr_type type; + unsigned long base; + unsigned int size; + + len = 0; + max = num_var_ranges; + for (i = 0; i < max; i++) { + mtrr_if->get(i, &base, &size, &type); + if (size == 0) + usage_table[i] = 0; + else { + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* RED-PEN: 
base can be > 32bit */ + len += seq_printf(seq, + "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", + i, base, base >> (20 - PAGE_SHIFT), size, factor, + mtrr_attrib_to_str(type), usage_table[i]); + } + } + return 0; +} + +static int __init mtrr_if_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + proc_root_mtrr = + create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); + if (proc_root_mtrr) { + proc_root_mtrr->owner = THIS_MODULE; + proc_root_mtrr->proc_fops = &mtrr_fops; + } + return 0; +} + +arch_initcall(mtrr_if_init); +#endif /* CONFIG_PROC_FS */ diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c new file mode 100644 index 000000000000..8f67b490a7fd --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -0,0 +1,693 @@ +/* Generic MTRR (Memory Type Range Register) driver. + + Copyright (C) 1997-2000 Richard Gooch + Copyright (c) 2002 Patrick Mochel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. 
+ + Source: "Pentium Pro Family Developer's Manual, Volume 3: + Operating System Writer's Guide" (Intel document number 242692), + section 11.11.7 + + This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: + System Programming Guide; Section 9.11. (1997 edition - PPro). +*/ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <asm/mtrr.h> + +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include "mtrr.h" + +#define MTRR_VERSION "2.0 (20020519)" + +u32 num_var_ranges = 0; + +unsigned int *usage_table; +static DECLARE_MUTEX(main_lock); + +u32 size_or_mask, size_and_mask; + +static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; + +struct mtrr_ops * mtrr_if = NULL; + +static void set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + +extern int arr3_protected; + +void set_mtrr_ops(struct mtrr_ops * ops) +{ + if (ops->vendor && ops->vendor < X86_VENDOR_NUM) + mtrr_ops[ops->vendor] = ops; +} + +/* Returns non-zero if we have the write-combining memory type */ +static int have_wrcomb(void) +{ + struct pci_dev *dev; + + if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { + /* ServerWorks LE chipsets have problems with write-combining + Don't allow it and leave room for other chipsets to be tagged */ + if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && + dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { + printk(KERN_INFO "mtrr: Serverworks LE detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + /* Intel 450NX errata # 23. Non ascending cachline evictions to + write combining memory may resulting in data corruption */ + if (dev->vendor == PCI_VENDOR_ID_INTEL && + dev->device == PCI_DEVICE_ID_INTEL_82451NX) { + printk(KERN_INFO "mtrr: Intel 450NX MMC detected. 
Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + pci_dev_put(dev); + } + return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); +} + +/* This function returns the number of variable MTRRs */ +static void __init set_num_var_ranges(void) +{ + unsigned long config = 0, dummy; + + if (use_intel()) { + rdmsr(MTRRcap_MSR, config, dummy); + } else if (is_cpu(AMD)) + config = 2; + else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) + config = 8; + num_var_ranges = config & 0xff; +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) + == NULL) { + printk(KERN_ERR "mtrr: could not allocate\n"); + return; + } + for (i = 0; i < max; i++) + usage_table[i] = 1; +} + +struct set_mtrr_data { + atomic_t count; + atomic_t gate; + unsigned long smp_base; + unsigned long smp_size; + unsigned int smp_reg; + mtrr_type smp_type; +}; + +#ifdef CONFIG_SMP + +static void ipi_handler(void *info) +/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. + [RETURNS] Nothing. +*/ +{ + struct set_mtrr_data *data = info; + unsigned long flags; + + local_irq_save(flags); + + atomic_dec(&data->count); + while(!atomic_read(&data->gate)) + cpu_relax(); + + /* The master has cleared me to execute */ + if (data->smp_reg != ~0U) + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); + else + mtrr_if->set_all(); + + atomic_dec(&data->count); + while(atomic_read(&data->gate)) + cpu_relax(); + + atomic_dec(&data->count); + local_irq_restore(flags); +} + +#endif + +/** + * set_mtrr - update mtrrs on all processors + * @reg: mtrr in question + * @base: mtrr base + * @size: mtrr size + * @type: mtrr type + * + * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: + * + * 1. Send IPI to do the following: + * 2. Disable Interrupts + * 3. Wait for all procs to do so + * 4. Enter no-fill cache mode + * 5. Flush caches + * 6. 
Clear PGE bit + * 7. Flush all TLBs + * 8. Disable all range registers + * 9. Update the MTRRs + * 10. Enable all range registers + * 11. Flush all TLBs and caches again + * 12. Enter normal cache mode and reenable caching + * 13. Set PGE + * 14. Wait for buddies to catch up + * 15. Enable interrupts. + * + * What does that mean for us? Well, first we set data.count to the number + * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait + * until it hits 0 and proceed. We set the data.gate flag and reset data.count. + * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it + * differently, so we call mtrr_if->set() callback and let them take care of it. + * When they're done, they again decrement data->count and wait for data.gate to + * be reset. + * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. + * Everyone then enables interrupts and we all continue on. + * + * Note that the mechanism is the same for UP systems, too; all the SMP stuff + * becomes nops. + */ +static void set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data; + unsigned long flags; + + data.smp_reg = reg; + data.smp_base = base; + data.smp_size = size; + data.smp_type = type; + atomic_set(&data.count, num_booting_cpus() - 1); + atomic_set(&data.gate,0); + + /* Start the ball rolling on other CPUs */ + if (smp_call_function(ipi_handler, &data, 1, 0) != 0) + panic("mtrr: timed out waiting for other CPUs\n"); + + local_irq_save(flags); + + while(atomic_read(&data.count)) + cpu_relax(); + + /* ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + atomic_set(&data.gate,1); + + /* do our MTRR business */ + + /* HACK! + * We use this same function to initialize the mtrrs on boot. 
+ * The state of the boot cpu's mtrrs has been saved, and we want + * to replicate across all the APs. + * If we're doing that @reg is set to something special... + */ + if (reg != ~0U) + mtrr_if->set(reg,base,size,type); + + /* wait for the others */ + while(atomic_read(&data.count)) + cpu_relax(); + + atomic_set(&data.count, num_booting_cpus() - 1); + atomic_set(&data.gate,0); + + /* + * Wait here for everyone to have seen the gate change + * So we're the last ones to touch 'data' + */ + while(atomic_read(&data.count)) + cpu_relax(); + + local_irq_restore(flags); +} + +/** + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (4 KB) + * @size: Physical size of region in pages (4 KB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. 
+ */ + +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, char increment) +{ + int i; + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; + int error; + + if (!mtrr_if) + return -ENXIO; + + if ((error = mtrr_if->validate_add_page(base,size,type))) + return error; + + if (type >= MTRR_NUM_TYPES) { + printk(KERN_WARNING "mtrr: type: %u invalid\n", type); + return -EINVAL; + } + + /* If the type is WC, check that this processor supports it */ + if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { + printk(KERN_WARNING + "mtrr: your processor doesn't support write-combining\n"); + return -ENOSYS; + } + + if (base & size_or_mask || size & size_or_mask) { + printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); + return -EINVAL; + } + + error = -EINVAL; + + /* Search for existing MTRR */ + down(&main_lock); + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (base >= lbase + lsize) + continue; + if ((base < lbase) && (base + size <= lbase)) + continue; + /* At this point we know there is some kind of overlap/enclosure */ + if ((base < lbase) || (base + size > lbase + lsize)) { + printk(KERN_WARNING + "mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%x000\n", base, size, lbase, + lsize); + goto out; + } + /* New region is enclosed by an existing region */ + if (ltype != type) { + if (type == MTRR_TYPE_UNCACHABLE) + continue; + printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); + goto out; + } + if (increment) + ++usage_table[i]; + error = i; + goto out; + } + /* Search for an empty MTRR */ + i = mtrr_if->get_free_region(base, size); + if (i >= 0) { + set_mtrr(i, base, size, type); + usage_table[i] = 1; + } else + printk(KERN_INFO "mtrr: no more MTRRs available\n"); + error = i; + out: + up(&main_lock); + return error; +} + +/** + * mtrr_add - Add a memory type region + * @base: 
Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ + +int +mtrr_add(unsigned long base, unsigned long size, unsigned int type, + char increment) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n"); + printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +/** + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. 
This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ + +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + int i, max; + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; + int error = -EINVAL; + + if (!mtrr_if) + return -ENXIO; + + max = num_var_ranges; + down(&main_lock); + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, + size); + goto out; + } + } + if (reg >= max) { + printk(KERN_WARNING "mtrr: register: %d too big\n", reg); + goto out; + } + if (is_cpu(CYRIX) && !use_intel()) { + if ((reg == 3) && arr3_protected) { + printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n"); + goto out; + } + } + mtrr_if->get(reg, &lbase, &lsize, <ype); + if (lsize < 1) { + printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); + goto out; + } + if (usage_table[reg] < 1) { + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + goto out; + } + if (--usage_table[reg] < 1) + set_mtrr(reg, 0, 0, 0); + error = reg; + out: + up(&main_lock); + return error; +} +/** + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. 
+ */ + +int +mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n"); + printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +EXPORT_SYMBOL(mtrr_add); +EXPORT_SYMBOL(mtrr_del); + +/* HACK ALERT! + * These should be called implicitly, but we can't yet until all the initcall + * stuff is done... + */ +extern void amd_init_mtrr(void); +extern void cyrix_init_mtrr(void); +extern void centaur_init_mtrr(void); + +static void __init init_ifs(void) +{ + amd_init_mtrr(); + cyrix_init_mtrr(); + centaur_init_mtrr(); +} + +static void __init init_other_cpus(void) +{ + if (use_intel()) + get_mtrr_state(); + + /* bring up the other processors */ + set_mtrr(~0U,0,0,0); + + if (use_intel()) { + finalize_mtrr_state(); + mtrr_state_warn(); + } +} + + +struct mtrr_value { + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; +}; + +static struct mtrr_value * mtrr_state; + +static int mtrr_save(struct sys_device * sysdev, u32 state) +{ + int i; + int size = num_var_ranges * sizeof(struct mtrr_value); + + mtrr_state = kmalloc(size,GFP_ATOMIC); + if (mtrr_state) + memset(mtrr_state,0,size); + else + return -ENOMEM; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, + &mtrr_state[i].lbase, + &mtrr_state[i].lsize, + &mtrr_state[i].ltype); + } + return 0; +} + +static int mtrr_restore(struct sys_device * sysdev) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + if (mtrr_state[i].lsize) + set_mtrr(i, + mtrr_state[i].lbase, + mtrr_state[i].lsize, + mtrr_state[i].ltype); + } + kfree(mtrr_state); + return 0; +} + + + +static struct sysdev_driver mtrr_sysdev_driver = { + .suspend = mtrr_save, + .resume = mtrr_restore, +}; + + +/** + * mtrr_init - initialize mtrrs on the boot CPU + * + * This needs to be called early; before 
any of the other CPUs are + * initialized (i.e. before smp_init()). + * + */ +static int __init mtrr_init(void) +{ + init_ifs(); + + if (cpu_has_mtrr) { + mtrr_if = &generic_mtrr_ops; + size_or_mask = 0xff000000; /* 36 bits */ + size_and_mask = 0x00f00000; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* The original Athlon docs said that + total addressable memory is 44 bits wide. + It was not really clear whether its MTRRs + follow this or not. (Read: 44 or 36 bits). + However, "x86-64_overview.pdf" explicitly + states that "previous implementations support + 36 bit MTRRs" and also provides a way to + query the width (in bits) of the physical + addressable memory on the Hammer family. + */ + if (boot_cpu_data.x86 == 15 + && (cpuid_eax(0x80000000) >= 0x80000008)) { + u32 phys_addr; + phys_addr = cpuid_eax(0x80000008) & 0xff; + size_or_mask = + ~((1 << (phys_addr - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfff00000; + } + /* Athlon MTRRs use an Intel-compatible interface for + * getting and setting */ + break; + case X86_VENDOR_CENTAUR: + if (boot_cpu_data.x86 == 6) { + /* VIA Cyrix family have Intel style MTRRs, but don't support PAE */ + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + + default: + break; + } + } else { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (cpu_has_k6_mtrr) { + /* Pre-Athlon (K6) AMD CPU MTRRs */ + mtrr_if = mtrr_ops[X86_VENDOR_AMD]; + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + case X86_VENDOR_CENTAUR: + if (cpu_has_centaur_mcr) { + mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + case X86_VENDOR_CYRIX: + if (cpu_has_cyrix_arr) { + mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + default: + break; + } + } + printk(KERN_INFO "mtrr: v%s\n",MTRR_VERSION); + + if (mtrr_if) { + 
set_num_var_ranges(); + init_table(); + init_other_cpus(); + + return sysdev_driver_register(&cpu_sysdev_class, + &mtrr_sysdev_driver); + } + return -ENXIO; +} + +subsys_initcall(mtrr_init); diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h new file mode 100644 index 000000000000..de1351245599 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -0,0 +1,98 @@ +/* + * local mtrr defines. + */ + +#ifndef TRUE +#define TRUE 1 +#define FALSE 0 +#endif + +#define MTRRcap_MSR 0x0fe +#define MTRRdefType_MSR 0x2ff + +#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) +#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) + +#define NUM_FIXED_RANGES 88 +#define MTRRfix64K_00000_MSR 0x250 +#define MTRRfix16K_80000_MSR 0x258 +#define MTRRfix16K_A0000_MSR 0x259 +#define MTRRfix4K_C0000_MSR 0x268 +#define MTRRfix4K_C8000_MSR 0x269 +#define MTRRfix4K_D0000_MSR 0x26a +#define MTRRfix4K_D8000_MSR 0x26b +#define MTRRfix4K_E0000_MSR 0x26c +#define MTRRfix4K_E8000_MSR 0x26d +#define MTRRfix4K_F0000_MSR 0x26e +#define MTRRfix4K_F8000_MSR 0x26f + +#define MTRR_CHANGE_MASK_FIXED 0x01 +#define MTRR_CHANGE_MASK_VARIABLE 0x02 +#define MTRR_CHANGE_MASK_DEFTYPE 0x04 + +/* In the Intel processor's MTRR interface, the MTRR type is always held in + an 8 bit field: */ +typedef u8 mtrr_type; + +struct mtrr_ops { + u32 vendor; + u32 use_intel_if; +// void (*init)(void); + void (*set)(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + void (*set_all)(void); + + void (*get)(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type); + int (*get_free_region) (unsigned long base, unsigned long size); + + int (*validate_add_page)(unsigned long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); +}; + +extern int generic_get_free_region(unsigned long base, unsigned long size); +extern int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type); + +extern struct mtrr_ops 
generic_mtrr_ops; + +extern int positive_have_wrcomb(void); + +/* library functions for processor-specific routines */ +struct set_mtrr_context { + unsigned long flags; + unsigned long deftype_lo; + unsigned long deftype_hi; + unsigned long cr4val; + unsigned long ccr3; +}; + +struct mtrr_var_range { + unsigned long base_lo; + unsigned long base_hi; + unsigned long mask_lo; + unsigned long mask_hi; +}; + +void set_mtrr_done(struct set_mtrr_context *ctxt); +void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); +void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); + +void get_mtrr_state(void); + +extern void set_mtrr_ops(struct mtrr_ops * ops); + +extern u32 size_or_mask, size_and_mask; +extern struct mtrr_ops * mtrr_if; + +#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) +#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) + +extern unsigned int num_var_ranges; + +void finalize_mtrr_state(void); +void mtrr_state_warn(void); +char *mtrr_attrib_to_str(int x); +void mtrr_wrmsr(unsigned, unsigned, unsigned); + diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c new file mode 100644 index 000000000000..f62ecd15811a --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/state.c @@ -0,0 +1,78 @@ +#include <linux/mm.h> +#include <linux/init.h> +#include <asm/io.h> +#include <asm/mtrr.h> +#include <asm/msr.h> +#include "mtrr.h" + + +/* Put the processor into a state where MTRRs can be safely set */ +void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) +{ + unsigned int cr0; + + /* Disable interrupts locally */ + local_irq_save(ctxt->flags); + + if (use_intel() || is_cpu(CYRIX)) { + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if ( cpu_has_pge ) { + ctxt->cr4val = read_cr4(); + write_cr4(ctxt->cr4val & (unsigned char) ~(1 << 7)); + } + + /* Disable and flush caches. 
Note that wbinvd flushes the TLBs as + a side-effect */ + cr0 = read_cr0() | 0x40000000; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + if (use_intel()) + /* Save MTRR state */ + rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + else + /* Cyrix ARRs - everything else were excluded at the top */ + ctxt->ccr3 = getCx86(CX86_CCR3); + } +} + +void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) +{ + if (use_intel()) + /* Disable MTRRs, and set the default type to uncached */ + mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + ctxt->deftype_hi); + else if (is_cpu(CYRIX)) + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); +} + +/* Restore the processor after a set_mtrr_prepare */ +void set_mtrr_done(struct set_mtrr_context *ctxt) +{ + if (use_intel() || is_cpu(CYRIX)) { + + /* Flush caches and TLBs */ + wbinvd(); + + /* Restore MTRRdefType */ + if (use_intel()) + /* Intel (P6) standard MTRRs */ + mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + else + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ctxt->ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(ctxt->cr4val); + } + /* Re-enable interrupts locally (if enabled previously) */ + local_irq_restore(ctxt->flags); +} + diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c new file mode 100644 index 000000000000..30898a260a5c --- /dev/null +++ b/arch/i386/kernel/cpu/nexgen.c @@ -0,0 +1,63 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <asm/processor.h> + +#include "cpu.h" + +/* + * Detect a NexGen CPU running without BIOS hypercode new enough + * to have CPUID. 
(Thanks to Herbert Oppmann) + */ + +static int __init deep_magic_nexgen_probe(void) +{ + int ret; + + __asm__ __volatile__ ( + " movw $0x5555, %%ax\n" + " xorw %%dx,%%dx\n" + " movw $2, %%cx\n" + " divw %%cx\n" + " movl $0, %%eax\n" + " jnz 1f\n" + " movl $1, %%eax\n" + "1:\n" + : "=a" (ret) : : "cx", "dx" ); + return ret; +} + +static void __init init_nexgen(struct cpuinfo_x86 * c) +{ + c->x86_cache_size = 256; /* A few had 1 MB... */ +} + +static void __init nexgen_identify(struct cpuinfo_x86 * c) +{ + /* Detect NexGen with old hypercode */ + if ( deep_magic_nexgen_probe() ) { + strcpy(c->x86_vendor_id, "NexGenDriven"); + } + generic_identify(c); +} + +static struct cpu_dev nexgen_cpu_dev __initdata = { + .c_vendor = "Nexgen", + .c_ident = { "NexGenDriven" }, + .c_models = { + { .vendor = X86_VENDOR_NEXGEN, + .family = 5, + .model_names = { [1] = "Nx586" } + }, + }, + .c_init = init_nexgen, + .c_identify = nexgen_identify, +}; + +int __init nexgen_init_cpu(void) +{ + cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; + return 0; +} + +//early_arch_initcall(nexgen_init_cpu); diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c new file mode 100644 index 000000000000..c8d83fdc237a --- /dev/null +++ b/arch/i386/kernel/cpu/proc.c @@ -0,0 +1,149 @@ +#include <linux/smp.h> +#include <linux/timex.h> +#include <linux/string.h> +#include <asm/semaphore.h> +#include <linux/seq_file.h> + +/* + * Get CPU information for use by the procfs. + */ +static int show_cpuinfo(struct seq_file *m, void *v) +{ + /* + * These flag bits must match the definitions in <asm/cpufeature.h>. + * NULL means this bit is undefined or reserved; either way it doesn't + * have meaning as far as Linux is concerned. Note that it's important + * to realize there is a difference between this table and CPUID -- if + * applications want to get the raw CPUID data, they should access + * /dev/cpu/<cpu_nr>/cpuid instead. 
+ */ + static char *x86_cap_flags[] = { + /* Intel-defined */ + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", + + /* AMD-defined */ + "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, + NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow", + + /* Transmeta-defined */ + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Other (Linux-defined) */ + "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Intel-defined (#2) */ + "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* VIA/Cyrix/Centaur-defined */ + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* AMD-defined (#2) */ + "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + }; + struct cpuinfo_x86 *c = v; + int i, n = c - cpu_data; + int fpu_exception; + +#ifdef CONFIG_SMP + if (!cpu_online(n)) + return 0; +#endif + seq_printf(m, "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: 
%d\n" + "model\t\t: %d\n" + "model name\t: %s\n", + n, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + c->x86_model, + c->x86_model_id[0] ? c->x86_model_id : "unknown"); + + if (c->x86_mask || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_mask); + else + seq_printf(m, "stepping\t: unknown\n"); + + if ( cpu_has(c, X86_FEATURE_TSC) ) { + seq_printf(m, "cpu MHz\t\t: %lu.%03lu\n", + cpu_khz / 1000, (cpu_khz % 1000)); + } + + /* Cache size */ + if (c->x86_cache_size >= 0) + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); +#ifdef CONFIG_X86_HT + seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]); + seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings); +#endif + + /* We use exception 16 if we have hardware math and we've either seen it or the CPU claims it is internal */ + fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); + seq_printf(m, "fdiv_bug\t: %s\n" + "hlt_bug\t\t: %s\n" + "f00f_bug\t: %s\n" + "coma_bug\t: %s\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "wp\t\t: %s\n" + "flags\t\t:", + c->fdiv_bug ? "yes" : "no", + c->hlt_works_ok ? "no" : "yes", + c->f00f_bug ? "yes" : "no", + c->coma_bug ? "yes" : "no", + c->hard_math ? "yes" : "no", + fpu_exception ? "yes" : "no", + c->cpuid_level, + c->wp_works_ok ? "yes" : "no"); + + for ( i = 0 ; i < 32*NCAPINTS ; i++ ) + if ( test_bit(i, c->x86_capability) && + x86_cap_flags[i] != NULL ) + seq_printf(m, " %s", x86_cap_flags[i]); + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", + c->loops_per_jiffy/(500000/HZ), + (c->loops_per_jiffy/(5000/HZ)) % 100); + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + return *pos < NR_CPUS ? 
cpu_data + *pos : NULL; +} +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_start(m, pos); +} +static void c_stop(struct seq_file *m, void *v) +{ +} +struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c new file mode 100644 index 000000000000..8602425628ca --- /dev/null +++ b/arch/i386/kernel/cpu/rise.c @@ -0,0 +1,53 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <asm/processor.h> + +#include "cpu.h" + +static void __init init_rise(struct cpuinfo_x86 *c) +{ + printk("CPU: Rise iDragon"); + if (c->x86_model > 2) + printk(" II"); + printk("\n"); + + /* Unhide possibly hidden capability flags + The mp6 iDragon family don't have MSRs. + We switch on extra features with this cpuid weirdness: */ + __asm__ ( + "movl $0x6363452a, %%eax\n\t" + "movl $0x3231206c, %%ecx\n\t" + "movl $0x2a32313a, %%edx\n\t" + "cpuid\n\t" + "movl $0x63634523, %%eax\n\t" + "movl $0x32315f6c, %%ecx\n\t" + "movl $0x2333313a, %%edx\n\t" + "cpuid\n\t" : : : "eax", "ebx", "ecx", "edx" + ); + set_bit(X86_FEATURE_CX8, c->x86_capability); +} + +static struct cpu_dev rise_cpu_dev __initdata = { + .c_vendor = "Rise", + .c_ident = { "RiseRiseRise" }, + .c_models = { + { .vendor = X86_VENDOR_RISE, .family = 5, .model_names = + { + [0] = "iDragon", + [2] = "iDragon", + [8] = "iDragon II", + [9] = "iDragon II" + } + }, + }, + .c_init = init_rise, +}; + +int __init rise_init_cpu(void) +{ + cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev; + return 0; +} + +//early_arch_initcall(rise_init_cpu); diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c new file mode 100644 index 000000000000..f57e5ee94943 --- /dev/null +++ b/arch/i386/kernel/cpu/transmeta.c @@ -0,0 +1,107 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/processor.h> +#include <asm/msr.h> 
+#include "cpu.h" + +static void __init init_transmeta(struct cpuinfo_x86 *c) +{ + unsigned int cap_mask, uk, max, dummy; + unsigned int cms_rev1, cms_rev2; + unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; + char cpu_info[65]; + + get_model_name(c); /* Same as AMD/Cyrix */ + display_cacheinfo(c); + + /* Print CMS and CPU revision */ + max = cpuid_eax(0x80860000); + cpu_rev = 0; + if ( max >= 0x80860001 ) { + cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); + if (cpu_rev != 0x02000000) { + printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + (cpu_rev >> 24) & 0xff, + (cpu_rev >> 16) & 0xff, + (cpu_rev >> 8) & 0xff, + cpu_rev & 0xff, + cpu_freq); + } + } + if ( max >= 0x80860002 ) { + cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); + if (cpu_rev == 0x02000000) { + printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n", + new_cpu_rev, cpu_freq); + } + printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + (cms_rev1 >> 24) & 0xff, + (cms_rev1 >> 16) & 0xff, + (cms_rev1 >> 8) & 0xff, + cms_rev1 & 0xff, + cms_rev2); + } + if ( max >= 0x80860006 ) { + cpuid(0x80860003, + (void *)&cpu_info[0], + (void *)&cpu_info[4], + (void *)&cpu_info[8], + (void *)&cpu_info[12]); + cpuid(0x80860004, + (void *)&cpu_info[16], + (void *)&cpu_info[20], + (void *)&cpu_info[24], + (void *)&cpu_info[28]); + cpuid(0x80860005, + (void *)&cpu_info[32], + (void *)&cpu_info[36], + (void *)&cpu_info[40], + (void *)&cpu_info[44]); + cpuid(0x80860006, + (void *)&cpu_info[48], + (void *)&cpu_info[52], + (void *)&cpu_info[56], + (void *)&cpu_info[60]); + cpu_info[64] = '\0'; + printk(KERN_INFO "CPU: %s\n", cpu_info); + } + + /* Unhide possibly hidden capability flags */ + rdmsr(0x80860004, cap_mask, uk); + wrmsr(0x80860004, ~0, uk); + c->x86_capability[0] = cpuid_edx(0x00000001); + wrmsr(0x80860004, cap_mask, uk); + + /* If we can run i686 user-space code, call us an i686 */ +#define USER686 
(X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) + if ( c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686 ) + c->x86 = 6; +} + +static void transmeta_identify(struct cpuinfo_x86 * c) +{ + u32 xlvl; + generic_identify(c); + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ( (xlvl & 0xffff0000) == 0x80860000 ) { + if ( xlvl >= 0x80860001 ) + c->x86_capability[2] = cpuid_edx(0x80860001); + } +} + +static struct cpu_dev transmeta_cpu_dev __initdata = { + .c_vendor = "Transmeta", + .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_init = init_transmeta, + .c_identify = transmeta_identify, +}; + +int __init transmeta_init_cpu(void) +{ + cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev; + return 0; +} + +//early_arch_initcall(transmeta_init_cpu); diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c new file mode 100644 index 000000000000..264fcad559d5 --- /dev/null +++ b/arch/i386/kernel/cpu/umc.c @@ -0,0 +1,33 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/processor.h> +#include "cpu.h" + +/* UMC chips appear to be only either 386 or 486, so no special init takes place. + */ +static void __init init_umc(struct cpuinfo_x86 * c) +{ + +} + +static struct cpu_dev umc_cpu_dev __initdata = { + .c_vendor = "UMC", + .c_ident = { "UMC UMC UMC" }, + .c_models = { + { .vendor = X86_VENDOR_UMC, .family = 4, .model_names = + { + [1] = "U5D", + [2] = "U5S", + } + }, + }, + .c_init = init_umc, +}; + +int __init umc_init_cpu(void) +{ + cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev; + return 0; +} + +//early_arch_initcall(umc_init_cpu); diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c new file mode 100644 index 000000000000..2e2756345bb2 --- /dev/null +++ b/arch/i386/kernel/cpuid.c @@ -0,0 +1,246 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2000 H. 
Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * cpuid.c + * + * x86 CPUID access device + * + * This device is accessed by lseek() to the appropriate CPUID level + * and then read in chunks of 16 bytes. A larger size means multiple + * reads of consecutive levels. + * + * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#include <linux/module.h> +#include <linux/config.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/smp.h> +#include <linux/major.h> +#include <linux/fs.h> +#include <linux/smp_lock.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/notifier.h> + +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/system.h> + +static struct class_simple *cpuid_class; + +#ifdef CONFIG_SMP + +struct cpuid_command { + int cpu; + u32 reg; + u32 *data; +}; + +static void cpuid_smp_cpuid(void *cmd_block) +{ + struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; + + if (cmd->cpu == smp_processor_id()) + cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], + &cmd->data[3]); +} + +static inline void do_cpuid(int cpu, u32 reg, u32 * data) +{ + struct cpuid_command cmd; + + preempt_disable(); + if (cpu == smp_processor_id()) { + cpuid(reg, &data[0], &data[1], &data[2], &data[3]); + } else { + cmd.cpu = cpu; + cmd.reg = reg; + cmd.data = data; + + smp_call_function(cpuid_smp_cpuid, &cmd, 
1, 1); + } + preempt_enable(); +} +#else /* ! CONFIG_SMP */ + +static inline void do_cpuid(int cpu, u32 reg, u32 * data) +{ + cpuid(reg, &data[0], &data[1], &data[2], &data[3]); +} + +#endif /* ! CONFIG_SMP */ + +static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret; + + lock_kernel(); + + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + break; + default: + ret = -EINVAL; + } + + unlock_kernel(); + return ret; +} + +static ssize_t cpuid_read(struct file *file, char __user *buf, + size_t count, loff_t * ppos) +{ + char __user *tmp = buf; + u32 data[4]; + size_t rv; + u32 reg = *ppos; + int cpu = iminor(file->f_dentry->d_inode); + + if (count % 16) + return -EINVAL; /* Invalid chunk size */ + + for (rv = 0; count; count -= 16) { + do_cpuid(cpu, reg, data); + if (copy_to_user(tmp, &data, 16)) + return -EFAULT; + tmp += 16; + *ppos = reg++; + } + + return tmp - buf; +} + +static int cpuid_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file->f_dentry->d_inode); + struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + + if (cpu >= NR_CPUS || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + if (c->cpuid_level < 0) + return -EIO; /* CPUID not supported */ + + return 0; +} + +/* + * File operations we support + */ +static struct file_operations cpuid_fops = { + .owner = THIS_MODULE, + .llseek = cpuid_seek, + .read = cpuid_read, + .open = cpuid_open, +}; + +static int cpuid_class_simple_device_add(int i) +{ + int err = 0; + struct class_device *class_err; + + class_err = class_simple_device_add(cpuid_class, MKDEV(CPUID_MAJOR, i), NULL, "cpu%d",i); + if (IS_ERR(class_err)) + err = PTR_ERR(class_err); + return err; +} + +static int __devinit cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + 
cpuid_class_simple_device_add(cpu); + break; + case CPU_DEAD: + class_simple_device_remove(MKDEV(CPUID_MAJOR, cpu)); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block cpuid_class_cpu_notifier = +{ + .notifier_call = cpuid_class_cpu_callback, +}; + +static int __init cpuid_init(void) +{ + int i, err = 0; + i = 0; + + if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { + printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", + CPUID_MAJOR); + err = -EBUSY; + goto out; + } + cpuid_class = class_simple_create(THIS_MODULE, "cpuid"); + if (IS_ERR(cpuid_class)) { + err = PTR_ERR(cpuid_class); + goto out_chrdev; + } + for_each_online_cpu(i) { + err = cpuid_class_simple_device_add(i); + if (err != 0) + goto out_class; + } + register_cpu_notifier(&cpuid_class_cpu_notifier); + + err = 0; + goto out; + +out_class: + i = 0; + for_each_online_cpu(i) { + class_simple_device_remove(MKDEV(CPUID_MAJOR, i)); + } + class_simple_destroy(cpuid_class); +out_chrdev: + unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); +out: + return err; +} + +static void __exit cpuid_exit(void) +{ + int cpu = 0; + + for_each_online_cpu(cpu) + class_simple_device_remove(MKDEV(CPUID_MAJOR, cpu)); + class_simple_destroy(cpuid_class); + unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + unregister_cpu_notifier(&cpuid_class_cpu_notifier); +} + +module_init(cpuid_init); +module_exit(cpuid_exit); + +MODULE_AUTHOR("H. 
Peter Anvin <hpa@zytor.com>"); +MODULE_DESCRIPTION("x86 generic CPUID driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c new file mode 100644 index 000000000000..6ed7e28f306c --- /dev/null +++ b/arch/i386/kernel/dmi_scan.c @@ -0,0 +1,487 @@ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/acpi.h> +#include <asm/io.h> +#include <linux/pm.h> +#include <asm/system.h> +#include <linux/dmi.h> +#include <linux/bootmem.h> + + +struct dmi_header +{ + u8 type; + u8 length; + u16 handle; +}; + +#undef DMI_DEBUG + +#ifdef DMI_DEBUG +#define dmi_printk(x) printk x +#else +#define dmi_printk(x) +#endif + +static char * __init dmi_string(struct dmi_header *dm, u8 s) +{ + u8 *bp=(u8 *)dm; + bp+=dm->length; + if(!s) + return ""; + s--; + while(s>0 && *bp) + { + bp+=strlen(bp); + bp++; + s--; + } + return bp; +} + +/* + * We have to be cautious here. 
We have seen BIOSes with DMI pointers + * pointing to completely the wrong place for example + */ + +static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dmi_header *)) +{ + u8 *buf; + struct dmi_header *dm; + u8 *data; + int i=0; + + buf = bt_ioremap(base, len); + if(buf==NULL) + return -1; + + data = buf; + + /* + * Stop when we see all the items the table claimed to have + * OR we run off the end of the table (also happens) + */ + + while(i<num && data-buf+sizeof(struct dmi_header)<=len) + { + dm=(struct dmi_header *)data; + /* + * We want to know the total length (formated area and strings) + * before decoding to make sure we won't run off the table in + * dmi_decode or dmi_string + */ + data+=dm->length; + while(data-buf<len-1 && (data[0] || data[1])) + data++; + if(data-buf<len-1) + decode(dm); + data+=2; + i++; + } + bt_iounmap(buf, len); + return 0; +} + + +inline static int __init dmi_checksum(u8 *buf) +{ + u8 sum=0; + int a; + + for(a=0; a<15; a++) + sum+=buf[a]; + return (sum==0); +} + +static int __init dmi_iterate(void (*decode)(struct dmi_header *)) +{ + u8 buf[15]; + char __iomem *p, *q; + + /* + * no iounmap() for that ioremap(); it would be a no-op, but it's + * so early in setup that sucker gets confused into doing what + * it shouldn't if we actually call it. + */ + p = ioremap(0xF0000, 0x10000); + if (p == NULL) + return -1; + for (q = p; q < p + 0x10000; q += 16) { + memcpy_fromio(buf, q, 15); + if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) + { + u16 num=buf[13]<<8|buf[12]; + u16 len=buf[7]<<8|buf[6]; + u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; + + /* + * DMI version 0.0 means that the real version is taken from + * the SMBIOS version, which we don't know at this point. 
+ */ + if(buf[14]!=0) + printk(KERN_INFO "DMI %d.%d present.\n", + buf[14]>>4, buf[14]&0x0F); + else + printk(KERN_INFO "DMI present.\n"); + dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n", + num, len)); + dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", + base)); + if(dmi_table(base,len, num, decode)==0) + return 0; + } + } + return -1; +} + +static char *dmi_ident[DMI_STRING_MAX]; + +/* + * Save a DMI string + */ + +static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) +{ + char *d = (char*)dm; + char *p = dmi_string(dm, d[string]); + if(p==NULL || *p == 0) + return; + if (dmi_ident[slot]) + return; + dmi_ident[slot] = alloc_bootmem(strlen(p)+1); + if(dmi_ident[slot]) + strcpy(dmi_ident[slot], p); + else + printk(KERN_ERR "dmi_save_ident: out of memory.\n"); +} + +/* + * Ugly compatibility crap. + */ +#define dmi_blacklist dmi_system_id +#define NO_MATCH { DMI_NONE, NULL} +#define MATCH DMI_MATCH + +/* + * Toshiba keyboard likes to repeat keys when they are not repeated. + */ + +static __init int broken_toshiba_keyboard(struct dmi_blacklist *d) +{ + printk(KERN_WARNING "Toshiba with broken keyboard detected. 
If your keyboard sometimes generates 3 keypresses instead of one, see http://davyd.ucc.asn.au/projects/toshiba/README\n"); + return 0; +} + + +#ifdef CONFIG_ACPI_SLEEP +static __init int reset_videomode_after_s3(struct dmi_blacklist *d) +{ + /* See acpi_wakeup.S */ + extern long acpi_video_flags; + acpi_video_flags |= 2; + return 0; +} +#endif + + +#ifdef CONFIG_ACPI_BOOT +extern int acpi_force; + +static __init __attribute__((unused)) int dmi_disable_acpi(struct dmi_blacklist *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: acpi off\n",d->ident); + disable_acpi(); + } else { + printk(KERN_NOTICE + "Warning: DMI blacklist says broken, but acpi forced\n"); + } + return 0; +} + +/* + * Limit ACPI to CPU enumeration for HT + */ +static __init __attribute__((unused)) int force_acpi_ht(struct dmi_blacklist *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident); + disable_acpi(); + acpi_ht = 1; + } else { + printk(KERN_NOTICE + "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); + } + return 0; +} +#endif + +#ifdef CONFIG_ACPI_PCI +static __init int disable_acpi_irq(struct dmi_blacklist *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", + d->ident); + acpi_noirq_set(); + } + return 0; +} +static __init int disable_acpi_pci(struct dmi_blacklist *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", + d->ident); + acpi_disable_pci(); + } + return 0; +} +#endif + +/* + * Process the DMI blacklists + */ + + +/* + * This will be expanded over time to force things like the APM + * interrupt mask settings according to the laptop + */ + +static __initdata struct dmi_blacklist dmi_blacklist[]={ + + { broken_toshiba_keyboard, "Toshiba Satellite 4030cdt", { /* Keyboard generates spurious repeats */ + MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), + NO_MATCH, NO_MATCH, NO_MATCH + } }, +#ifdef CONFIG_ACPI_SLEEP + { reset_videomode_after_s3, "Toshiba 
Satellite 4030cdt", { /* Reset video mode after returning from ACPI S3 sleep */ + MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), + NO_MATCH, NO_MATCH, NO_MATCH + } }, +#endif + +#ifdef CONFIG_ACPI_BOOT + /* + * If your system is blacklisted here, but you find that acpi=force + * works for you, please contact acpi-devel@sourceforge.net + */ + + /* + * Boxes that need ACPI disabled + */ + + { dmi_disable_acpi, "IBM Thinkpad", { + MATCH(DMI_BOARD_VENDOR, "IBM"), + MATCH(DMI_BOARD_NAME, "2629H1G"), + NO_MATCH, NO_MATCH }}, + + /* + * Boxes that need acpi=ht + */ + + { force_acpi_ht, "FSC Primergy T850", { + MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "DELL GX240", { + MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), + MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "HP VISUALIZE NT Workstation", { + MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), + MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "Compaq Workstation W8000", { + MATCH(DMI_SYS_VENDOR, "Compaq"), + MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "ASUS P4B266", { + MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + MATCH(DMI_BOARD_NAME, "P4B266"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "ASUS P2B-DS", { + MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + MATCH(DMI_BOARD_NAME, "P2B-DS"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "ASUS CUR-DLS", { + MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + MATCH(DMI_BOARD_NAME, "CUR-DLS"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "ABIT i440BX-W83977", { + MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), + MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "IBM Bladecenter", { + MATCH(DMI_BOARD_VENDOR, "IBM"), + MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), + NO_MATCH, NO_MATCH 
}}, + + { force_acpi_ht, "IBM eServer xSeries 360", { + MATCH(DMI_BOARD_VENDOR, "IBM"), + MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "IBM eserver xSeries 330", { + MATCH(DMI_BOARD_VENDOR, "IBM"), + MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), + NO_MATCH, NO_MATCH }}, + + { force_acpi_ht, "IBM eserver xSeries 440", { + MATCH(DMI_BOARD_VENDOR, "IBM"), + MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), + NO_MATCH, NO_MATCH }}, + +#endif // CONFIG_ACPI_BOOT + +#ifdef CONFIG_ACPI_PCI + /* + * Boxes that need ACPI PCI IRQ routing disabled + */ + + { disable_acpi_irq, "ASUS A7V", { + MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), + MATCH(DMI_BOARD_NAME, "<A7V>"), + /* newer BIOS, Revision 1011, does work */ + MATCH(DMI_BIOS_VERSION, "ASUS A7V ACPI BIOS Revision 1007"), + NO_MATCH }}, + + /* + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled + */ + { disable_acpi_pci, "ASUS PR-DLS", { /* _BBN 0 bug */ + MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + MATCH(DMI_BOARD_NAME, "PR-DLS"), + MATCH(DMI_BIOS_VERSION, "ASUS PR-DLS ACPI BIOS Revision 1010"), + MATCH(DMI_BIOS_DATE, "03/21/2003") }}, + + { disable_acpi_pci, "Acer TravelMate 36x Laptop", { + MATCH(DMI_SYS_VENDOR, "Acer"), + MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + NO_MATCH, NO_MATCH + } }, + +#endif + + { NULL, } +}; + +/* + * Process a DMI table entry. Right now all we care about are the BIOS + * and machine entries. For 2.5 we should pull the smbus controller info + * out of here. 
+ */ + +static void __init dmi_decode(struct dmi_header *dm) +{ +#ifdef DMI_DEBUG + u8 *data = (u8 *)dm; +#endif + + switch(dm->type) + { + case 0: + dmi_printk(("BIOS Vendor: %s\n", + dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); + dmi_printk(("BIOS Version: %s\n", + dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_BIOS_VERSION, 5); + dmi_printk(("BIOS Release: %s\n", + dmi_string(dm, data[8]))); + dmi_save_ident(dm, DMI_BIOS_DATE, 8); + break; + case 1: + dmi_printk(("System Vendor: %s\n", + dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_SYS_VENDOR, 4); + dmi_printk(("Product Name: %s\n", + dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); + dmi_printk(("Version: %s\n", + dmi_string(dm, data[6]))); + dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); + dmi_printk(("Serial Number: %s\n", + dmi_string(dm, data[7]))); + break; + case 2: + dmi_printk(("Board Vendor: %s\n", + dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); + dmi_printk(("Board Name: %s\n", + dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_BOARD_NAME, 5); + dmi_printk(("Board Version: %s\n", + dmi_string(dm, data[6]))); + dmi_save_ident(dm, DMI_BOARD_VERSION, 6); + break; + } +} + +void __init dmi_scan_machine(void) +{ + int err = dmi_iterate(dmi_decode); + if(err == 0) + dmi_check_system(dmi_blacklist); + else + printk(KERN_INFO "DMI not present.\n"); +} + + +/** + * dmi_check_system - check system DMI data + * @list: array of dmi_system_id structures to match against + * + * Walk the blacklist table running matching functions until someone + * returns non zero or we hit the end. Callback function is called for + * each successfull match. Returns the number of matches. 
+ */ +int dmi_check_system(struct dmi_system_id *list) +{ + int i, count = 0; + struct dmi_system_id *d = list; + + while (d->ident) { + for (i = 0; i < ARRAY_SIZE(d->matches); i++) { + int s = d->matches[i].slot; + if (s == DMI_NONE) + continue; + if (dmi_ident[s] && strstr(dmi_ident[s], d->matches[i].substr)) + continue; + /* No match */ + goto fail; + } + if (d->callback && d->callback(d)) + break; + count++; +fail: d++; + } + + return count; +} + +EXPORT_SYMBOL(dmi_check_system); + +/** + * dmi_get_system_info - return DMI data value + * @field: data index (see enum dmi_filed) + * + * Returns one DMI data value, can be used to perform + * complex DMI data checks. + */ +char * dmi_get_system_info(int field) +{ + return dmi_ident[field]; +} + diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c new file mode 100644 index 000000000000..789af3e9fb1f --- /dev/null +++ b/arch/i386/kernel/doublefault.c @@ -0,0 +1,65 @@ +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/fs.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> + +#define DOUBLEFAULT_STACKSIZE (1024) +static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; +#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) + +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + 0x1000000) + +static void doublefault_fn(void) +{ + struct Xgt_desc_struct gdt_desc = {0, 0}; + unsigned long gdt, tss; + + __asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory"); + gdt = gdt_desc.address; + + printk("double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); + + if (ptr_ok(gdt)) { + gdt += GDT_ENTRY_TSS << 3; + tss = *(u16 *)(gdt+2); + tss += *(u8 *)(gdt+4) << 16; + tss += *(u8 *)(gdt+7) << 24; + printk("double fault, tss at %08lx\n", tss); + + if (ptr_ok(tss)) { + struct tss_struct *t = (struct tss_struct *)tss; + + printk("eip = 
%08lx, esp = %08lx\n", t->eip, t->esp); + + printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", + t->eax, t->ebx, t->ecx, t->edx); + printk("esi = %08lx, edi = %08lx\n", + t->esi, t->edi); + } + } + + for (;;) /* nothing */; +} + +struct tss_struct doublefault_tss __cacheline_aligned = { + .esp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .eip = (unsigned long) doublefault_fn, + .eflags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */ + .esp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + + .__cr3 = __pa(swapper_pg_dir) +}; diff --git a/arch/i386/kernel/early_printk.c b/arch/i386/kernel/early_printk.c new file mode 100644 index 000000000000..92f812ba275c --- /dev/null +++ b/arch/i386/kernel/early_printk.c @@ -0,0 +1,2 @@ + +#include "../../x86_64/kernel/early_printk.c" diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c new file mode 100644 index 000000000000..9e5e0d8bd36e --- /dev/null +++ b/arch/i386/kernel/efi.c @@ -0,0 +1,635 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: <goutham.rao@intel.com> + * Skip non-WB memory and ignore empty memory ranges. 
+ */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/time.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/efi.h> + +#include <asm/setup.h> +#include <asm/io.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> +#include <asm/tlbflush.h> + +#define EFI_DEBUG 0 +#define PFX "EFI: " + +extern efi_status_t asmlinkage efi_call_phys(void *, ...); + +struct efi efi; +EXPORT_SYMBOL(efi); +static struct efi efi_phys __initdata; +struct efi_memory_map memmap __initdata; + +/* + * We require an early boot_ioremap mapping mechanism initially + */ +extern void * boot_ioremap(unsigned long, unsigned long); + +/* + * To make EFI call EFI runtime service in physical addressing mode we need + * prelog/epilog before/after the invocation to disable interrupt, to + * claim EFI runtime service handler exclusively and to duplicate a memory in + * low memory space say 0 - 3G. + */ + +static unsigned long efi_rt_eflags; +static DEFINE_SPINLOCK(efi_rt_lock); +static pgd_t efi_bak_pg_dir_pointer[2]; + +static void efi_call_phys_prelog(void) +{ + unsigned long cr4; + unsigned long temp; + + spin_lock(&efi_rt_lock); + local_irq_save(efi_rt_eflags); + + /* + * If I don't have PSE, I should just duplicate two entries in page + * directory. If I have PSE, I just need to duplicate one entry in + * page directory. 
+ */ + __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4)); + + if (cr4 & X86_CR4_PSE) { + efi_bak_pg_dir_pointer[0].pgd = + swapper_pg_dir[pgd_index(0)].pgd; + swapper_pg_dir[0].pgd = + swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; + } else { + efi_bak_pg_dir_pointer[0].pgd = + swapper_pg_dir[pgd_index(0)].pgd; + efi_bak_pg_dir_pointer[1].pgd = + swapper_pg_dir[pgd_index(0x400000)].pgd; + swapper_pg_dir[pgd_index(0)].pgd = + swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; + temp = PAGE_OFFSET + 0x400000; + swapper_pg_dir[pgd_index(0x400000)].pgd = + swapper_pg_dir[pgd_index(temp)].pgd; + } + + /* + * After the lock is released, the original page table is restored. + */ + local_flush_tlb(); + + cpu_gdt_descr[0].address = __pa(cpu_gdt_descr[0].address); + __asm__ __volatile__("lgdt %0":"=m" + (*(struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0]))); +} + +static void efi_call_phys_epilog(void) +{ + unsigned long cr4; + + cpu_gdt_descr[0].address = + (unsigned long) __va(cpu_gdt_descr[0].address); + __asm__ __volatile__("lgdt %0":"=m"(cpu_gdt_descr)); + __asm__ __volatile__("movl %%cr4, %0":"=r"(cr4)); + + if (cr4 & X86_CR4_PSE) { + swapper_pg_dir[pgd_index(0)].pgd = + efi_bak_pg_dir_pointer[0].pgd; + } else { + swapper_pg_dir[pgd_index(0)].pgd = + efi_bak_pg_dir_pointer[0].pgd; + swapper_pg_dir[pgd_index(0x400000)].pgd = + efi_bak_pg_dir_pointer[1].pgd; + } + + /* + * After the lock is released, the original page table is restored. 
+ */ + local_flush_tlb(); + + local_irq_restore(efi_rt_eflags); + spin_unlock(&efi_rt_lock); +} + +static efi_status_t +phys_efi_set_virtual_address_map(unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); + efi_call_phys_epilog(); + return status; +} + +static efi_status_t +phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys(efi_phys.get_time, tm, tc); + efi_call_phys_epilog(); + return status; +} + +inline int efi_set_rtc_mmss(unsigned long nowtime) +{ + int real_seconds, real_minutes; + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + spin_lock(&efi_rt_lock); + status = efi.get_time(&eft, &cap); + spin_unlock(&efi_rt_lock); + if (status != EFI_SUCCESS) + panic("Ooops, efitime: can't read time!\n"); + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + + if (((abs(real_minutes - eft.minute) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + + eft.minute = real_minutes; + eft.second = real_seconds; + + if (status != EFI_SUCCESS) { + printk("Ooops: efitime: can't read time!\n"); + return -1; + } + return 0; +} +/* + * This should only be used during kernel init and before runtime + * services have been remapped, therefore, we'll need to call in physical + * mode. Note, this call isn't used later, so mark it __init. 
+ */ +inline unsigned long __init efi_get_time(void) +{ + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = phys_efi_get_time(&eft, &cap); + if (status != EFI_SUCCESS) + printk("Oops: efitime: can't read time status: 0x%lx\n",status); + + return mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); +} + +int is_available_memory(efi_memory_desc_t * md) +{ + if (!(md->attribute & EFI_MEMORY_WB)) + return 0; + + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + return 1; + } + return 0; +} + +/* + * We need to map the EFI memory map again after paging_init(). + */ +void __init efi_map_memmap(void) +{ + memmap.map = NULL; + + memmap.map = (efi_memory_desc_t *) + bt_ioremap((unsigned long) memmap.phys_map, + (memmap.nr_map * sizeof(efi_memory_desc_t))); + + if (memmap.map == NULL) + printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); +} + +#if EFI_DEBUG +static void __init print_efi_memmap(void) +{ + efi_memory_desc_t *md; + int i; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " + "range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, md->type, md->attribute, md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} +#endif /* EFI_DEBUG */ + +/* + * Walks the EFI memory map and calls CALLBACK once for each EFI + * memory descriptor that has memory that is available for kernel use. 
+ */ +void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) +{ + int prev_valid = 0; + struct range { + unsigned long start; + unsigned long end; + } prev, curr; + efi_memory_desc_t *md; + unsigned long start, end; + int i; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + + if ((md->num_pages == 0) || (!is_available_memory(md))) + continue; + + curr.start = md->phys_addr; + curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); + + if (!prev_valid) { + prev = curr; + prev_valid = 1; + } else { + if (curr.start < prev.start) + printk(KERN_INFO PFX "Unordered memory map\n"); + if (prev.end == curr.start) + prev.end = curr.end; + else { + start = + (unsigned long) (PAGE_ALIGN(prev.start)); + end = (unsigned long) (prev.end & PAGE_MASK); + if ((end > start) + && (*callback) (start, end, arg) < 0) + return; + prev = curr; + } + } + } + if (prev_valid) { + start = (unsigned long) PAGE_ALIGN(prev.start); + end = (unsigned long) (prev.end & PAGE_MASK); + if (end > start) + (*callback) (start, end, arg); + } +} + +void __init efi_init(void) +{ + efi_config_table_t *config_tables; + efi_runtime_services_t *runtime; + efi_char16_t *c16; + char vendor[100] = "unknown"; + unsigned long num_config_tables; + int i = 0; + + memset(&efi, 0, sizeof(efi) ); + memset(&efi_phys, 0, sizeof(efi_phys)); + + efi_phys.systab = EFI_SYSTAB; + memmap.phys_map = EFI_MEMMAP; + memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE; + memmap.desc_version = EFI_MEMDESC_VERSION; + + efi.systab = (efi_system_table_t *) + boot_ioremap((unsigned long) efi_phys.systab, + sizeof(efi_system_table_t)); + /* + * Verify the EFI Table + */ + if (efi.systab == NULL) + printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n"); + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + printk(KERN_ERR PFX "Woah! 
EFI system table signature incorrect\n"); + if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0) + printk(KERN_ERR PFX + "Warning: EFI system table major version mismatch: " + "got %d.%02d, expected %d.%02d\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, + EFI_SYSTEM_TABLE_REVISION >> 16, + EFI_SYSTEM_TABLE_REVISION & 0xffff); + /* + * Grab some details from the system table + */ + num_config_tables = efi.systab->nr_tables; + config_tables = (efi_config_table_t *)efi.systab->tables; + runtime = efi.systab->runtime; + + /* + * Show what we know for posterity + */ + c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2); + if (c16) { + for (i = 0; i < sizeof(vendor) && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } else + printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + + printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + /* + * Let's see what config tables the firmware passed to us. 
+ */ + config_tables = (efi_config_table_t *) + boot_ioremap((unsigned long) config_tables, + num_config_tables * sizeof(efi_config_table_t)); + + if (config_tables == NULL) + printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n"); + + for (i = 0; i < num_config_tables; i++) { + if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { + efi.mps = (void *)config_tables[i].table; + printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { + efi.acpi20 = __va(config_tables[i].table); + printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { + efi.acpi = __va(config_tables[i].table); + printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { + efi.smbios = (void *) config_tables[i].table; + printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { + efi.hcdp = (void *)config_tables[i].table; + printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) { + efi.uga = (void *)config_tables[i].table; + printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table); + } + } + printk("\n"); + + /* + * Check out the runtime services table. We need to map + * the runtime services table so that we can grab the physical + * address of several of the EFI runtime functions, needed to + * set the firmware into virtual mode. + */ + + runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long) + runtime, + sizeof(efi_runtime_services_t)); + if (runtime != NULL) { + /* + * We will only need *early* access to the following + * two EFI runtime services before set_virtual_address_map + * is invoked. 
+ */ + efi_phys.get_time = (efi_get_time_t *) runtime->get_time; + efi_phys.set_virtual_address_map = + (efi_set_virtual_address_map_t *) + runtime->set_virtual_address_map; + } else + printk(KERN_ERR PFX "Could not map the runtime service table!\n"); + + /* Map the EFI memory map for use until paging_init() */ + + memmap.map = (efi_memory_desc_t *) + boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); + + if (memmap.map == NULL) + printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); + + if (EFI_MEMDESC_SIZE != sizeof(efi_memory_desc_t)) { + printk(KERN_WARNING PFX "Warning! Kernel-defined memdesc doesn't " + "match the one from EFI!\n"); + } +#if EFI_DEBUG + print_efi_memmap(); +#endif +} + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor and update + * that memory descriptor with the virtual address obtained from ioremap(). + * This enables the runtime services to be called without having to + * thunk back into physical mode for every invocation. 
+ */ + +void __init efi_enter_virtual_mode(void) +{ + efi_memory_desc_t *md; + efi_status_t status; + int i; + + efi.systab = NULL; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + + if (md->attribute & EFI_MEMORY_RUNTIME) { + md->virt_addr = + (unsigned long)ioremap(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT); + if (!(unsigned long)md->virt_addr) { + printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", + (unsigned long)md->phys_addr); + } + + if (((unsigned long)md->phys_addr <= + (unsigned long)efi_phys.systab) && + ((unsigned long)efi_phys.systab < + md->phys_addr + + ((unsigned long)md->num_pages << + EFI_PAGE_SHIFT))) { + unsigned long addr; + + addr = md->virt_addr - md->phys_addr + + (unsigned long)efi_phys.systab; + efi.systab = (efi_system_table_t *)addr; + } + } + } + + if (!efi.systab) + BUG(); + + status = phys_efi_set_virtual_address_map( + sizeof(efi_memory_desc_t) * memmap.nr_map, + sizeof(efi_memory_desc_t), + memmap.desc_version, + memmap.phys_map); + + if (status != EFI_SUCCESS) { + printk (KERN_ALERT "You are screwed! " + "Unable to switch EFI into virtual mode " + "(status=%lx)\n", status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } + + /* + * Now that EFI is in virtual mode, update the function + * pointers in the runtime service table to the new virtual addresses. 
+ */ + + efi.get_time = (efi_get_time_t *) efi.systab->runtime->get_time; + efi.set_time = (efi_set_time_t *) efi.systab->runtime->set_time; + efi.get_wakeup_time = (efi_get_wakeup_time_t *) + efi.systab->runtime->get_wakeup_time; + efi.set_wakeup_time = (efi_set_wakeup_time_t *) + efi.systab->runtime->set_wakeup_time; + efi.get_variable = (efi_get_variable_t *) + efi.systab->runtime->get_variable; + efi.get_next_variable = (efi_get_next_variable_t *) + efi.systab->runtime->get_next_variable; + efi.set_variable = (efi_set_variable_t *) + efi.systab->runtime->set_variable; + efi.get_next_high_mono_count = (efi_get_next_high_mono_count_t *) + efi.systab->runtime->get_next_high_mono_count; + efi.reset_system = (efi_reset_system_t *) + efi.systab->runtime->reset_system; +} + +void __init +efi_initialize_iomem_resources(struct resource *code_resource, + struct resource *data_resource) +{ + struct resource *res; + efi_memory_desc_t *md; + int i; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + + if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > + 0x100000000ULL) + continue; + res = alloc_bootmem_low(sizeof(struct resource)); + switch (md->type) { + case EFI_RESERVED_TYPE: + res->name = "Reserved Memory"; + break; + case EFI_LOADER_CODE: + res->name = "Loader Code"; + break; + case EFI_LOADER_DATA: + res->name = "Loader Data"; + break; + case EFI_BOOT_SERVICES_DATA: + res->name = "BootServices Data"; + break; + case EFI_BOOT_SERVICES_CODE: + res->name = "BootServices Code"; + break; + case EFI_RUNTIME_SERVICES_CODE: + res->name = "Runtime Service Code"; + break; + case EFI_RUNTIME_SERVICES_DATA: + res->name = "Runtime Service Data"; + break; + case EFI_CONVENTIONAL_MEMORY: + res->name = "Conventional Memory"; + break; + case EFI_UNUSABLE_MEMORY: + res->name = "Unusable Memory"; + break; + case EFI_ACPI_RECLAIM_MEMORY: + res->name = "ACPI Reclaim"; + break; + case EFI_ACPI_MEMORY_NVS: + res->name = "ACPI NVS"; + break; + case EFI_MEMORY_MAPPED_IO: 
+ res->name = "Memory Mapped IO"; + break; + case EFI_MEMORY_MAPPED_IO_PORT_SPACE: + res->name = "Memory Mapped IO Port Space"; + break; + default: + res->name = "Reserved"; + break; + } + res->start = md->phys_addr; + res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1); + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res) < 0) + printk(KERN_ERR PFX "Failed to allocate res %s : 0x%lx-0x%lx\n", + res->name, res->start, res->end); + /* + * We don't know which region contains kernel data so we try + * it repeatedly and let the resource manager test it. + */ + if (md->type == EFI_CONVENTIONAL_MEMORY) { + request_resource(res, code_resource); + request_resource(res, data_resource); + } + } +} + +/* + * Convenience functions to obtain memory types and attributes + */ + +u32 efi_mem_type(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + int i; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + if ((md->phys_addr <= phys_addr) && (phys_addr < + (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) + return md->type; + } + return 0; +} + +u64 efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + int i; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + if ((md->phys_addr <= phys_addr) && (phys_addr < + (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) + return md->attribute; + } + return 0; +} diff --git a/arch/i386/kernel/efi_stub.S b/arch/i386/kernel/efi_stub.S new file mode 100644 index 000000000000..08c0312d9b6c --- /dev/null +++ b/arch/i386/kernel/efi_stub.S @@ -0,0 +1,124 @@ +/* + * EFI call stub for IA32. + * + * This stub allows us to make EFI calls in physical mode with interrupts + * turned off. + */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/page.h> +#include <asm/pgtable.h> + +/* + * efi_call_phys(void *, ...) is a function with variable parameters. 
+ * All the callers of this function assure that all the parameters are 4-bytes. + */ + +/* + * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. + * So we'd better save all of them at the beginning of this function and restore + * at the end no matter how many we use, because we cannot be sure that EFI runtime + * service functions will comply with gcc calling convention, too. + * + * NOTE(review): the stub below only uses ECX, EDX, ESP and CR0 and does not + * save any callee-saved register itself; presumably that is left to the + * callers - confirm. + */ + +.text +ENTRY(efi_call_phys) + /* + * 0. The function can only be called in Linux kernel. So CS has been + * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found + * the values of these registers are the same. And, the corresponding + * GDT entries are identical. So I will do nothing about segment reg + * and GDT, but change GDT base register in prelog and epilog. + */ + + /* + * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET. + * To make the switch from virtual mode to flat mode go smoothly, + * the mapping of lower virtual memory has been created in prelog and + * epilog. + */ + movl $1f, %edx + subl $__PAGE_OFFSET, %edx + jmp *%edx +1: + + /* + * 2. Now on the top of stack is the return + * address in the caller of efi_call_phys(), then parameter 1, + * parameter 2, ..., param n. To make things easy, we save the return + * address of efi_call_phys in a global variable. The function pointer + * (parameter 1) is popped as well, and the physical address of label 2 + * is pushed in their place as the return address for the EFI service. + */ + popl %edx + movl %edx, saved_return_addr + /* get the function pointer into ECX */ + popl %ecx + movl %ecx, efi_rt_function_ptr + movl $2f, %edx + subl $__PAGE_OFFSET, %edx + pushl %edx + + /* + * 3. Clear PG bit in %CR0. 
+ */ + movl %cr0, %edx + andl $0x7fffffff, %edx + movl %edx, %cr0 + jmp 1f +1: + + /* + * 4. Adjust stack pointer. + */ + subl $__PAGE_OFFSET, %esp + + /* + * 5. Call the physical function. + */ + jmp *%ecx + +2: + /* + * 6. After EFI runtime service returns, control will return to + * following instruction. We'd better readjust stack pointer first. + */ + addl $__PAGE_OFFSET, %esp + + /* + * 7. 
Restore PG bit + */ + movl %cr0, %edx + orl $0x80000000, %edx + movl %edx, %cr0 + jmp 1f +1: + /* + * 8. Now restore the virtual mode from flat mode by + * adding EIP with PAGE_OFFSET. + */ + movl $1f, %edx + jmp *%edx +1: + + /* + * 9. Balance the stack. And because EAX contain the return value, + * we'd better not clobber it. + */ + leal efi_rt_function_ptr, %edx + movl (%edx), %ecx + pushl %ecx + + /* + * 10. Push the saved return address onto the stack and return. + */ + leal saved_return_addr, %edx + movl (%edx), %ecx + pushl %ecx + ret +.previous + +.data +saved_return_addr: + .long 0 +efi_rt_function_ptr: + .long 0 diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S new file mode 100644 index 000000000000..1e45ff292bc9 --- /dev/null +++ b/arch/i386/kernel/entry.S @@ -0,0 +1,950 @@ +/* + * linux/arch/i386/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'ret_from_system_call': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - orig_eax + * 28(%esp) - %eip + * 2C(%esp) - %cs + * 30(%esp) - %eflags + * 34(%esp) - %oldesp + * 38(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. 
+ */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/thread_info.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/page.h> +#include <asm/desc.h> +#include "irq_vectors.h" + +#define nr_syscalls ((syscall_table_size)/4) + +EBX = 0x00 +ECX = 0x04 +EDX = 0x08 +ESI = 0x0C +EDI = 0x10 +EBP = 0x14 +EAX = 0x18 +DS = 0x1C +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C +EFLAGS = 0x30 +OLDESP = 0x34 +OLDSS = 0x38 + +CF_MASK = 0x00000001 +TF_MASK = 0x00000100 +IF_MASK = 0x00000200 +DF_MASK = 0x00000400 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 + +#ifdef CONFIG_PREEMPT +#define preempt_stop cli +#else +#define preempt_stop +#define resume_kernel restore_nocheck +#endif + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + pushl %ds; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; + +#define RESTORE_INT_REGS \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax + +#define RESTORE_REGS \ + RESTORE_INT_REGS; \ +1: popl %ds; \ +2: popl %es; \ +.section .fixup,"ax"; \ +3: movl $0,(%esp); \ + jmp 1b; \ +4: movl $0,(%esp); \ + jmp 2b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,3b; \ + .long 2b,4b; \ +.previous + + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + jmp syscall_exit + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. 
+ */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop +ret_from_intr: + GET_THREAD_INFO(%ebp) + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 3), %eax + jz resume_kernel +ENTRY(resume_userspace) + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + cli + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_nocheck +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +#endif + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(sysenter_entry) + movl TSS_sysenter_esp0(%esp),%esp +sysenter_past_esp: + sti + pushl $(__USER_DS) + pushl %ebp + pushfl + pushl $(__USER_CS) + pushl $SYSENTER_RETURN + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + pushl %eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) + cli + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx + jne syscall_exit_work +/* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx + movl OLDESP(%esp), %ecx + xorl %ebp,%ebp + sti + sysexit + + + # system call handler stub +ENTRY(system_call) + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # store the return value +syscall_exit: + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + +restore_all: + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + movb OLDSS(%esp), %ah + movb CS(%esp), %al + andl $(VM_MASK | (4 << 8) | 3), %eax + cmpl $((4 << 8) | 3), %eax + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: + RESTORE_REGS + addl $4, %esp +1: iret +.section .fixup,"ax" +iret_exc: + sti + movl $__USER_DS, %edx + movl %edx, %ds + movl %edx, %es + movl $11,%eax + call do_exit +.previous +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +ldt_ss: + larl OLDSS(%esp), %eax + jnz restore_nocheck + testl 
$0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + /* If returning to userspace with 16bit stack, + * try to fix the higher word of ESP, as the CPU + * won't restore it. + * This is an "official" bug of all the x86-compatible + * CPUs, which we can try to work around to make + * dosemu and wine happy. */ + subl $8, %esp # reserve space for switch16 pointer + cli + movl %esp, %eax + /* Set up the 16bit stack frame with switch32 pointer on top, + * and a switch16 pointer on top of the current frame. */ + call setup_x86_bogus_stack + RESTORE_REGS + lss 20+4(%esp), %esp # switch to 16bit stack +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+ jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests + testl $VM_MASK, EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp restore_all + + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + movl %eax, %esp + xorl %edx, %edx + call do_notify_resume + jmp restore_all + + # perform syscall entry tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,EAX(%esp) + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + jz work_pending + sti # could let do_syscall_trace() call + # schedule() instead + movl %esp, %eax + movl $1, %edx + call do_syscall_trace + jmp resume_userspace + + ALIGN +syscall_fault: + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + movl $-EFAULT,EAX(%esp) + jmp resume_userspace + + ALIGN +syscall_badsys: + movl $-ENOSYS,EAX(%esp) + jmp resume_userspace + +#define FIXUP_ESPFIX_STACK \ + movl %esp, %eax; \ + /* switch to 32bit stack using the pointer on top of 16bit stack */ \ + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ + /* copy data from 16bit stack to 32bit stack */ \ + call fixup_x86_bogus_stack; \ + /* put ESP to the proper location */ \ + movl %eax, %esp; +#define UNWIND_ESPFIX_STACK \ + pushl %eax; \ + movl %ss, %eax; \ + /* see if on 16bit stack */ \ + cmpw $__ESPFIX_SS, %ax; \ + jne 28f; \ + movl $__KERNEL_DS, %edx; \ + movl %edx, %ds; \ + movl %edx, %es; \ + /* switch to 32bit stack */ \ + FIXUP_ESPFIX_STACK \ +28: popl %eax; + +/* + * Build the entry stubs and pointer table with + * some assembler magic. 
+ */ +.data +ENTRY(interrupt) +.text + +vector=0 +ENTRY(irq_entries_start) +.rept NR_IRQS + ALIGN +1: pushl $vector-256 + jmp common_interrupt +.data + .long 1b +.text +vector=vector+1 +.endr + + ALIGN +common_interrupt: + SAVE_ALL + movl %esp,%eax + call do_IRQ + jmp ret_from_intr + +#define BUILD_INTERRUPT(name, nr) \ +ENTRY(name) \ + pushl $nr-256; \ + SAVE_ALL \ + movl %esp,%eax; \ + call smp_/**/name; \ + jmp ret_from_intr; + +/* The include is where all of the SMP etc. interrupts come from */ +#include "entry_arch.h" + +ENTRY(divide_error) + pushl $0 # no error code + pushl $do_divide_error + ALIGN +error_code: + pushl %ds + pushl %eax + xorl %eax, %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + pushl %es + UNWIND_ESPFIX_STACK + popl %ecx + movl ES(%esp), %edi # get the function address + movl ORIG_EAX(%esp), %edx # get the error code + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + +ENTRY(coprocessor_error) + pushl $0 + pushl $do_coprocessor_error + jmp error_code + +ENTRY(simd_coprocessor_error) + pushl $0 + pushl $do_simd_coprocessor_error + jmp error_code + +ENTRY(device_not_available) + pushl $-1 # mark this as an int + SAVE_ALL + movl %cr0, %eax + testl $0x4, %eax # EM (math emulation bit) + jne device_not_available_emulate + preempt_stop + call math_state_restore + jmp ret_from_exception +device_not_available_emulate: + pushl $0 # temporary storage for ORIG_EIP + call math_emulate + addl $4, %esp + jmp ret_from_exception + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. 
+ * + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_esp0+offset(%esp),%esp; \ + pushfl; \ + pushl $__KERNEL_CS; \ + pushl $sysenter_past_esp + +ENTRY(debug) + cmpl $sysenter_entry,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + testl %eax,%eax + jnz restore_all + jmp ret_from_exception + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_16bit_stack + cmpl $sysenter_entry,(%esp) + je nmi_stack_fixup + pushl %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. 
+ */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + jae nmi_stack_correct + cmpl $sysenter_entry,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all + +nmi_stack_fixup: + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct +nmi_debug_stack_check: + /* + * Reached when the word at 12(%esp) equals sysenter_entry. Only + * rewrite the stack when the interrupted EIP at (%esp) lies inside + * the window [debug, debug_esp_fix_insn], i.e. the instruction + * labelled debug_esp_fix_insn, which loads the kernel stack for + * the debug handler, has not completed yet. Outside that window + * the stack is used as it is. Addresses are unsigned, hence the + * unsigned jb/ja conditions. 
+ */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct +nmi_debug_stack_fixup: + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_16bit_stack: + /* create the pointer to lss back */ + pushl %ss + pushl %esp + movzwl %sp, %esp + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to 16bit stack +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +ENTRY(int3) + pushl $-1 # mark this as an int + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + testl %eax,%eax + jnz restore_all + jmp ret_from_exception + +ENTRY(overflow) + pushl $0 + pushl $do_overflow + jmp error_code + +ENTRY(bounds) + pushl $0 + pushl $do_bounds + jmp error_code + +ENTRY(invalid_op) + pushl $0 + pushl $do_invalid_op + jmp error_code + +ENTRY(coprocessor_segment_overrun) + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code + +ENTRY(invalid_TSS) + pushl $do_invalid_TSS + jmp error_code + +ENTRY(segment_not_present) + pushl $do_segment_not_present + jmp error_code + +ENTRY(stack_segment) + pushl $do_stack_segment + jmp error_code + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code + +ENTRY(alignment_check) + pushl $do_alignment_check + jmp error_code + +ENTRY(page_fault) + 
pushl $do_page_fault + jmp error_code + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + pushl $0 + pushl machine_check_vector + jmp error_code +#endif + +ENTRY(spurious_interrupt_bug) + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code + +.data +ENTRY(sys_call_table) + .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ + .long sys_exit + .long sys_fork + .long sys_read + .long sys_write + .long sys_open /* 5 */ + .long sys_close + .long sys_waitpid + .long sys_creat + .long sys_link + .long sys_unlink /* 10 */ + .long sys_execve + .long sys_chdir + .long sys_time + .long sys_mknod + .long sys_chmod /* 15 */ + .long sys_lchown16 + .long sys_ni_syscall /* old break syscall holder */ + .long sys_stat + .long sys_lseek + .long sys_getpid /* 20 */ + .long sys_mount + .long sys_oldumount + .long sys_setuid16 + .long sys_getuid16 + .long sys_stime /* 25 */ + .long sys_ptrace + .long sys_alarm + .long sys_fstat + .long sys_pause + .long sys_utime /* 30 */ + .long sys_ni_syscall /* old stty syscall holder */ + .long sys_ni_syscall /* old gtty syscall holder */ + .long sys_access + .long sys_nice + .long sys_ni_syscall /* 35 - old ftime syscall holder */ + .long sys_sync + .long sys_kill + .long sys_rename + .long sys_mkdir + .long sys_rmdir /* 40 */ + .long sys_dup + .long sys_pipe + .long sys_times + .long sys_ni_syscall /* old prof syscall holder */ + .long sys_brk /* 45 */ + .long sys_setgid16 + .long sys_getgid16 + .long sys_signal + .long sys_geteuid16 + .long sys_getegid16 /* 50 */ + .long sys_acct + .long sys_umount /* recycled never used phys() */ + .long sys_ni_syscall /* old lock syscall holder */ + .long sys_ioctl + .long sys_fcntl /* 55 */ + .long sys_ni_syscall /* old mpx syscall holder */ + .long sys_setpgid + .long sys_ni_syscall /* old ulimit syscall holder */ + .long sys_olduname + .long sys_umask /* 60 */ + .long sys_chroot + .long sys_ustat + .long sys_dup2 + .long sys_getppid + .long sys_getpgrp /* 65 */ + .long 
sys_setsid + .long sys_sigaction + .long sys_sgetmask + .long sys_ssetmask + .long sys_setreuid16 /* 70 */ + .long sys_setregid16 + .long sys_sigsuspend + .long sys_sigpending + .long sys_sethostname + .long sys_setrlimit /* 75 */ + .long sys_old_getrlimit + .long sys_getrusage + .long sys_gettimeofday + .long sys_settimeofday + .long sys_getgroups16 /* 80 */ + .long sys_setgroups16 + .long old_select + .long sys_symlink + .long sys_lstat + .long sys_readlink /* 85 */ + .long sys_uselib + .long sys_swapon + .long sys_reboot + .long old_readdir + .long old_mmap /* 90 */ + .long sys_munmap + .long sys_truncate + .long sys_ftruncate + .long sys_fchmod + .long sys_fchown16 /* 95 */ + .long sys_getpriority + .long sys_setpriority + .long sys_ni_syscall /* old profil syscall holder */ + .long sys_statfs + .long sys_fstatfs /* 100 */ + .long sys_ioperm + .long sys_socketcall + .long sys_syslog + .long sys_setitimer + .long sys_getitimer /* 105 */ + .long sys_newstat + .long sys_newlstat + .long sys_newfstat + .long sys_uname + .long sys_iopl /* 110 */ + .long sys_vhangup + .long sys_ni_syscall /* old "idle" system call */ + .long sys_vm86old + .long sys_wait4 + .long sys_swapoff /* 115 */ + .long sys_sysinfo + .long sys_ipc + .long sys_fsync + .long sys_sigreturn + .long sys_clone /* 120 */ + .long sys_setdomainname + .long sys_newuname + .long sys_modify_ldt + .long sys_adjtimex + .long sys_mprotect /* 125 */ + .long sys_sigprocmask + .long sys_ni_syscall /* old "create_module" */ + .long sys_init_module + .long sys_delete_module + .long sys_ni_syscall /* 130: old "get_kernel_syms" */ + .long sys_quotactl + .long sys_getpgid + .long sys_fchdir + .long sys_bdflush + .long sys_sysfs /* 135 */ + .long sys_personality + .long sys_ni_syscall /* reserved for afs_syscall */ + .long sys_setfsuid16 + .long sys_setfsgid16 + .long sys_llseek /* 140 */ + .long sys_getdents + .long sys_select + .long sys_flock + .long sys_msync + .long sys_readv /* 145 */ + .long sys_writev + .long 
sys_getsid + .long sys_fdatasync + .long sys_sysctl + .long sys_mlock /* 150 */ + .long sys_munlock + .long sys_mlockall + .long sys_munlockall + .long sys_sched_setparam + .long sys_sched_getparam /* 155 */ + .long sys_sched_setscheduler + .long sys_sched_getscheduler + .long sys_sched_yield + .long sys_sched_get_priority_max + .long sys_sched_get_priority_min /* 160 */ + .long sys_sched_rr_get_interval + .long sys_nanosleep + .long sys_mremap + .long sys_setresuid16 + .long sys_getresuid16 /* 165 */ + .long sys_vm86 + .long sys_ni_syscall /* Old sys_query_module */ + .long sys_poll + .long sys_nfsservctl + .long sys_setresgid16 /* 170 */ + .long sys_getresgid16 + .long sys_prctl + .long sys_rt_sigreturn + .long sys_rt_sigaction + .long sys_rt_sigprocmask /* 175 */ + .long sys_rt_sigpending + .long sys_rt_sigtimedwait + .long sys_rt_sigqueueinfo + .long sys_rt_sigsuspend + .long sys_pread64 /* 180 */ + .long sys_pwrite64 + .long sys_chown16 + .long sys_getcwd + .long sys_capget + .long sys_capset /* 185 */ + .long sys_sigaltstack + .long sys_sendfile + .long sys_ni_syscall /* reserved for streams1 */ + .long sys_ni_syscall /* reserved for streams2 */ + .long sys_vfork /* 190 */ + .long sys_getrlimit + .long sys_mmap2 + .long sys_truncate64 + .long sys_ftruncate64 + .long sys_stat64 /* 195 */ + .long sys_lstat64 + .long sys_fstat64 + .long sys_lchown + .long sys_getuid + .long sys_getgid /* 200 */ + .long sys_geteuid + .long sys_getegid + .long sys_setreuid + .long sys_setregid + .long sys_getgroups /* 205 */ + .long sys_setgroups + .long sys_fchown + .long sys_setresuid + .long sys_getresuid + .long sys_setresgid /* 210 */ + .long sys_getresgid + .long sys_chown + .long sys_setuid + .long sys_setgid + .long sys_setfsuid /* 215 */ + .long sys_setfsgid + .long sys_pivot_root + .long sys_mincore + .long sys_madvise + .long sys_getdents64 /* 220 */ + .long sys_fcntl64 + .long sys_ni_syscall /* reserved for TUX */ + .long sys_ni_syscall + .long sys_gettid + .long 
sys_readahead /* 225 */ + .long sys_setxattr + .long sys_lsetxattr + .long sys_fsetxattr + .long sys_getxattr + .long sys_lgetxattr /* 230 */ + .long sys_fgetxattr + .long sys_listxattr + .long sys_llistxattr + .long sys_flistxattr + .long sys_removexattr /* 235 */ + .long sys_lremovexattr + .long sys_fremovexattr + .long sys_tkill + .long sys_sendfile64 + .long sys_futex /* 240 */ + .long sys_sched_setaffinity + .long sys_sched_getaffinity + .long sys_set_thread_area + .long sys_get_thread_area + .long sys_io_setup /* 245 */ + .long sys_io_destroy + .long sys_io_getevents + .long sys_io_submit + .long sys_io_cancel + .long sys_fadvise64 /* 250 */ + .long sys_ni_syscall + .long sys_exit_group + .long sys_lookup_dcookie + .long sys_epoll_create + .long sys_epoll_ctl /* 255 */ + .long sys_epoll_wait + .long sys_remap_file_pages + .long sys_set_tid_address + .long sys_timer_create + .long sys_timer_settime /* 260 */ + .long sys_timer_gettime + .long sys_timer_getoverrun + .long sys_timer_delete + .long sys_clock_settime + .long sys_clock_gettime /* 265 */ + .long sys_clock_getres + .long sys_clock_nanosleep + .long sys_statfs64 + .long sys_fstatfs64 + .long sys_tgkill /* 270 */ + .long sys_utimes + .long sys_fadvise64_64 + .long sys_ni_syscall /* sys_vserver */ + .long sys_mbind + .long sys_get_mempolicy + .long sys_set_mempolicy + .long sys_mq_open + .long sys_mq_unlink + .long sys_mq_timedsend + .long sys_mq_timedreceive /* 280 */ + .long sys_mq_notify + .long sys_mq_getsetattr + .long sys_ni_syscall /* reserved for kexec */ + .long sys_waitid + .long sys_ni_syscall /* 285 */ /* available */ + .long sys_add_key + .long sys_request_key + .long sys_keyctl + +syscall_table_size=(.-sys_call_table) diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S new file mode 100644 index 000000000000..d273fd746192 --- /dev/null +++ b/arch/i386/kernel/head.S @@ -0,0 +1,521 @@ +/* + * linux/arch/i386/kernel/head.S -- the 32-bit startup code. 
+ * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Enhanced CPU detection and feature setting code by Mike Jagdis + * and Martin Mares, November 1997. + */ + +.text +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/cache.h> +#include <asm/thread_info.h> +#include <asm/asm_offsets.h> +#include <asm/setup.h> + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +/* + * This is how much memory *in addition to the memory covered up to + * and including _end* we need mapped initially. We need one bit for + * each possible page, but only in low memory, which means + * 2^32/4096/8 = 128K worst case (4G/4G split.) + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. + */ +#define INIT_MAP_BEYOND_END (128*1024) + + +/* + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, + * %esi points to the real-mode code as a 32-bit pointer. + * CS and DS must be 4 GB flat segments, but we don't depend on + * any particular GDT layout, because we load our own as soon as we + * can. + */ +ENTRY(startup_32) + +/* + * Set segments to known values. + */ + cld + lgdt boot_gdt_descr - __PAGE_OFFSET + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + +/* + * Clear BSS first so that there are no surprises... 
+ * No need to cld as DF is already clear from cld above... + */ + xorl %eax,%eax + movl $__bss_start - __PAGE_OFFSET,%edi + movl $__bss_stop - __PAGE_OFFSET,%ecx + subl %edi,%ecx + shrl $2,%ecx + rep ; stosl + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond _end. The variable + * init_pg_tables_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. + * + * Warning: don't use %esi or the stack in this code. However, %esp + * can be used as a GPR if you really need it... + */ +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $(pg0 - __PAGE_OFFSET), %edi + movl $(swapper_pg_dir - __PAGE_OFFSET), %edx + movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ +10: + leal 0x007(%edi),%ecx /* Create PDE entry */ + movl %ecx,(%edx) /* Store identity PDE entry */ + movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ + addl $4,%edx + movl $1024, %ecx +11: + stosl + addl $0x1000,%eax + loop 11b + /* End condition: we must map up to and including INIT_MAP_BEYOND_END */ + /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */ + leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp + cmpl %ebp,%eax + jb 10b + movl %edi,(init_pg_tables_end - __PAGE_OFFSET) + +#ifdef CONFIG_SMP + xorl %ebx,%ebx /* This is the boot CPU (BSP) */ + jmp 3f + +/* + * Non-boot CPU entry point; entered from trampoline.S + * We can't lgdt here, because lgdt itself uses a data segment, but + * we know the trampoline has already loaded the boot_gdt_table GDT + * for us. + */ +ENTRY(startup_32_smp) + cld + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + +/* + * New page tables may be in 4Mbyte page mode and may + * be using the global pages. + * + * NOTE! If we are on a 486 we may have no cr4 at all! 
+ * So we do not try to touch it unless we really have + * some bits in it to set. This won't work if the BSP + * implements cr4 but this AP does not -- very unlikely + * but be warned! The same applies to the pse feature + * if not equally supported. --macro + * + * NOTE! We have to correct for the fact that we're + * not yet offset PAGE_OFFSET.. + */ +#define cr4_bits mmu_cr4_features-__PAGE_OFFSET + movl cr4_bits,%edx + andl %edx,%edx + jz 6f + movl %cr4,%eax # Turn on paging options (PSE,PAE,..) + orl %edx,%eax + movl %eax,%cr4 + + btl $5, %eax # check if PAE is enabled + jnc 6f + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + cmpl $0x80000000, %eax + jbe 6f + mov $0x80000001, %eax + cpuid + /* Execute Disable bit supported? */ + btl $20, %edx + jnc 6f + + /* Setup EFER (Extended Feature Enable Register) */ + movl $0xc0000080, %ecx + rdmsr + + btsl $11, %eax + /* Make changes effective */ + wrmsr + +6: + /* This is a secondary processor (AP) */ + xorl %ebx,%ebx + incl %ebx + +3: +#endif /* CONFIG_SMP */ + +/* + * Enable paging + */ + movl $swapper_pg_dir-__PAGE_OFFSET,%eax + movl %eax,%cr3 /* set the page table pointer.. */ + movl %cr0,%eax + orl $0x80000000,%eax + movl %eax,%cr0 /* ..and set paging (PG) bit */ + ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ +1: + /* Set up the stack pointer */ + lss stack_start,%esp + +/* + * Initialize eflags. Some BIOS's leave bits like NT set. This would + * confuse the debugger if this code is traced. + * XXX - best to initialize before switching to protected mode. + */ + pushl $0 + popfl + +#ifdef CONFIG_SMP + andl %ebx,%ebx + jz 1f /* Initial CPU cleans BSS */ + jmp checkCPUtype +1: +#endif /* CONFIG_SMP */ + +/* + * start system 32-bit setup. We need to re-do some of the things done + * in 16-bit mode for the "real" operations. + */ + call setup_idt + +/* + * Copy bootup parameters out of the way. + * Note: %esi still has the pointer to the real-mode data. 
+ */ + movl $boot_params,%edi + movl $(PARAM_SIZE/4),%ecx + cld + rep + movsl + movl boot_params+NEW_CL_POINTER,%esi + andl %esi,%esi + jnz 2f # New command line protocol + cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR + jne 1f + movzwl OLD_CL_OFFSET,%esi + addl $(OLD_CL_BASE_ADDR),%esi +2: + movl $saved_command_line,%edi + movl $(COMMAND_LINE_SIZE/4),%ecx + rep + movsl +1: +checkCPUtype: + + movl $-1,X86_CPUID # -1 for no CPUID initially + +/* check if it is 486 or 386. */ +/* + * XXX - this does a lot of unnecessary setup. Alignment checks don't + * apply at our cpl of 0 and the stack ought to be aligned already, and + * we don't need to preserve eflags. + */ + + movb $3,X86 # at least 386 + pushfl # push EFLAGS + popl %eax # get EFLAGS + movl %eax,%ecx # save original EFLAGS + xorl $0x240000,%eax # flip AC and ID bits in EFLAGS + pushl %eax # copy to EFLAGS + popfl # set EFLAGS + pushfl # get new EFLAGS + popl %eax # put it in eax + xorl %ecx,%eax # change in flags + pushl %ecx # restore original EFLAGS + popfl + testl $0x40000,%eax # check if AC bit changed + je is386 + + movb $4,X86 # at least 486 + testl $0x200000,%eax # check if ID bit changed + je is486 + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + cpuid + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + orl %eax,%eax # do we have processor info as well? 
+ je is486 + + movl $1,%eax # Use the CPUID instruction to get CPU type + cpuid + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + +is486: movl $0x50022,%ecx # set AM, WP, NE and MP + jmp 2f + +is386: movl $2,%ecx # set MP +2: movl %cr0,%eax + andl $0x80000011,%eax # Save PG,PE,ET + orl %ecx,%eax + movl %eax,%cr0 + + call check_x87 + incb ready + lgdt cpu_gdt_descr + lidt idt_descr + ljmp $(__KERNEL_CS),$1f +1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ss # after changing gdt. + + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movl %eax,%ds + movl %eax,%es + + xorl %eax,%eax # Clear FS/GS and LDT + movl %eax,%fs + movl %eax,%gs + lldt %ax + cld # gcc2 wants the direction flag cleared at all times +#ifdef CONFIG_SMP + movb ready, %cl + cmpb $1,%cl + je 1f # the first CPU calls start_kernel + # all other CPUs call initialize_secondary + call initialize_secondary + jmp L6 +1: +#endif /* CONFIG_SMP */ + call start_kernel +L6: + jmp L6 # main should never return here, but + # just in case, we know what happens. + +/* + * We depend on ET to be correct. This checks for 287/387. + */ +check_x87: + movb $0,X86_HARD_MATH + clts + fninit + fstsw %ax + cmpb $0,%al + je 1f + movl %cr0,%eax /* no coprocessor: have to set bits */ + xorl $4,%eax /* set EM */ + movl %eax,%cr0 + ret + ALIGN +1: movb $1,X86_HARD_MATH + .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ + ret + +/* + * setup_idt + * + * sets up a idt with 256 entries pointing to + * ignore_int, interrupt gates. It doesn't actually load + * idt - that can be done only after paging has been enabled + * and the kernel moved to PAGE_OFFSET. Interrupts + * are enabled elsewhere, when we can be relatively + * sure everything is ok. 
+ *
+ * Warning: %esi is live across this function.
+ */
+setup_idt:
+	lea ignore_int,%edx		/* %edx = address of the default handler */
+	movl $(__KERNEL_CS << 16),%eax	/* selector in the upper 16 bits of %eax */
+	movw %dx,%ax		/* selector = 0x0010 = cs */
+	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */
+
+	lea idt_table,%edi
+	mov $256,%ecx		/* loop count: 256 entries */
+rp_sidt:
+	movl %eax,(%edi)	/* first dword of the gate */
+	movl %edx,4(%edi)	/* second dword of the gate */
+	addl $8,%edi		/* advance to the next 8-byte entry */
+	dec %ecx
+	jne rp_sidt
+	ret
+
+/* This is the default interrupt "handler" :-) */
+	ALIGN
+ignore_int:
+	cld
+	pushl %eax		/* saved here, popped in reverse order before the iret */
+	pushl %ecx
+	pushl %edx
+	pushl %es
+	pushl %ds
+	movl $(__KERNEL_DS),%eax
+	movl %eax,%ds
+	movl %eax,%es
+	pushl 16(%esp)		/* four stack words re-pushed as printk arguments */
+	pushl 24(%esp)
+	pushl 32(%esp)
+	pushl 40(%esp)
+	pushl $int_msg		/* format string is pushed last */
+	call printk
+	addl $(5*4),%esp	/* discard the five dwords pushed for printk */
+	popl %ds
+	popl %es
+	popl %edx
+	popl %ecx
+	popl %eax
+	iret
+
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)
+
+/*
+ * BSS section
+ */
+.section ".bss.page_aligned","w"
+ENTRY(swapper_pg_dir)
+	.fill 1024,4,0		/* 1024 zeroed 4-byte entries */
+ENTRY(empty_zero_page)
+	.fill 4096,1,0		/* 4096 zero bytes */
+
+/*
+ * This starts the data section.
+ */
+.data
+
+ENTRY(stack_start)
+	.long init_thread_union+THREAD_SIZE
+	.long __BOOT_DS
+
+ready:	.byte 0
+
+int_msg:
+	.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+
+/*
+ * The IDT and GDT 'descriptors' are a strange 48-bit object
+ * only used by the lidt and lgdt instructions.
They are not
+ * like usual segment descriptors - they consist of a 16-bit
+ * segment size, and 32-bit linear address value:
+ */
+
+.globl boot_gdt_descr
+.globl idt_descr
+.globl cpu_gdt_descr
+
+	ALIGN
+# early boot GDT descriptor (must use 1:1 address mapping)
+	.word 0				# 32 bit align gdt_desc.address
+boot_gdt_descr:
+	.word __BOOT_DS+7		/* limit: last byte of the __BOOT_DS entry */
+	.long boot_gdt_table - __PAGE_OFFSET
+
+	.word 0				# 32-bit align idt_desc.address
+idt_descr:
+	.word IDT_ENTRIES*8-1		# idt contains 256 entries
+	.long idt_table
+
+# boot GDT descriptor (later on used by CPU#0):
+	.word 0				# 32 bit align gdt_desc.address
+cpu_gdt_descr:
+	.word GDT_ENTRIES*8-1		/* limit: 8 bytes per entry, minus one */
+	.long cpu_gdt_table
+
+	.fill NR_CPUS-1,8,0		# space for the other GDT descriptors
+
+/*
+ * The boot_gdt_table must mirror the equivalent in setup.S and is
+ * used only for booting.
+ */
+	.align L1_CACHE_BYTES
+ENTRY(boot_gdt_table)
+	.fill GDT_ENTRY_BOOT_CS,8,0	/* zero-filled entries below the boot code segment */
+	.quad 0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
+	.quad 0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
+
+/*
+ * The Global Descriptor Table contains 32 quadwords, per-CPU.
+ */ + .align PAGE_SIZE_asm +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* 0x0b reserved */ + .quad 0x0000000000000000 /* 0x13 reserved */ + .quad 0x0000000000000000 /* 0x1b reserved */ + .quad 0x0000000000000000 /* 0x20 unused */ + .quad 0x0000000000000000 /* 0x28 unused */ + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ + .quad 0x0000000000000000 /* 0x4b reserved */ + .quad 0x0000000000000000 /* 0x53 reserved */ + .quad 0x0000000000000000 /* 0x5b reserved */ + + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ + + .quad 0x0000000000000000 /* 0x80 TSS descriptor */ + .quad 0x0000000000000000 /* 0x88 LDT descriptor */ + + /* Segments used for calling PnP BIOS */ + .quad 0x00c09a0000000000 /* 0x90 32-bit code */ + .quad 0x00809a0000000000 /* 0x98 16-bit code */ + .quad 0x0080920000000000 /* 0xa0 16-bit data */ + .quad 0x0080920000000000 /* 0xa8 16-bit data */ + .quad 0x0080920000000000 /* 0xb0 16-bit data */ + /* + * The APM segments have byte granularity and their bases + * and limits are set at run time. 
+ */ + .quad 0x00409a0000000000 /* 0xb8 APM CS code */ + .quad 0x00009a0000000000 /* 0xc0 APM CS 16 code (16 bit) */ + .quad 0x0040920000000000 /* 0xc8 APM DS data */ + + .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ + .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xe0 - unused */ + .quad 0x0000000000000000 /* 0xe8 - unused */ + .quad 0x0000000000000000 /* 0xf0 - unused */ + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c new file mode 100644 index 000000000000..14ec354bec92 --- /dev/null +++ b/arch/i386/kernel/i386_ksyms.c @@ -0,0 +1,195 @@ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/user.h> +#include <linux/elfcore.h> +#include <linux/mca.h> +#include <linux/sched.h> +#include <linux/in6.h> +#include <linux/interrupt.h> +#include <linux/smp_lock.h> +#include <linux/pm.h> +#include <linux/pci.h> +#include <linux/apm_bios.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/tty.h> +#include <linux/highmem.h> +#include <linux/time.h> + +#include <asm/semaphore.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/uaccess.h> +#include <asm/checksum.h> +#include <asm/io.h> +#include <asm/delay.h> +#include <asm/irq.h> +#include <asm/mmx.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/nmi.h> +#include <asm/ist.h> +#include <asm/kdebug.h> + +extern void dump_thread(struct pt_regs *, struct user *); +extern spinlock_t rtc_lock; + +/* This is definitely a GPL-only symbol */ +EXPORT_SYMBOL_GPL(cpu_gdt_table); + +#if defined(CONFIG_APM_MODULE) +extern void machine_real_restart(unsigned char *, int); +EXPORT_SYMBOL(machine_real_restart); +extern void default_idle(void); +EXPORT_SYMBOL(default_idle); +#endif + +#ifdef CONFIG_SMP +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( 
__read_lock_failed(rwlock_t *rw)); +#endif + +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) +extern struct drive_info_struct drive_info; +EXPORT_SYMBOL(drive_info); +#endif + +extern unsigned long cpu_khz; +extern unsigned long get_cmos_time(void); + +/* platform dependent support */ +EXPORT_SYMBOL(boot_cpu_data); +#ifdef CONFIG_DISCONTIGMEM +EXPORT_SYMBOL(node_data); +EXPORT_SYMBOL(physnode_map); +#endif +#ifdef CONFIG_X86_NUMAQ +EXPORT_SYMBOL(xquad_portio); +#endif +EXPORT_SYMBOL(dump_thread); +EXPORT_SYMBOL(dump_fpu); +EXPORT_SYMBOL_GPL(kernel_fpu_begin); +EXPORT_SYMBOL(__ioremap); +EXPORT_SYMBOL(ioremap_nocache); +EXPORT_SYMBOL(iounmap); +EXPORT_SYMBOL(kernel_thread); +EXPORT_SYMBOL(pm_idle); +EXPORT_SYMBOL(pm_power_off); +EXPORT_SYMBOL(get_cmos_time); +EXPORT_SYMBOL(cpu_khz); +EXPORT_SYMBOL(apm_info); + +EXPORT_SYMBOL(__down_failed); +EXPORT_SYMBOL(__down_failed_interruptible); +EXPORT_SYMBOL(__down_failed_trylock); +EXPORT_SYMBOL(__up_wakeup); +/* Networking helper routines. 
*/ +EXPORT_SYMBOL(csum_partial_copy_generic); +/* Delay loops */ +EXPORT_SYMBOL(__ndelay); +EXPORT_SYMBOL(__udelay); +EXPORT_SYMBOL(__delay); +EXPORT_SYMBOL(__const_udelay); + +EXPORT_SYMBOL(__get_user_1); +EXPORT_SYMBOL(__get_user_2); +EXPORT_SYMBOL(__get_user_4); + +EXPORT_SYMBOL(__put_user_1); +EXPORT_SYMBOL(__put_user_2); +EXPORT_SYMBOL(__put_user_4); +EXPORT_SYMBOL(__put_user_8); + +EXPORT_SYMBOL(strpbrk); +EXPORT_SYMBOL(strstr); + +EXPORT_SYMBOL(strncpy_from_user); +EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(__copy_from_user_ll); +EXPORT_SYMBOL(__copy_to_user_ll); +EXPORT_SYMBOL(strnlen_user); + +EXPORT_SYMBOL(dma_alloc_coherent); +EXPORT_SYMBOL(dma_free_coherent); + +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +#ifdef CONFIG_PCI_BIOS +EXPORT_SYMBOL(pcibios_set_irq_routing); +EXPORT_SYMBOL(pcibios_get_irq_routing_table); +#endif + +#ifdef CONFIG_X86_USE_3DNOW +EXPORT_SYMBOL(_mmx_memcpy); +EXPORT_SYMBOL(mmx_clear_page); +EXPORT_SYMBOL(mmx_copy_page); +#endif + +#ifdef CONFIG_X86_HT +EXPORT_SYMBOL(smp_num_siblings); +EXPORT_SYMBOL(cpu_sibling_map); +#endif + +#ifdef CONFIG_SMP +EXPORT_SYMBOL(cpu_data); +EXPORT_SYMBOL(cpu_online_map); +EXPORT_SYMBOL(cpu_callout_map); +EXPORT_SYMBOL(__write_lock_failed); +EXPORT_SYMBOL(__read_lock_failed); + +/* Global SMP stuff */ +EXPORT_SYMBOL(smp_call_function); + +/* TLB flushing */ +EXPORT_SYMBOL(flush_tlb_page); +#endif + +#ifdef CONFIG_X86_IO_APIC +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); +#endif + +#ifdef CONFIG_MCA +EXPORT_SYMBOL(machine_id); +#endif + +#ifdef CONFIG_VT +EXPORT_SYMBOL(screen_info); +#endif + +EXPORT_SYMBOL(get_wchan); + +EXPORT_SYMBOL(rtc_lock); + +EXPORT_SYMBOL_GPL(set_nmi_callback); +EXPORT_SYMBOL_GPL(unset_nmi_callback); + +#undef memcmp +extern int memcmp(const void *,const void *,__kernel_size_t); +EXPORT_SYMBOL(memcmp); + +EXPORT_SYMBOL(register_die_notifier); +#ifdef CONFIG_HAVE_DEC_LOCK 
+EXPORT_SYMBOL(_atomic_dec_and_lock); +#endif + +EXPORT_SYMBOL(__PAGE_KERNEL); + +#ifdef CONFIG_HIGHMEM +EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kunmap); +EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kmap_atomic_to_page); +#endif + +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +EXPORT_SYMBOL(ist_info); +#endif + +EXPORT_SYMBOL(csum_partial); diff --git a/arch/i386/kernel/i387.c b/arch/i386/kernel/i387.c new file mode 100644 index 000000000000..c55e037f08f7 --- /dev/null +++ b/arch/i386/kernel/i387.c @@ -0,0 +1,555 @@ +/* + * linux/arch/i386/kernel/i387.c + * + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/math_emu.h> +#include <asm/sigcontext.h> +#include <asm/user.h> +#include <asm/ptrace.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_MATH_EMULATION +#define HAVE_HWFP (boot_cpu_data.hard_math) +#else +#define HAVE_HWFP 1 +#endif + +static unsigned long mxcsr_feature_mask = 0xffffffff; + +void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + clts(); + if (cpu_has_fxsr) { + memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; + stts(); +} + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remeber the current task has used the FPU. 
+ */ +void init_fpu(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + tsk->thread.i387.fxsave.cwd = 0x37f; + if (cpu_has_xmm) + tsk->thread.i387.fxsave.mxcsr = 0x1f80; + } else { + memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct)); + tsk->thread.i387.fsave.cwd = 0xffff037fu; + tsk->thread.i387.fsave.swd = 0xffff0000u; + tsk->thread.i387.fsave.twd = 0xffffffffu; + tsk->thread.i387.fsave.fos = 0xffff0000u; + } + /* only the device not available exception or ptrace can call init_fpu */ + set_stopped_child_used_math(tsk); +} + +/* + * FPU lazy state save handling. + */ + +void kernel_fpu_begin(void) +{ + struct thread_info *thread = current_thread_info(); + + preempt_disable(); + if (thread->status & TS_USEDFPU) { + __save_init_fpu(thread->task); + return; + } + clts(); +} + +void restore_fpu( struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + asm volatile( "fxrstor %0" + : : "m" (tsk->thread.i387.fxsave) ); + } else { + asm volatile( "frstor %0" + : : "m" (tsk->thread.i387.fsave) ); + } +} + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr( unsigned short twd ) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. 
*/ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) +{ + struct _fpxreg *st = NULL; + unsigned long tos = (fxsave->swd >> 11) & 7; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000u; + int i; + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); + + for ( i = 0 ; i < 8 ; i++ ) { + if ( twd & 0x1 ) { + st = FPREG_ADDR( fxsave, (i - tos) & 7 ); + + switch ( st->exponent & 0x7fff ) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if ( !st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if ( st->significand[3] & 0x8000 ) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + +/* + * FPU state interaction. 
+ */ + +unsigned short get_fpu_cwd( struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + return tsk->thread.i387.fxsave.cwd; + } else { + return (unsigned short)tsk->thread.i387.fsave.cwd; + } +} + +unsigned short get_fpu_swd( struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + return tsk->thread.i387.fxsave.swd; + } else { + return (unsigned short)tsk->thread.i387.fsave.swd; + } +} + +#if 0 +unsigned short get_fpu_twd( struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + return tsk->thread.i387.fxsave.twd; + } else { + return (unsigned short)tsk->thread.i387.fsave.twd; + } +} +#endif /* 0 */ + +unsigned short get_fpu_mxcsr( struct task_struct *tsk ) +{ + if ( cpu_has_xmm ) { + return tsk->thread.i387.fxsave.mxcsr; + } else { + return 0x1f80; + } +} + +#if 0 + +void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ) +{ + if ( cpu_has_fxsr ) { + tsk->thread.i387.fxsave.cwd = cwd; + } else { + tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u); + } +} + +void set_fpu_swd( struct task_struct *tsk, unsigned short swd ) +{ + if ( cpu_has_fxsr ) { + tsk->thread.i387.fxsave.swd = swd; + } else { + tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u); + } +} + +void set_fpu_twd( struct task_struct *tsk, unsigned short twd ) +{ + if ( cpu_has_fxsr ) { + tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd); + } else { + tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u); + } +} + +#endif /* 0 */ + +/* + * FXSR floating point environment conversions. 
+ */ + +static int convert_fxsr_to_user( struct _fpstate __user *buf, + struct i387_fxsave_struct *fxsave ) +{ + unsigned long env[7]; + struct _fpreg __user *to; + struct _fpxreg *from; + int i; + + env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; + env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; + env[2] = twd_fxsr_to_i387(fxsave); + env[3] = fxsave->fip; + env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); + env[5] = fxsave->foo; + env[6] = fxsave->fos; + + if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) + return 1; + + to = &buf->_st[0]; + from = (struct _fpxreg *) &fxsave->st_space[0]; + for ( i = 0 ; i < 8 ; i++, to++, from++ ) { + unsigned long __user *t = (unsigned long __user *)to; + unsigned long *f = (unsigned long *)from; + + if (__put_user(*f, t) || + __put_user(*(f + 1), t + 1) || + __put_user(from->exponent, &to->exponent)) + return 1; + } + return 0; +} + +static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, + struct _fpstate __user *buf ) +{ + unsigned long env[7]; + struct _fpxreg *to; + struct _fpreg __user *from; + int i; + + if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) + return 1; + + fxsave->cwd = (unsigned short)(env[0] & 0xffff); + fxsave->swd = (unsigned short)(env[1] & 0xffff); + fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); + fxsave->fip = env[3]; + fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); + fxsave->fcs = (env[4] & 0xffff); + fxsave->foo = env[5]; + fxsave->fos = env[6]; + + to = (struct _fpxreg *) &fxsave->st_space[0]; + from = &buf->_st[0]; + for ( i = 0 ; i < 8 ; i++, to++, from++ ) { + unsigned long *t = (unsigned long *)to; + unsigned long __user *f = (unsigned long __user *)from; + + if (__get_user(*t, f) || + __get_user(*(t + 1), f + 1) || + __get_user(to->exponent, &from->exponent)) + return 1; + } + return 0; +} + +/* + * Signal frame handlers. 
+ */ + +static inline int save_i387_fsave( struct _fpstate __user *buf ) +{ + struct task_struct *tsk = current; + + unlazy_fpu( tsk ); + tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; + if ( __copy_to_user( buf, &tsk->thread.i387.fsave, + sizeof(struct i387_fsave_struct) ) ) + return -1; + return 1; +} + +static int save_i387_fxsave( struct _fpstate __user *buf ) +{ + struct task_struct *tsk = current; + int err = 0; + + unlazy_fpu( tsk ); + + if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) ) + return -1; + + err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status ); + err |= __put_user( X86_FXSR_MAGIC, &buf->magic ); + if ( err ) + return -1; + + if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct) ) ) + return -1; + return 1; +} + +int save_i387( struct _fpstate __user *buf ) +{ + if ( !used_math() ) + return 0; + + /* This will cause a "finit" to be triggered by the next + * attempted FPU operation by the 'current' process. + */ + clear_used_math(); + + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + return save_i387_fxsave( buf ); + } else { + return save_i387_fsave( buf ); + } + } else { + return save_i387_soft( ¤t->thread.i387.soft, buf ); + } +} + +static inline int restore_i387_fsave( struct _fpstate __user *buf ) +{ + struct task_struct *tsk = current; + clear_fpu( tsk ); + return __copy_from_user( &tsk->thread.i387.fsave, buf, + sizeof(struct i387_fsave_struct) ); +} + +static int restore_i387_fxsave( struct _fpstate __user *buf ) +{ + int err; + struct task_struct *tsk = current; + clear_fpu( tsk ); + err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0], + sizeof(struct i387_fxsave_struct) ); + /* mxcsr reserved bits must be masked to zero for security reasons */ + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + return err ? 
1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf ); +} + +int restore_i387( struct _fpstate __user *buf ) +{ + int err; + + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + err = restore_i387_fxsave( buf ); + } else { + err = restore_i387_fsave( buf ); + } + } else { + err = restore_i387_soft( ¤t->thread.i387.soft, buf ); + } + set_used_math(); + return err; +} + +/* + * ptrace request handlers. + */ + +static inline int get_fpregs_fsave( struct user_i387_struct __user *buf, + struct task_struct *tsk ) +{ + return __copy_to_user( buf, &tsk->thread.i387.fsave, + sizeof(struct user_i387_struct) ); +} + +static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf, + struct task_struct *tsk ) +{ + return convert_fxsr_to_user( (struct _fpstate __user *)buf, + &tsk->thread.i387.fxsave ); +} + +int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk ) +{ + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + return get_fpregs_fxsave( buf, tsk ); + } else { + return get_fpregs_fsave( buf, tsk ); + } + } else { + return save_i387_soft( &tsk->thread.i387.soft, + (struct _fpstate __user *)buf ); + } +} + +static inline int set_fpregs_fsave( struct task_struct *tsk, + struct user_i387_struct __user *buf ) +{ + return __copy_from_user( &tsk->thread.i387.fsave, buf, + sizeof(struct user_i387_struct) ); +} + +static inline int set_fpregs_fxsave( struct task_struct *tsk, + struct user_i387_struct __user *buf ) +{ + return convert_fxsr_from_user( &tsk->thread.i387.fxsave, + (struct _fpstate __user *)buf ); +} + +int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf ) +{ + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + return set_fpregs_fxsave( tsk, buf ); + } else { + return set_fpregs_fsave( tsk, buf ); + } + } else { + return restore_i387_soft( &tsk->thread.i387.soft, + (struct _fpstate __user *)buf ); + } +} + +int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + if 
(__copy_to_user( buf, &tsk->thread.i387.fxsave, + sizeof(struct user_fxsr_struct) )) + return -EFAULT; + return 0; + } else { + return -EIO; + } +} + +int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf ) +{ + int ret = 0; + + if ( cpu_has_fxsr ) { + if (__copy_from_user( &tsk->thread.i387.fxsave, buf, + sizeof(struct user_fxsr_struct) )) + ret = -EFAULT; + /* mxcsr reserved bits must be masked to zero for security reasons */ + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + } else { + ret = -EIO; + } + return ret; +} + +/* + * FPU state for core dumps. + */ + +static inline void copy_fpu_fsave( struct task_struct *tsk, + struct user_i387_struct *fpu ) +{ + memcpy( fpu, &tsk->thread.i387.fsave, + sizeof(struct user_i387_struct) ); +} + +static inline void copy_fpu_fxsave( struct task_struct *tsk, + struct user_i387_struct *fpu ) +{ + unsigned short *to; + unsigned short *from; + int i; + + memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) ); + + to = (unsigned short *)&fpu->st_space[0]; + from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0]; + for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { + memcpy( to, from, 5 * sizeof(unsigned short) ); + } +} + +int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) +{ + int fpvalid; + struct task_struct *tsk = current; + + fpvalid = !!used_math(); + if ( fpvalid ) { + unlazy_fpu( tsk ); + if ( cpu_has_fxsr ) { + copy_fpu_fxsave( tsk, fpu ); + } else { + copy_fpu_fsave( tsk, fpu ); + } + } + + return fpvalid; +} + +int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) +{ + int fpvalid = !!tsk_used_math(tsk); + + if (fpvalid) { + if (tsk == current) + unlazy_fpu(tsk); + if (cpu_has_fxsr) + copy_fpu_fxsave(tsk, fpu); + else + copy_fpu_fsave(tsk, fpu); + } + return fpvalid; +} + +int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu) +{ + int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr; + + if (fpvalid) { + if (tsk == 
current) + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu)); + } + return fpvalid; +} diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c new file mode 100644 index 000000000000..560bef1afb3b --- /dev/null +++ b/arch/i386/kernel/i8259.c @@ -0,0 +1,429 @@ +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/bitops.h> + +#include <asm/8253pit.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/timer.h> +#include <asm/pgtable.h> +#include <asm/delay.h> +#include <asm/desc.h> +#include <asm/apic.h> +#include <asm/arch_hooks.h> +#include <asm/i8259.h> + +#include <linux/irq.h> + +#include <io_ports.h> + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. 
+ * this file should become arch/i386/kernel/irq.c when the old irq.c + * moves to arch independent land + */ + +DEFINE_SPINLOCK(i8259A_lock); + +static void end_8259A_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && + irq_desc[irq].action) + enable_8259A_irq(irq); +} + +#define shutdown_8259A_irq disable_8259A_irq + +static void mask_and_ack_8259A(unsigned int); + +unsigned int startup_8259A_irq(unsigned int irq) +{ + enable_8259A_irq(irq); + return 0; /* never anything pending */ +} + +static struct hw_interrupt_type i8259A_irq_type = { + .typename = "XT-PIC", + .startup = startup_8259A_irq, + .shutdown = shutdown_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, + .ack = mask_and_ack_8259A, + .end = end_8259A_irq, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. 
+ */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<<irq; + unsigned long flags; + int ret; + + spin_lock_irqsave(&i8259A_lock, flags); + if (irq < 8) + ret = inb(PIC_MASTER_CMD) & mask; + else + ret = inb(PIC_SLAVE_CMD) & (mask >> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<<irq); + irq_desc[irq].handler = &i8259A_irq_type; + enable_irq(irq); +} + +/* + * This function assumes to be called rarely. Switching between + * 8259A registers is slow. + * This has to be protected by the irq controller spinlock + * before being called. + */ +static inline int i8259A_irq_real(unsigned int irq) +{ + int value; + int irqmask = 1<<irq; + + if (irq < 8) { + outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + value = inb(PIC_MASTER_CMD) & irqmask; + outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ + return value; + } + outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! 
The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecesserily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. 
[once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(0); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, u32 state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + set_kset_name("i8259"), + .suspend = i8259A_suspend, + .resume = i8259A_resume, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_p - this has to work on a wide range of PC hardware. 
+ */ + outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ + if (auto_eoi) /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ + if (auto_eoi) + /* + * in AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_irq_type.ack = disable_8259A_irq; + else + i8259A_irq_type.ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it also + * leads to races. IBM designers who came up with it should + * be shot.
+ */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id, struct pt_regs *regs) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)regs->eip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + irq_desc[i].handler = &i8259A_irq_type; + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].handler = &no_irq_type; + } + } +} + +void __init init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * Set the clock to HZ Hz, we already have a valid + * vector now: + */ + setup_pit_timer(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. 
+ */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c new file mode 100644 index 000000000000..9caa8e8db80c --- /dev/null +++ b/arch/i386/kernel/init_task.c @@ -0,0 +1,46 @@ +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/fs.h> +#include <linux/mqueue.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/desc.h> + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +struct mm_struct init_mm = INIT_MM(init_mm); + +EXPORT_SYMBOL(init_mm); + +/* + * Initial thread structure. + * + * We need to make sure that this is THREAD_SIZE aligned due to the + * way process stacks are handled. This is done by having a special + * "init_task" linker map entry.. + */ +union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +/* + * Initial task structure. + * + * All other task structs will be allocated on slabs in fork.c + */ +struct task_struct init_task = INIT_TASK(init_task); + +EXPORT_SYMBOL(init_task); + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. + */ +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS; + diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c new file mode 100644 index 000000000000..9c1350e811d0 --- /dev/null +++ b/arch/i386/kernel/io_apic.c @@ -0,0 +1,2545 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. 
+ * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, + * further tested and cleaned up by Zach Brown <zab@redhat.com> + * and Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/config.h> +#include <linux/smp_lock.h> +#include <linux/mc146818rtc.h> +#include <linux/compiler.h> +#include <linux/acpi.h> + +#include <linux/sysdev.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/timer.h> + +#include <mach_apic.h> + +#include "io_ports.h" + +int (*ioapic_renumber_irq)(int ioapic, int irq); +atomic_t irq_mis_count; + +static DEFINE_SPINLOCK(ioapic_lock); + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + int apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +int vector_irq[NR_VECTORS] = { [0 ... 
NR_VECTORS - 1] = -1}; +#ifdef CONFIG_PCI_MSI +#define vector_to_irq(vector) \ + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#else +#define vector_to_irq(vector) (vector) +#endif + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: whoops"); + } + entry->apic = apic; + entry->pin = pin; +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + + while (1) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + } + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int pin, reg; + + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, 0x00010000); +} + +/* mask = 1, trigger = 0 */ +static void 
__mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); +} + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Disable it in the IO-APIC irq-routing table: + */ + memset(&entry, 0, sizeof(entry)); + entry.mask = 1; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) +{ + unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int apicid_value; + + apicid_value = cpu_mask_to_apicid(cpumask); + /* Prepare to do the io_apic_write */ + apicid_value = apicid_value << 24; + spin_lock_irqsave(&ioapic_lock, 
flags); + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#if defined(CONFIG_IRQBALANCE) +# include <asm/processor.h> /* kernel_thread() */ +# include <linux/kernel_stat.h> /* kstat */ +# include <linux/slab.h> /* kmalloc() */ +# include <linux/timer.h> /* time_after() */ + +# ifdef CONFIG_BALANCED_IRQ_DEBUG +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) +# define Dprintk(x...) do { TDprintk(x); } while (0) +# else +# define TDprintk(x...) +# define Dprintk(x...) +# endif + +cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS]; + +#define IRQBALANCE_CHECK_ARCH -999 +static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; +static int physical_balance = 0; + +static struct irq_cpu_info { + unsigned long * last_irq; + unsigned long * irq_delta; + unsigned long irq; +} irq_cpu_data[NR_CPUS]; + +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) + +#define IDLE_ENOUGH(cpu,now) \ + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) + +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) + +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) + +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) +#define BALANCED_IRQ_MORE_DELTA (HZ/10) +#define BALANCED_IRQ_LESS_DELTA (HZ) + +static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; + +static unsigned long move(int curr_cpu, cpumask_t allowed_mask, + unsigned long now, int direction) +{ + int search_idle = 1; + int cpu = curr_cpu; + + goto inside; + + do { + if (unlikely(cpu == curr_cpu)) + search_idle = 0; +inside: + if (direction == 1) { + cpu++; + if (cpu 
>= NR_CPUS) + cpu = 0; + } else { + cpu--; + if (cpu == -1) + cpu = NR_CPUS-1; + } + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || + (search_idle && !IDLE_ENOUGH(cpu,now))); + + return cpu; +} + +static inline void balance_irq(int cpu, int irq) +{ + unsigned long now = jiffies; + cpumask_t allowed_mask; + unsigned int new_cpu; + + if (irqbalance_disabled) + return; + + cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]); + new_cpu = move(cpu, allowed_mask, now, 1); + if (cpu != new_cpu) { + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu); + spin_unlock_irqrestore(&desc->lock, flags); + } +} + +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) +{ + int i, j; + Dprintk("Rotating IRQs among CPUs.\n"); + for (i = 0; i < NR_CPUS; i++) { + for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) { + if (!irq_desc[j].action) + continue; + /* Is it a significant load ? */ + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < + useful_load_threshold) + continue; + balance_irq(i, j); + } + } + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; +} + +static void do_irq_balance(void) +{ + int i, j; + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); + unsigned long move_this_load = 0; + int max_loaded = 0, min_loaded = 0; + int load; + unsigned long useful_load_threshold = balanced_irq_interval + 10; + int selected_irq; + int tmp_loaded, first_attempt = 1; + unsigned long tmp_cpu_irq; + unsigned long imbalance = 0; + cpumask_t allowed_mask, target_cpu_mask, tmp; + + for (i = 0; i < NR_CPUS; i++) { + int package_index; + CPU_IRQ(i) = 0; + if (!cpu_online(i)) + continue; + package_index = CPU_TO_PACKAGEINDEX(i); + for (j = 0; j < NR_IRQS; j++) { + unsigned long value_now, delta; + /* Is this an active IRQ? 
*/ + if (!irq_desc[j].action) + continue; + if ( package_index == i ) + IRQ_DELTA(package_index,j) = 0; + /* Determine the total count per processor per IRQ */ + value_now = (unsigned long) kstat_cpu(i).irqs[j]; + + /* Determine the activity per processor per IRQ */ + delta = value_now - LAST_CPU_IRQ(i,j); + + /* Update last_cpu_irq[][] for the next time */ + LAST_CPU_IRQ(i,j) = value_now; + + /* Ignore IRQs whose rate is less than the clock */ + if (delta < useful_load_threshold) + continue; + /* update the load for the processor or package total */ + IRQ_DELTA(package_index,j) += delta; + + /* Keep track of the higher numbered sibling as well */ + if (i != package_index) + CPU_IRQ(i) += delta; + /* + * We have sibling A and sibling B in the package + * + * cpu_irq[A] = load for cpu A + load for cpu B + * cpu_irq[B] = load for cpu B + */ + CPU_IRQ(package_index) += delta; + } + } + /* Find the least loaded processor package */ + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (min_cpu_irq > CPU_IRQ(i)) { + min_cpu_irq = CPU_IRQ(i); + min_loaded = i; + } + } + max_cpu_irq = ULONG_MAX; + +tryanothercpu: + /* Look for heaviest loaded processor. + * We may come back to get the next heaviest loaded processor. + * Skip processors with trivial loads. + */ + tmp_cpu_irq = 0; + tmp_loaded = -1; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (max_cpu_irq <= CPU_IRQ(i)) + continue; + if (tmp_cpu_irq < CPU_IRQ(i)) { + tmp_cpu_irq = CPU_IRQ(i); + tmp_loaded = i; + } + } + + if (tmp_loaded == -1) { + /* In the case of small number of heavy interrupt sources, + * loading some of the cpus too much. We use Ingo's original + * approach to rotate them around. 
+ */ + if (!first_attempt && imbalance >= useful_load_threshold) { + rotate_irqs_among_cpus(useful_load_threshold); + return; + } + goto not_worth_the_effort; + } + + first_attempt = 0; /* heaviest search */ + max_cpu_irq = tmp_cpu_irq; /* load */ + max_loaded = tmp_loaded; /* processor */ + imbalance = (max_cpu_irq - min_cpu_irq) / 2; + + Dprintk("max_loaded cpu = %d\n", max_loaded); + Dprintk("min_loaded cpu = %d\n", min_loaded); + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); + Dprintk("load imbalance = %lu\n", imbalance); + + /* if imbalance is less than 1/8 (12.5%) of max load, then + * observe diminishing returns action. - quit + */ + if (imbalance < (max_cpu_irq >> 3)) { + Dprintk("Imbalance too trivial\n"); + goto not_worth_the_effort; + } + +tryanotherirq: + /* if we select an IRQ to move that can't go where we want, then + * see if there is another one to try. + */ + move_this_load = 0; + selected_irq = -1; + for (j = 0; j < NR_IRQS; j++) { + /* Is this an active IRQ? */ + if (!irq_desc[j].action) + continue; + if (imbalance <= IRQ_DELTA(max_loaded,j)) + continue; + /* Try to find the IRQ that is closest to the imbalance + * without going over. + */ + if (move_this_load < IRQ_DELTA(max_loaded,j)) { + move_this_load = IRQ_DELTA(max_loaded,j); + selected_irq = j; + } + } + if (selected_irq == -1) { + goto tryanothercpu; + } + + imbalance = move_this_load; + + /* For physical_balance case, we accumulated both load + * values in one of the siblings' cpu_irq[] slots, + * to use the same code for physical and logical processors + * as much as possible. + * + * NOTE: the cpu_irq[] array holds the sum of the load for + * sibling A and sibling B in the slot for the lowest numbered + * sibling (A), _AND_ the load for sibling B in the slot for + * the higher numbered sibling.
+ * + * We seek the least loaded sibling by making the comparison + * (A+B)/2 vs B + */ + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } + + cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); + target_cpu_mask = cpumask_of_cpu(min_loaded); + cpus_and(tmp, target_cpu_mask, allowed_mask); + + if (!cpus_empty(tmp)) { + irq_desc_t *desc = irq_desc + selected_irq; + unsigned long flags; + + Dprintk("irq = %d moved to cpu = %d\n", + selected_irq, min_loaded); + /* record the new destination; it is stored under desc->lock */ + spin_lock_irqsave(&desc->lock, flags); + pending_irq_balance_cpumask[selected_irq] = + cpumask_of_cpu(min_loaded); + spin_unlock_irqrestore(&desc->lock, flags); + /* Since we made a change, come back sooner to + * check for more variation. + */ + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; + } + goto tryanotherirq; + +not_worth_the_effort: + /* + * if we did not find an IRQ to move, then adjust the time interval + * upward + */ + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); + Dprintk("IRQ worth rotating not found\n"); + return; +} + +static int balanced_irq(void *unused) +{ + int i; + unsigned long prev_balance_time = jiffies; + long time_remaining = balanced_irq_interval; + + daemonize("kirqd"); + + /* push everything to CPU 0 to give us a starting point.
*/ + for (i = 0 ; i < NR_IRQS ; i++) { + pending_irq_balance_cpumask[i] = cpumask_of_cpu(0); + } + + for ( ; ; ) { + set_current_state(TASK_INTERRUPTIBLE); + time_remaining = schedule_timeout(time_remaining); + try_to_freeze(PF_FREEZE); + if (time_after(jiffies, + prev_balance_time+balanced_irq_interval)) { + do_irq_balance(); + prev_balance_time = jiffies; + time_remaining = balanced_irq_interval; + } + } + return 0; +} + +static int __init balanced_irq_init(void) +{ + int i; + struct cpuinfo_x86 *c; + cpumask_t tmp; + + cpus_shift_right(tmp, cpu_online_map, 2); + c = &boot_cpu_data; + /* When not overwritten by the command line ask subarchitecture. */ + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) + irqbalance_disabled = NO_BALANCE_IRQ; + if (irqbalance_disabled) + return 0; + + /* disable irqbalance completely if there is only one processor online */ + if (num_online_cpus() < 2) { + irqbalance_disabled = 1; + return 0; + } + /* + * Enable physical balance only if more than 1 physical processor + * is present + */ + if (smp_num_siblings > 1 && !cpus_empty(tmp)) + physical_balance = 1; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { + printk(KERN_ERR "balanced_irq_init: out of memory"); + goto failed; + } + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); + } + + printk(KERN_INFO "Starting balanced_irq\n"); + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + return 0; + else + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); +failed: + for (i = 0; i < NR_CPUS; i++) { + if(irq_cpu_data[i].irq_delta) + kfree(irq_cpu_data[i].irq_delta); + if(irq_cpu_data[i].last_irq) + 
kfree(irq_cpu_data[i].last_irq); + } + return 0; +} + +int __init irqbalance_disable(char *str) +{ + irqbalance_disabled = 1; + return 0; +} + +__setup("noirqbalance", irqbalance_disable); + +static inline void move_irq(int irq) +{ + /* note - we hold the desc->lock */ + if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) { + set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]); + cpus_clear(pending_irq_balance_cpumask[irq]); + } +} + +late_initcall(balanced_irq_init); + +#else /* !CONFIG_IRQBALANCE */ +static inline void move_irq(int irq) { } +#endif /* CONFIG_IRQBALANCE */ + +#ifndef CONFIG_SMP +void fastcall send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP */ + + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; + +static int __init ioapic_setup(char *str) +{ + skip_ioapic_setup = 1; + return 1; +} + +__setup("noapic", ioapic_setup); + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. 
+ */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA || + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " + "slot:%d, pin:%d.\n", bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. 
+ */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. 
*/ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +/* NEC98 interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_NEC98_trigger(idx) (0) +#define default_NEC98_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + polarity = default_NEC98_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. 
bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + trigger = default_NEC98_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! 
+ */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + case MP_BUS_NEC98: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. 
*/ +u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; + +int assign_irq_vector(int irq) +{ /* Return the vector already recorded for irq, or allocate the next one (always a fresh one for AUTO_ASSIGN); -ENOSPC once the allocator has run out. */ + static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; /* allocation cursor, kept across calls */ + + BUG_ON(irq >= NR_IRQ_VECTORS); + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + return IO_APIC_VECTOR(irq); /* already assigned: reuse it */ +next: + current_vector += 8; /* consecutive allocations are 8 vectors apart */ + if (current_vector == SYSCALL_VECTOR) + goto next; /* never hand out the system-call vector */ + + if (current_vector >= FIRST_SYSTEM_VECTOR) { /* ran past the device vectors */ + offset++; + if (!(offset%8)) + return -ENOSPC; /* all 8 starting offsets are used up. NOTE(review): offset keeps growing on later calls, which then appear to restart on vectors handed out before - confirm */ + current_vector = FIRST_DEVICE_VECTOR + offset; /* start a new sweep one vector higher */ + } + + vector_irq[current_vector] = irq; /* reverse map: vector -> irq */ + if (irq != AUTO_ASSIGN) + IO_APIC_VECTOR(irq) = current_vector; + + return current_vector; +} + +static struct hw_interrupt_type ioapic_level_type; +static struct hw_interrupt_type ioapic_edge_type; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ /* Choose the level or edge handler for irq and install the interrupt gate for vector. */ + if (use_pci_vector() && !platform_legacy_irq(irq)) { /* irq_desc[] and interrupt[] are indexed by vector here */ + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) /* IOAPIC_AUTO defers to IO_APIC_irq_trigger() */ + irq_desc[vector].handler = &ioapic_level_type; + else + irq_desc[vector].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[vector]); + } else { /* indexed by irq here */ + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[irq].handler = &ioapic_level_type; + else + irq_desc[irq].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[irq]); + } +} + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ +
entry.dest.logical.logical_dest = + cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { /* no mp_INT entry found for this pin: log it and move on without programming it */ + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + " IO-APIC (apicid-pin) %d-%d", + mp_ioapics[apic].mpc_apicid, + pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", + mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { /* level-triggered entries are written masked */ + entry.trigger = 1; + entry.mask = 1; + } + + irq = pin_2_irq(idx, apic, pin); + /* + * skip adding the timer int on secondary nodes, which causes + * a small but painful rift in the time-space continuum + */ + if (multi_timer_check(apic, irq)) + continue; + else + add_pin_to_irq(irq, apic, pin); + + if (!apic && !IO_APIC_IRQ(irq)) + continue; /* first IO-APIC only: pins whose irq fails IO_APIC_IRQ() are left unprogrammed */ + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); /* NOTE(review): a negative return (-ENOSPC) is not checked before it is stored in entry.vector */ + entry.vector = vector; + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + + if (!apic && (irq < 16)) + disable_8259A_irq(irq); /* IRQs below 16 on the first IO-APIC are switched off at the 8259A */ + } + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); /* second 32-bit word of the entry is written first */ + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE, " not connected.\n"); +} + +/* + * Set up the 8259A-master output pin: + */ +static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU.
+ */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... + */ + irq_desc[0].handler = &ioapic_edge_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +static inline void UNEXPECTED_IO_APIC(void) +{ +} + +void __init print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... 
: physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + if (reg_00.bits.ID >= get_physical_broadcast()) + UNEXPECTED_IO_APIC(); + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.bits.entries != 0x2E) && + (reg_01.bits.entries != 0x3F) + ) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ + ) + UNEXPECTED_IO_APIC(); + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... 
: arbitration: %02X\n", reg_02.bits.arbitration); + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + if (reg_03.bits.__reserved_1) + UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + if (use_pci_vector()) + printk(KERN_INFO "Using vector-based indexing\n"); + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + if (use_pci_vector() && !platform_legacy_irq(i)) + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); + else + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... 
done.\n"); + + return; +} + +#if 0 + +static void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1<<j)) + printk("1"); + else + printk("0"); + } + printk("\n"); + } +} + +void /*__init*/ print_local_APIC(void * dummy) +{ + unsigned int v, ver, maxlvt; + + if (apic_verbosity == APIC_QUIET) + return; + + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + v = apic_read(APIC_EOI); + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_bitfield(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_bitfield(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_bitfield(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. 
*/ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void /*__init*/ print_PIC(void) +{ + extern spinlock_t i8259A_lock; + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); +} + +#endif /* 0 */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (i = 0; i < nr_ioapics; i++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(i, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[i] = reg_01.bits.entries+1; + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + disconnect_bsp_APIC(); +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 + */ + +#ifndef CONFIG_X86_NUMAQ +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mpc_apicid); + printk(KERN_ERR "... fixing up to %d. 
(tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; + } + + /* Don't check I/O APIC IDs for some xAPIC systems. They have + * no meaning without the serial APIC bus. */ + if (NO_IOAPIC_CHECK) + continue; + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mpc_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mpc_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mpc_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mpc_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. 
+ */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#else +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + if (jiffies - t1 > 4) + return 1; + + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. 
+ */ + +/* + * Starting up an edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * to return 1 to indicate that it was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ +static unsigned int startup_edge_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { /* IRQs below 16 are also switched off and checked on the 8259A */ + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +/* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ +static void ack_edge_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) + == (IRQ_PENDING | IRQ_DISABLED)) /* both flags set: the IRQ is disabled and an occurrence is already recorded */ + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +/* + * Level triggered interrupts can just be masked, + * and shutting down and starting up the interrupt + * is the same as enabling and disabling them -- except + * with a startup need to return a "was pending" value. + * + * Level triggered interrupts are special because we + * do not touch any IO-APIC register while handling + * them. We ack the APIC in the end-IRQ handler, not + * in the start-IRQ-handler. Protection against reentrance + * from the same interrupt is still provided, both by the + * generic IRQ layer and by the fact that an unacked local + * APIC does not accept IRQs.
+ */ +static unsigned int startup_level_ioapic_irq (unsigned int irq) +{ + unmask_IO_APIC_irq(irq); + + return 0; /* don't check for pending */ +} + +static void end_level_ioapic_irq (unsigned int irq) +{ + unsigned long v; + int i; + + move_irq(irq); +/* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul.
--macro + */ + i = IO_APIC_VECTOR(irq); + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); /* TMR word holding this vector's bit: one 32-bit word per 32 vectors, words 0x10 apart */ + + ack_APIC_irq(); + + if (!(v & (1UL << (i & 0x1f)))) { /* 1UL: testing bit 31 must not overflow a signed int; a clear TMR bit means the interrupt was seen as edge */ + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} + +#ifdef CONFIG_PCI_MSI +static unsigned int startup_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_edge_ioapic_irq(irq); +} + +static void ack_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + ack_edge_ioapic_irq(irq); +} + +static unsigned int startup_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_level_ioapic_irq (irq); +} + +static void end_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + end_level_ioapic_irq(irq); +} + +static void mask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_IO_APIC_irq(irq); +} + +static void unmask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + unmask_IO_APIC_irq(irq); +} + +static void set_ioapic_affinity_vector (unsigned int vector, + cpumask_t cpu_mask) +{ + int irq = vector_to_irq(vector); + + set_ioapic_affinity_irq(irq, cpu_mask); +} +#endif + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races.
+ */ +static struct hw_interrupt_type ioapic_edge_type = { + .typename = "IO-APIC-edge", + .startup = startup_edge_ioapic, + .shutdown = shutdown_edge_ioapic, + .enable = enable_edge_ioapic, + .disable = disable_edge_ioapic, + .ack = ack_edge_ioapic, + .end = end_edge_ioapic, + .set_affinity = set_ioapic_affinity, +}; + +static struct hw_interrupt_type ioapic_level_type = { + .typename = "IO-APIC-level", + .startup = startup_level_ioapic, + .shutdown = shutdown_level_ioapic, + .enable = enable_level_ioapic, + .disable = disable_level_ioapic, + .ack = mask_and_ack_level_ioapic, + .end = end_level_ioapic, + .set_affinity = set_ioapic_affinity, +}; + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (use_pci_vector()) { + if (!platform_legacy_irq(tmp)) + if ((tmp = vector_to_irq(tmp)) == -1) + continue; + } + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else + /* Strange. Oh, well.. 
*/ + irq_desc[irq].handler = &no_irq_type; + } + } +} + +static void enable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void disable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type = { + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); + + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + + apic_printk(APIC_VERBOSE, " done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. 
--macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + unsigned long flags; + + pin = find_isa_irq_pin(8, mp_INT); /* pin for ISA IRQ 8 - presumably the RTC, given the CMOS accesses below */ + if (pin == -1) + return; /* no such pin: nothing to do */ + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); /* save the pin's current entry; it is written back at the end */ + *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + clear_IO_APIC_pin(0, pin); + + memset(&entry1, 0, sizeof(entry1)); /* entry1: temporary ExtINT entry aimed at the current CPU */ + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + save_control = CMOS_READ(RTC_CONTROL); /* remember both RTC registers; they are restored after the loop */ + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); /* rate select 0x6 - presumably 1024 Hz; TODO confirm against the RTC datasheet */ + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); /* set RTC_PIE (periodic interrupt enable, going by the name) */ + + i = 100; /* at most 100 polls, 10 ms apart */ + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; /* each poll that sees RTC_PF costs 10 extra, so about ten hits end the loop early */ + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(0, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); /* put the saved entry back */ + io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board.
+ */ +static inline void check_timer(void) +{ + int pin1, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + timer_ack = 1; + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + pin2 = find_isa_irq_pin(0, mp_ExtINT); + + printk(KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + check_nmi_watchdog(); + } + return; + } + clear_IO_APIC_pin(0, pin1); + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); + } + + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); + if (pin2 != -1) { + printk("\n..... (found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + if (pin1 != -1) + replace_pin_at_irq(0, 0, pin1, 0, pin2); + else + add_pin_to_irq(0, 0, pin2); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + check_nmi_watchdog(); + } + return; + } + /* + * Cleanup, just in case ... 
+ */ + clear_IO_APIC_pin(0, pin2); + } + printk(" failed.\n"); + + if (nmi_watchdog == NMI_IO_APIC) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].handler = &lapic_irq_type; + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + printk(" failed.\n"); + + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + timer_ack = 0; + init_8259A(0); + make_8259A_irq(0); + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + printk(" failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option"); +} + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + printk("ENABLING IO-APIC IRQs\n"); + + /* + * Set up IO-APIC IRQ routing. + */ + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +/* + * Called after all the initialization is done. 
If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if(sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, u32 state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + spin_lock_irqsave(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = 
sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_BOOT + +int __init io_apic_get_unique_id (int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. 
+ */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) + panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + + +int __init io_apic_get_version (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 
reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + if (!IO_APIC_IRQ(irq)) { + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. + * Note that we mask (disable) IRQs now -- these get enabled when the + * corresponding device driver registers for this IRQ. + */ + + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.trigger = edge_level; + entry.polarity = active_high_low; + entry.mask = 1; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + entry.vector = assign_irq_vector(irq); + + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + edge_level, active_high_low); + + ioapic_register_intr(irq, entry.vector, edge_level); + + if (!ioapic && (irq < 16)) + disable_8259A_irq(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +#endif /*CONFIG_ACPI_BOOT*/ diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c new file mode 100644 index 000000000000..8b25160393c1 --- /dev/null +++ b/arch/i386/kernel/ioport.c @@ -0,0 +1,147 @@ +/* + * linux/arch/i386/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 
+ */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/thread_info.h> + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + unsigned long mask; + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); + unsigned int low_index = base & (BITS_PER_LONG-1); + int length = low_index + extent; + + if (low_index != 0) { + mask = (~0UL << low_index); + if (length < BITS_PER_LONG) + mask &= ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + length -= BITS_PER_LONG; + } + + mask = (new_value ? ~0UL : 0UL); + while (length >= BITS_PER_LONG) { + *bitmap_base++ = mask; + length -= BITS_PER_LONG; + } + + if (length > 0) { + mask = ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + } +} + + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + unsigned long i, max_long, bytes, bytes_updated; + struct thread_struct * t = ¤t->thread; + struct tss_struct * tss; + unsigned long *bitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. 
ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + } + + /* + * do it in the per-thread copy and in the TSS ... + * + * Disable preemption via get_cpu() - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ + tss = &per_cpu(init_tss, get_cpu()); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = 0; + for (i = 0; i < IO_BITMAP_LONGS; i++) + if (t->io_bitmap_ptr[i] != ~0UL) + max_long = i; + + bytes = (max_long + 1) * sizeof(long); + bytes_updated = max(bytes, t->io_bitmap_max); + + t->io_bitmap_max = bytes; + + /* + * Sets the lazy trigger so that the next I/O operation will + * reload the correct bitmap. + */ + tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + + put_cpu(); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned long unused) +{ + volatile struct pt_regs * regs = (struct pt_regs *) &unused; + unsigned int level = regs->ebx; + unsigned int old = (regs->eflags >> 12) & 3; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? 
*/ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); + /* Make sure we return the long way (not sysenter) */ + set_thread_flag(TIF_IRET); + return 0; +} diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c new file mode 100644 index 000000000000..73945a3c53c4 --- /dev/null +++ b/arch/i386/kernel/irq.c @@ -0,0 +1,261 @@ +/* + * linux/arch/i386/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include <asm/uaccess.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; +EXPORT_PER_CPU_SYMBOL(irq_stat); + +#ifndef CONFIG_X86_LOCAL_APIC +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); +} +#endif + +#ifdef CONFIG_4KSTACKS +/* + * per-CPU IRQ handling contexts (thread information and stack) + */ +union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +}; + +static union irq_ctx *hardirq_ctx[NR_CPUS]; +static union irq_ctx *softirq_ctx[NR_CPUS]; +#endif + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). 
 */
fastcall unsigned int do_IRQ(struct pt_regs *regs)
{
	/* high bits used in ret_from_ code */
	int irq = regs->orig_eax & 0xff;
#ifdef CONFIG_4KSTACKS
	union irq_ctx *curctx, *irqctx;
	u32 *isp;
#endif

	irq_enter();
#ifdef CONFIG_DEBUG_STACKOVERFLOW
	/* Debugging check for stack overflow: is there less than 1KB free? */
	{
		long esp;

		/* esp = %esp & (THREAD_SIZE - 1), i.e. the offset within the stack area. */
		__asm__ __volatile__("andl %%esp,%0" :
					"=r" (esp) : "0" (THREAD_SIZE - 1));
		if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
			printk("do_IRQ: stack overflow: %ld\n",
				esp - sizeof(struct thread_info));
			dump_stack();
		}
	}
#endif

#ifdef CONFIG_4KSTACKS

	curctx = (union irq_ctx *) current_thread_info();
	irqctx = hardirq_ctx[smp_processor_id()];

	/*
	 * this is where we switch to the IRQ stack. However, if we are
	 * already using the IRQ stack (because we interrupted a hardirq
	 * handler) we can't do that and just have to keep using the
	 * current stack (which is the irq stack already after all)
	 */
	if (curctx != irqctx) {
		int arg1, arg2, ebx;

		/* build the stack frame on the IRQ stack */
		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
		irqctx->tinfo.task = curctx->tinfo.task;
		irqctx->tinfo.previous_esp = current_stack_pointer;

		/*
		 * Swap %esp with %ebx (which holds isp), call __do_IRQ with
		 * irq in %eax and regs in %edx, then restore %esp from %ebx.
		 * NOTE(review): presumably this matches __do_IRQ's register
		 * calling convention -- confirm against its declaration.
		 */
		asm volatile(
			" xchgl %%ebx,%%esp \n"
			" call __do_IRQ \n"
			" movl %%ebx,%%esp \n"
			: "=a" (arg1), "=d" (arg2), "=b" (ebx)
			: "0" (irq), "1" (regs), "2" (isp)
			: "memory", "cc", "ecx"
		);
	} else
#endif
		/* Without 4K stacks, or when already on the IRQ stack: plain call. */
		__do_IRQ(irq, regs);

	irq_exit();

	return 1;
}

#ifdef CONFIG_4KSTACKS

/*
 * These should really be __section__(".bss.page_aligned") as well, but
 * gcc's 3.0 and earlier don't handle that correctly. 
+ */ +static char softirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + union irq_ctx *irqctx; + + if (hardirq_ctx[cpu]) + return; + + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + hardirq_ctx[cpu] = irqctx; + + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + softirq_ctx[cpu] = irqctx; + + printk("CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); +} + +extern asmlinkage void __do_softirq(void); + +asmlinkage void do_softirq(void) +{ + unsigned long flags; + struct thread_info *curctx; + union irq_ctx *irqctx; + u32 *isp; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + if (local_softirq_pending()) { + curctx = current_thread_info(); + irqctx = softirq_ctx[smp_processor_id()]; + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* build the stack frame on the softirq stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_softirq \n" + " movl %%ebx,%%esp \n" + : "=b"(isp) + : "0"(isp) + : "memory", "cc", "edx", "ecx", "eax" + ); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL(do_softirq); +#endif + +/* + * Interrupt statistics: + */ + +atomic_t irq_err_count; + +/* + * /proc/interrupts printing: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + 
struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for (j=0; j<NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "CPU%d ",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", nmi_count(j)); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + } + return 0; +} diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c new file mode 100644 index 000000000000..671681659243 --- /dev/null +++ b/arch/i386/kernel/kprobes.c @@ -0,0 +1,385 @@ +/* + * Kernel Probes (KProbes) + * arch/i386/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + */ + +#include <linux/config.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/spinlock.h> +#include <linux/preempt.h> +#include <asm/kdebug.h> +#include <asm/desc.h> + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe; +static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags; +static struct pt_regs jprobe_saved_regs; +static long *jprobe_saved_esp; +/* copy of the kernel stack at the probe fire time */ +static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; +void jprobe_return_end(void); + +/* + * returns non-zero if opcode modifies the interrupt flag. 
 */
static inline int is_IF_modifier(kprobe_opcode_t opcode)
{
	switch (opcode) {
	case 0xfa:		/* cli */
	case 0xfb:		/* sti */
	case 0xcf:		/* iret/iretd */
	case 0x9d:		/* popf/popfd */
		return 1;
	}
	return 0;
}

/* Nothing to prepare on this architecture; always returns 0. */
int arch_prepare_kprobe(struct kprobe *p)
{
	return 0;
}

/* Copy MAX_INSN_SIZE opcodes from the probed address into p->ainsn.insn. */
void arch_copy_kprobe(struct kprobe *p)
{
	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
}

/* Nothing to undo on this architecture. */
void arch_remove_kprobe(struct kprobe *p)
{
}

/* Put the saved opcode back at the probe address and resume from there. */
static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
{
	*p->addr = p->opcode;
	regs->eip = (unsigned long)p->addr;
}

/*
 * Arrange for exactly one instruction to run with the trap flag set and
 * interrupts off: normally the out-of-line copy in p->ainsn.insn, or the
 * original address when the probed instruction is itself a breakpoint.
 */
static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
{
	regs->eflags |= TF_MASK;
	regs->eflags &= ~IF_MASK;
	/*single step inline if the instruction is an int3*/
	if (p->opcode == BREAKPOINT_INSTRUCTION)
		regs->eip = (unsigned long)p->addr;
	else
		regs->eip = (unsigned long)&p->ainsn.insn;
}

/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 * remain disabled throughout this function.
 *
 * Returns 1 when the breakpoint was consumed here, 0 when it should be
 * handled by the rest of the kernel.  Both "return 1" paths below leave
 * preemption disabled; only the no_kprobe exit re-enables it.
 */
static int kprobe_handler(struct pt_regs *regs)
{
	struct kprobe *p;
	int ret = 0;
	kprobe_opcode_t *addr = NULL;
	unsigned long *lp;

	/* We're in an interrupt, but this is clear and BUG()-safe. */
	preempt_disable();
	/* Check if the application is using LDT entry for its code segment and
	 * calculate the address by reading the base address from the LDT entry.
	 */
	if ((regs->xcs & 4) && (current->mm)) {
		lp = (unsigned long *) ((unsigned long)((regs->xcs >> 3) * 8)
					+ (char *) current->mm->context.ldt);
		addr = (kprobe_opcode_t *) (get_desc_base(lp) + regs->eip -
						sizeof(kprobe_opcode_t));
	} else {
		/* eip points just past the breakpoint; step back to its address. */
		addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
	}
	/* Check we're not actually recursing */
	if (kprobe_running()) {
		/* We *are* holding lock here, so this is safe.
		   Disarm the probe we just hit, and ignore it. */
		p = get_kprobe(addr);
		if (p) {
			if (kprobe_status == KPROBE_HIT_SS) {
				regs->eflags &= ~TF_MASK;
				regs->eflags |= kprobe_saved_eflags;
				unlock_kprobes();
				goto no_kprobe;
			}
			disarm_kprobe(p, regs);
			ret = 1;
		} else {
			/* Not a registered probe: offer it to the active probe's break handler. */
			p = current_kprobe;
			if (p->break_handler && p->break_handler(p, regs)) {
				goto ss_probe;
			}
		}
		/* If it's not ours, can't be delete race, (we hold lock). */
		goto no_kprobe;
	}

	lock_kprobes();
	p = get_kprobe(addr);
	if (!p) {
		unlock_kprobes();
		if (regs->eflags & VM_MASK) {
			/* We are in virtual-8086 mode. Return 0 */
			goto no_kprobe;
		}

		if (*addr != BREAKPOINT_INSTRUCTION) {
			/*
			 * The breakpoint instruction was removed right
			 * after we hit it.  Another cpu has removed
			 * either a probepoint or a debugger breakpoint
			 * at this address.  In either case, no further
			 * handling of this interrupt is appropriate.
			 */
			ret = 1;
		}
		/* Not one of ours: let kernel handle it */
		goto no_kprobe;
	}

	kprobe_status = KPROBE_HIT_ACTIVE;
	current_kprobe = p;
	/* Remember the interrupted TF/IF state for when single-stepping finishes. */
	kprobe_saved_eflags = kprobe_old_eflags
	    = (regs->eflags & (TF_MASK | IF_MASK));
	/* An instruction that changes IF itself must not have IF restored afterwards. */
	if (is_IF_modifier(p->opcode))
		kprobe_saved_eflags &= ~IF_MASK;

	if (p->pre_handler && p->pre_handler(p, regs))
		/* handler has already set things up, so skip ss setup */
		return 1;

ss_probe:
	prepare_singlestep(p, regs);
	kprobe_status = KPROBE_HIT_SS;
	return 1;

no_kprobe:
	preempt_enable_no_resched();
	return ret;
}

/*
 * Called after single-stepping.  p->addr is the address of the
 * instruction whose first byte has been replaced by the "int 3"
 * instruction.  To avoid the SMP problems that can occur when we
 * temporarily put back the original opcode to single-step, we
 * single-stepped a copy of the instruction.  The address of this
 * copy is p->ainsn.insn.
 *
 * This function prepares to return from the post-single-step
 * interrupt. 
We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new eip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed eflags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long *tos = (unsigned long *)®s->esp; + unsigned long next_eip = 0; + unsigned long copy_eip = (unsigned long)&p->ainsn.insn; + unsigned long orig_eip = (unsigned long)p->addr; + + switch (p->ainsn.insn[0]) { + case 0x9c: /* pushfl */ + *tos &= ~(TF_MASK | IF_MASK); + *tos |= kprobe_old_eflags; + break; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_eip + (*tos - copy_eip); + break; + case 0xff: + if ((p->ainsn.insn[1] & 0x30) == 0x10) { + /* call absolute, indirect */ + /* Fix return addr; eip is correct. */ + next_eip = regs->eip; + *tos = orig_eip + (*tos - copy_eip); + } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + /* eip is correct. */ + next_eip = regs->eip; + } + break; + case 0xea: /* jmp absolute -- eip is correct */ + next_eip = regs->eip; + break; + default: + break; + } + + regs->eflags &= ~TF_MASK; + if (next_eip) { + regs->eip = next_eip; + } else { + regs->eip = orig_eip + (regs->eip - copy_eip); + } +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thoroughout this function. And we hold kprobe lock. 
+ */ +static inline int post_kprobe_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->post_handler) + current_kprobe->post_handler(current_kprobe, regs, 0); + + resume_execution(current_kprobe, regs); + regs->eflags |= kprobe_saved_eflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, eflags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->eflags & TF_MASK) + return 0; + + return 1; +} + +/* Interrupts disabled, kprobe_lock held. */ +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (current_kprobe->fault_handler + && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + resume_execution(current_kprobe, regs); + regs->eflags |= kprobe_old_eflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + } + return 0; +} + +/* + * Wrapper routine to for handling exceptions. 
+ */ +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = (struct die_args *)data; + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_GPF: + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + break; + default: + break; + } + return NOTIFY_DONE; +} + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + + jprobe_saved_regs = *regs; + jprobe_saved_esp = ®s->esp; + addr = (unsigned long)jprobe_saved_esp; + + /* + * TBD: As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. 
+ */ + memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); + regs->eflags &= ~IF_MASK; + regs->eip = (unsigned long)(jp->entry); + return 1; +} + +void jprobe_return(void) +{ + preempt_enable_no_resched(); + asm volatile (" xchgl %%ebx,%%esp \n" + " int3 \n" + " .globl jprobe_return_end \n" + " jprobe_return_end: \n" + " nop \n"::"b" + (jprobe_saved_esp):"memory"); +} + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + u8 *addr = (u8 *) (regs->eip - 1); + unsigned long stack_addr = (unsigned long)jprobe_saved_esp; + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { + if (®s->esp != jprobe_saved_esp) { + struct pt_regs *saved_regs = + container_of(jprobe_saved_esp, struct pt_regs, esp); + printk("current esp %p does not match saved esp %p\n", + ®s->esp, jprobe_saved_esp); + printk("Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk("Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = jprobe_saved_regs; + memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, + MIN_STACK_SIZE(stack_addr)); + return 1; + } + return 0; +} diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c new file mode 100644 index 000000000000..bb50afbee921 --- /dev/null +++ b/arch/i386/kernel/ldt.c @@ -0,0 +1,255 @@ +/* + * linux/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/desc.h> + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + 
load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt; + void *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + preempt_disable(); + load_LDT(pc); + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#else + load_LDT(pc); +#endif + } + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. 
+ */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + init_MUTEX(&mm->context.sem); + mm->context.size = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if (mm == current->active_mm) + clear_LDT(); + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + void *address; + + err = 0; + address = &default_ldt[0]; + size = 5*sizeof(struct desc_struct); + if (size > bytecount) + size = bytecount; + + err = size; + if (copy_to_user(ptr, address, size)) + err = -EFAULT; + + return err; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct * mm = current->mm; + __u32 entry_1, entry_2, *lp; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if 
(bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... */ +install: + *lp = entry_1; + *(lp+1) = entry_2; + error = 0; + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c new file mode 100644 index 000000000000..8600faeea29d --- /dev/null +++ b/arch/i386/kernel/mca.c @@ -0,0 +1,474 @@ +/* + * linux/arch/i386/kernel/mca.c + * Written by Martin Kolinek, February 1996 + * + * Changes: + * + * Chris Beauregard July 28th, 1996 + * - Fixed up integrated SCSI detection + * + * Chris Beauregard August 3rd, 1996 + * - Made mca_info local + * - Made integrated registers accessible through standard function calls + * - Added name field + * - More sanity checking + * + *
Chris Beauregard August 9th, 1996 + * - Rewrote /proc/mca + * + * Chris Beauregard January 7th, 1997 + * - Added basic NMI-processing + * - Added more information to mca_info structure + * + * David Weinehall October 12th, 1998 + * - Made a lot of cleaning up in the source + * - Added use of save_flags / restore_flags + * - Added the 'driver_loaded' flag in MCA_adapter + * - Added an alternative implementation of ZP Gu's mca_find_unused_adapter + * + * David Weinehall March 24th, 1999 + * - Fixed the output of 'Driver Installed' in /proc/mca/pos + * - Made the Integrated Video & SCSI show up even if they have id 0000 + * + * Alexander Viro November 9th, 1999 + * - Switched to regular procfs methods + * + * Alfred Arnold & David Weinehall August 23rd, 2000 + * - Added support for Planar POS-registers + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/mca.h> +#include <asm/system.h> +#include <asm/io.h> +#include <linux/proc_fs.h> +#include <linux/mman.h> +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/ioport.h> +#include <asm/uaccess.h> +#include <linux/init.h> +#include <asm/arch_hooks.h> + +static unsigned char which_scsi = 0; + +int MCA_bus = 0; +EXPORT_SYMBOL(MCA_bus); + +/* + * Motherboard register spinlock. Untested on SMP at the moment, but + * are there any MCA SMP boxes? + * + * Yes - Alan + */ +static DEFINE_SPINLOCK(mca_lock); + +/* Build the status info for the adapter */ + +static void mca_configure_adapter_status(struct mca_device *mca_dev) { + mca_dev->status = MCA_ADAPTER_NONE; + + mca_dev->pos_id = mca_dev->pos[0] + + (mca_dev->pos[1] << 8); + + if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) { + + /* id = 0x0000 usually indicates hardware failure, + * however, ZP Gu <zpg@castle.net> reports that his 9556 + * has 0x0000 as id and everything still works.
There + * also seem to be an adapter with id = 0x0000; the + * NCR Parallel Bus Memory Card. Until this is confirmed, + * however, this code will stay. + */ + + mca_dev->status = MCA_ADAPTER_ERROR; + + return; + } else if(mca_dev->pos_id != 0xffff) { + + /* 0xffff usually indicates that there's no adapter, + * however, some integrated adapters may have 0xffff as + * their id and still be valid. Examples are on-board + * VGA of the 55sx, the integrated SCSI of the 56 & 57, + * and possibly also the 95 ULTIMEDIA. + */ + + mca_dev->status = MCA_ADAPTER_NORMAL; + } + + if((mca_dev->pos_id == 0xffff || + mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) { + int j; + + for(j = 2; j < 8; j++) { + if(mca_dev->pos[j] != 0xff) { + mca_dev->status = MCA_ADAPTER_NORMAL; + break; + } + } + } + + if(!(mca_dev->pos[2] & MCA_ENABLED)) { + + /* enabled bit is in POS 2 */ + + mca_dev->status = MCA_ADAPTER_DISABLED; + } +} /* mca_configure_adapter_status */ + +/*--------------------------------------------------------------------*/ + +static struct resource mca_standard_resources[] = { + { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" }, + { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" }, + { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" }, + { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" }, + { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" }, + { .start = 0x96, .end = 0x97, .name = "POS (MCA)" }, + { .start = 0x100, .end = 0x107, .name = "POS (MCA)" } +}; + +#define MCA_STANDARD_RESOURCES (sizeof(mca_standard_resources)/sizeof(struct resource)) + +/** + * mca_read_and_store_pos - read the POS registers into a memory buffer + * @pos: a char pointer to 8 bytes, contains the POS register value on + * successful return + * + * Returns 1 if a card actually exists (i.e. 
the pos isn't + * all 0xff) or 0 otherwise + */ +static int mca_read_and_store_pos(unsigned char *pos) { + int j; + int found = 0; + + for(j=0; j<8; j++) { + if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) { + /* 0xff all across means no device. 0x00 means + * something's broken, but a device is + * probably there. However, if you get 0x00 + * from a motherboard register it won't matter + * what we find. For the record, on the + * 57SLC, the integrated SCSI adapter has + * 0xffff for the adapter ID, but nonzero for + * other registers. */ + + found = 1; + } + } + return found; +} + +static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg) +{ + unsigned char byte; + unsigned long flags; + + if(reg < 0 || reg >= 8) + return 0; + + spin_lock_irqsave(&mca_lock, flags); + if(mca_dev->pos_register) { + /* Disable adapter setup, enable motherboard setup */ + + outb_p(0, MCA_ADAPTER_SETUP_REG); + outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); + + byte = inb_p(MCA_POS_REG(reg)); + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + } else { + + /* Make sure motherboard setup is off */ + + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + + /* Read the appropriate register */ + + outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG); + byte = inb_p(MCA_POS_REG(reg)); + outb_p(0, MCA_ADAPTER_SETUP_REG); + } + spin_unlock_irqrestore(&mca_lock, flags); + + mca_dev->pos[reg] = byte; + + return byte; +} + +static void mca_pc_write_pos(struct mca_device *mca_dev, int reg, + unsigned char byte) +{ + unsigned long flags; + + if(reg < 0 || reg >= 8) + return; + + spin_lock_irqsave(&mca_lock, flags); + + /* Make sure motherboard setup is off */ + + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + + /* Read in the appropriate register */ + + outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG); + outb_p(byte, MCA_POS_REG(reg)); + outb_p(0, MCA_ADAPTER_SETUP_REG); + + spin_unlock_irqrestore(&mca_lock, flags); + + /* Update the global register list, while we have the byte */ + + 
mca_dev->pos[reg] = byte; + +} + +/* for the primary MCA bus, we have identity transforms */ +static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq) +{ + return irq; +} + +static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port) +{ + return port; +} + +static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem) +{ + return mem; +} + + +static int __init mca_init(void) +{ + unsigned int i, j; + struct mca_device *mca_dev; + unsigned char pos[8]; + short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00}; + struct mca_bus *bus; + + /* WARNING: Be careful when making changes here. Putting an adapter + * and the motherboard simultaneously into setup mode may result in + * damage to chips (according to The Indispensible PC Hardware Book + * by Hans-Peter Messmer). Also, we disable system interrupts (so + * that we are not disturbed in the middle of this). + */ + + /* Make sure the MCA bus is present */ + + if (mca_system_init()) { + printk(KERN_ERR "MCA bus system initialisation failed\n"); + return -ENODEV; + } + + if (!MCA_bus) + return -ENODEV; + + printk(KERN_INFO "Micro Channel bus detected.\n"); + + /* All MCA systems have at least a primary bus */ + bus = mca_attach_bus(MCA_PRIMARY_BUS); + if (!bus) + goto out_nomem; + bus->default_dma_mask = 0xffffffffLL; + bus->f.mca_write_pos = mca_pc_write_pos; + bus->f.mca_read_pos = mca_pc_read_pos; + bus->f.mca_transform_irq = mca_dummy_transform_irq; + bus->f.mca_transform_ioport = mca_dummy_transform_ioport; + bus->f.mca_transform_memory = mca_dummy_transform_memory; + + /* get the motherboard device */ + mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); + if(unlikely(!mca_dev)) + goto out_nomem; + memset(mca_dev, 0, sizeof(struct mca_device)); + + /* + * We do not expect many MCA interrupts during initialization, + * but let us be safe: + */ + spin_lock_irq(&mca_lock); + + /* Make sure adapter setup is off */ + + outb_p(0, MCA_ADAPTER_SETUP_REG); + + /* Read 
motherboard POS registers */ + + mca_dev->pos_register = 0x7f; + outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); + mca_dev->name[0] = 0; + mca_read_and_store_pos(mca_dev->pos); + mca_configure_adapter_status(mca_dev); + /* fake POS and slot for a motherboard */ + mca_dev->pos_id = MCA_MOTHERBOARD_POS; + mca_dev->slot = MCA_MOTHERBOARD; + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + + mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + if(unlikely(!mca_dev)) + goto out_unlock_nomem; + memset(mca_dev, 0, sizeof(struct mca_device)); + + + /* Put motherboard into video setup mode, read integrated video + * POS registers, and turn motherboard setup off. + */ + + mca_dev->pos_register = 0xdf; + outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); + mca_dev->name[0] = 0; + mca_read_and_store_pos(mca_dev->pos); + mca_configure_adapter_status(mca_dev); + /* fake POS and slot for the integrated video */ + mca_dev->pos_id = MCA_INTEGVIDEO_POS; + mca_dev->slot = MCA_INTEGVIDEO; + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + + /* Put motherboard into scsi setup mode, read integrated scsi + * POS registers, and turn motherboard setup off. + * + * It seems there are two possible SCSI registers. Martin says that + * for the 56,57, 0xf7 is the one, but fails on the 76. + * Alfredo (apena@vnet.ibm.com) says + * 0xfd works on his machine. We'll try both of them. I figure it's + * a good bet that only one could be valid at a time. This could + * screw up though if one is used for something else on the other + * machine. 
+ */ + + for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) { + outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG); + if(mca_read_and_store_pos(pos)) + break; + } + if(which_scsi) { + /* found a scsi card */ + mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + if(unlikely(!mca_dev)) + goto out_unlock_nomem; + memset(mca_dev, 0, sizeof(struct mca_device)); + + for(j = 0; j < 8; j++) + mca_dev->pos[j] = pos[j]; + + mca_configure_adapter_status(mca_dev); + /* fake POS and slot for integrated SCSI controller */ + mca_dev->pos_id = MCA_INTEGSCSI_POS; + mca_dev->slot = MCA_INTEGSCSI; + mca_dev->pos_register = which_scsi; + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + } + + /* Turn off motherboard setup */ + + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + + /* Now loop over MCA slots: put each adapter into setup mode, and + * read its POS registers. Then put adapter setup off. + */ + + for(i=0; i<MCA_MAX_SLOT_NR; i++) { + outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG); + if(!mca_read_and_store_pos(pos)) + continue; + + mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + if(unlikely(!mca_dev)) + goto out_unlock_nomem; + memset(mca_dev, 0, sizeof(struct mca_device)); + + for(j=0; j<8; j++) + mca_dev->pos[j]=pos[j]; + + mca_dev->driver_loaded = 0; + mca_dev->slot = i; + mca_dev->pos_register = 0; + mca_configure_adapter_status(mca_dev); + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + } + outb_p(0, MCA_ADAPTER_SETUP_REG); + + /* Enable interrupts and return memory start */ + spin_unlock_irq(&mca_lock); + + for (i = 0; i < MCA_STANDARD_RESOURCES; i++) + request_resource(&ioport_resource, mca_standard_resources + i); + + mca_do_proc_init(); + + return 0; + + out_unlock_nomem: + spin_unlock_irq(&mca_lock); + out_nomem: + printk(KERN_EMERG "Failed memory allocation in MCA setup!\n"); + return -ENOMEM; +} + +subsys_initcall(mca_init); + +/*--------------------------------------------------------------------*/ + +static void mca_handle_nmi_device(struct 
mca_device *mca_dev, int check_flag) +{ + int slot = mca_dev->slot; + + if(slot == MCA_INTEGSCSI) { + printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n", + mca_dev->name); + } else if(slot == MCA_INTEGVIDEO) { + printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n", + mca_dev->name); + } else if(slot == MCA_MOTHERBOARD) { + printk(KERN_CRIT "NMI: caused by motherboard (%s)\n", + mca_dev->name); + } + + /* More info available in POS 6 and 7? */ + + if(check_flag) { + unsigned char pos6, pos7; + + pos6 = mca_device_read_pos(mca_dev, 6); + pos7 = mca_device_read_pos(mca_dev, 7); + + printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7); + } + +} /* mca_handle_nmi_device */ + +/*--------------------------------------------------------------------*/ + +static int mca_handle_nmi_callback(struct device *dev, void *data) +{ + struct mca_device *mca_dev = to_mca_device(dev); + unsigned char pos5; + + pos5 = mca_device_read_pos(mca_dev, 5); + + if(!(pos5 & 0x80)) { + /* Bit 7 of POS 5 is reset when this adapter has a hardware + * error. Bit 6 is reset if there's error information + * available in POS 6 and 7. + */ + mca_handle_nmi_device(mca_dev, !(pos5 & 0x40)); + return 1; + } + return 0; +} + +void mca_handle_nmi(void) +{ + /* First try - scan the various adapters and see if a specific + * adapter was responsible for the error. + */ + bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); + + mca_nmi_hook(); +} /* mca_handle_nmi */ diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c new file mode 100644 index 000000000000..a77c612aad00 --- /dev/null +++ b/arch/i386/kernel/microcode.c @@ -0,0 +1,512 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2004 Tigran Aivazian + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc.
+ * + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, + * Order Number 245472 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/245472.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. 
+ * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to speculative + * nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. 
+ */ + +//#define DEBUG /* pr_debug */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> + +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>"); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "1.14" + +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ +#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ +#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ +#define DWSIZE (sizeof (u32)) +#define get_totalsize(mc) \ + (((microcode_t *)mc)->hdr.totalsize ? \ + ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) +#define get_datasize(mc) \ + (((microcode_t *)mc)->hdr.datasize ? 
\ + ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + +#define sigmatch(s1, s2, p1, p2) \ + (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) + +#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) + +/* serialize access to the physical write to MSR 0x79 */ +static DEFINE_SPINLOCK(microcode_update_lock); + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DECLARE_MUTEX(microcode_sem); + +static void __user *user_buffer; /* user area microcode data buffer */ +static unsigned int user_buffer_size; /* its size */ + +typedef enum mc_error_code { + MC_SUCCESS = 0, + MC_NOTFOUND = 1, + MC_MARKED = 2, + MC_ALLOCATED = 3, +} mc_error_code_t; + +static struct ucode_cpu_info { + unsigned int sig; + unsigned int pf; + unsigned int rev; + unsigned int cksum; + mc_error_code_t err; + microcode_t *mc; +} ucode_cpu_info[NR_CPUS]; + +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + return capable(CAP_SYS_RAWIO) ?
0 : -EPERM; +} + +static void collect_cpu_info (void *unused) +{ + int cpu_num = smp_processor_id(); + struct cpuinfo_x86 *c = cpu_data + cpu_num; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + unsigned int val[2]; + + uci->sig = uci->pf = uci->rev = uci->cksum = 0; + uci->err = MC_NOTFOUND; + uci->mc = NULL; + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num); + return; + } else { + uci->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + uci->pf = 1 << ((val[1] >> 18) & 7); + } + } + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", + uci->sig, uci->pf, uci->rev); +} + +static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + pr_debug("Microcode Found.\n"); + pr_debug(" Header Revision 0x%x\n", mc_header->hdrver); + pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver); + pr_debug(" Revision 0x%x \n", mc_header->rev); + pr_debug(" Date %x/%x/%x\n", + ((mc_header->date >> 24 ) & 0xff), + ((mc_header->date >> 16 ) & 0xff), + (mc_header->date & 0xFFFF)); + pr_debug(" Signature 0x%x\n", sig); + pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n", + ((sig >> 12) & 0x3), + ((sig >> 8) & 0xf), + ((sig >> 4) & 0xf), + ((sig & 0xf))); + pr_debug(" Processor Flags 0x%x\n", pf); + pr_debug(" Checksum 0x%x\n", cksum); + + if (mc_header->rev < uci->rev) { + printk(KERN_ERR "microcode: CPU%d not 'upgrading' to earlier revision" + " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev); + 
goto out; + } else if (mc_header->rev == uci->rev) { + /* notify the caller of success on this cpu */ + uci->err = MC_SUCCESS; + printk(KERN_ERR "microcode: CPU%d already at revision" + " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev); + goto out; + } + + pr_debug("microcode: CPU%d found a matching microcode update with " + " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev); + uci->cksum = cksum; + uci->pf = pf; /* keep the original mc pf for cksum calculation */ + uci->err = MC_MARKED; /* found the match */ +out: + return; +} + +static int find_matching_ucodes (void) +{ + int cursor = 0; + int error = 0; + + while (cursor + MC_HEADER_SIZE < user_buffer_size) { + microcode_header_t mc_header; + void *newmc = NULL; + int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size; + + if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + error = -EFAULT; + goto out; + } + + total_size = get_totalsize(&mc_header); + if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + error = -EINVAL; + goto out; + } + + data_size = get_datasize(&mc_header); + if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + error = -EINVAL; + goto out; + } + + if (mc_header.ldrver != 1 || mc_header.hdrver != 1) { + printk(KERN_ERR "microcode: error! 
Unknown microcode update format\n"); + error = -EINVAL; + goto out; + } + + for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + if (uci->err != MC_NOTFOUND) /* already found a match or not an online cpu*/ + continue; + + if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->pf)) + mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum); + } + + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + struct extended_sigtable ext_header; + struct extended_signature ext_sig; + int ext_sigcount; + + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + error = -EINVAL; + goto out; + } + if (copy_from_user(&ext_header, user_buffer + cursor + + MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + error = -EFAULT; + goto out; + } + if (ext_table_size != exttable_size(&ext_header)) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + error = -EFAULT; + goto out; + } + + ext_sigcount = ext_header.count; + + for (i = 0; i < ext_sigcount; i++) { + if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) { + printk(KERN_ERR "microcode: error! 
Can not read user data\n"); + error = -EFAULT; + goto out; + } + for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + if (uci->err != MC_NOTFOUND) /* already found a match or not an online cpu*/ + continue; + if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->pf)) { + mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum); + } + } + } + } + /* now check if any cpu has matched */ + for (cpu_num = 0, allocated_flag = 0, sum = 0; cpu_num < num_online_cpus(); cpu_num++) { + if (ucode_cpu_info[cpu_num].err == MC_MARKED) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + if (!allocated_flag) { + allocated_flag = 1; + newmc = vmalloc(total_size); + if (!newmc) { + printk(KERN_ERR "microcode: error! Can not allocate memory\n"); + error = -ENOMEM; + goto out; + } + if (copy_from_user(newmc + MC_HEADER_SIZE, + user_buffer + cursor + MC_HEADER_SIZE, + total_size - MC_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! 
Can not read user data\n"); + vfree(newmc); + error = -EFAULT; + goto out; + } + memcpy(newmc, &mc_header, MC_HEADER_SIZE); + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size); + i = ext_table_size / DWSIZE; + while (i--) ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n"); + vfree(newmc); + error = -EINVAL; + goto out; + } + } + + /* calculate the checksum */ + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) sum += ((int *)newmc)[i]; + sum -= (mc_header.sig + mc_header.pf + mc_header.cksum); + } + ucode_cpu_info[cpu_num].mc = newmc; + ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */ + if (sum + uci->sig + uci->pf + uci->cksum != 0) { + printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num); + error = -EINVAL; + goto out; + } + } + } + cursor += total_size; /* goto the next update patch */ + } /* end of while */ +out: + return error; +} + +static void do_update_one (void * unused) +{ + unsigned long flags; + unsigned int val[2]; + int cpu_num = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + if (uci->mc == NULL) { + printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num); + return; + } + + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(&microcode_update_lock, flags); + + /* write microcode via MSR 0x79 */ + wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) uci->mc->bits, + (unsigned long) uci->mc->bits >> 16 >> 16); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + /* notify the caller of success on this cpu */ + uci->err = MC_SUCCESS; + spin_unlock_irqrestore(&microcode_update_lock, flags); + printk(KERN_INFO "microcode: CPU%d
updated from revision " + "0x%x to 0x%x, date = %08x \n", + cpu_num, uci->rev, val[1], uci->mc->hdr.date); + return; +} + +static int do_microcode_update (void) +{ + int i, error; + + if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) { + printk(KERN_ERR "microcode: Error! Could not run on all processors\n"); + error = -EIO; + goto out; + } + + if ((error = find_matching_ucodes())) { + printk(KERN_ERR "microcode: Error in the microcode data\n"); + goto out_free; + } + + if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) { + printk(KERN_ERR "microcode: Error! Could not run on all processors\n"); + error = -EIO; + } + +out_free: + for (i = 0; i < num_online_cpus(); i++) { + if (ucode_cpu_info[i].mc) { + int j; + void *tmp = ucode_cpu_info[i].mc; + vfree(tmp); + for (j = i; j < num_online_cpus(); j++) { + if (ucode_cpu_info[j].mc == tmp) + ucode_cpu_info[j].mc = NULL; + } + } + } +out: + return error; +} + +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + ssize_t ret; + + if (len < DEFAULT_UCODE_TOTALSIZE) { + printk(KERN_ERR "microcode: not enough data\n"); + return -EINVAL; + } + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); + return -EINVAL; + } + + down(&microcode_sem); + + user_buffer = (void __user *) buf; + user_buffer_size = (int) len; + + ret = do_microcode_update(); + if (!ret) + ret = (ssize_t)len; + + up(&microcode_sem); + + return ret; +} + +static int microcode_ioctl (struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + /* + * XXX: will be removed after microcode_ctl + * is updated to ignore failure of this ioctl() + */ + case MICROCODE_IOCFREE: + return 0; + default: + return -EINVAL; + } + return -EINVAL; +} + +static struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .ioctl = microcode_ioctl, + .open = microcode_open, +}; + +static
struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .devfs_name = "cpu/microcode", + .fops = &microcode_fops, +}; + +static int __init microcode_init (void) +{ + int error; + + error = misc_register(&microcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n"); + return 0; +} + +static void __exit microcode_exit (void) +{ + misc_deregister(&microcode_dev); + printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n"); +} + +module_init(microcode_init) +module_exit(microcode_exit) +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c new file mode 100644 index 000000000000..5149c8a621f0 --- /dev/null +++ b/arch/i386/kernel/module.c @@ -0,0 +1,129 @@ +/* Kernel module help for i386. + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <linux/moduleloader.h> +#include <linux/elf.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/kernel.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt...) 
+#endif + +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + return vmalloc_exec(size); +} + + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. 
*/ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its postition */ + *location += sym->st_value - (uint32_t)location; + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", + me->name); + return -ENOEXEC; +} + +extern void apply_alternatives(void *start, void *end); + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + /* look for .altinstructions to patch */ + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + void *seg; + if (strcmp(".altinstructions", secstrings + s->sh_name)) + continue; + seg = (void *)s->sh_addr; + apply_alternatives(seg, seg + s->sh_size); + } + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ +} diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c new file mode 100644 index 000000000000..1347ab4939e7 --- /dev/null +++ b/arch/i386/kernel/mpparse.c @@ -0,0 +1,1109 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. 
Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/delay.h> +#include <linux/config.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/bitops.h> + +#include <asm/smp.h> +#include <asm/acpi.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/io_apic.h> + +#include <mach_apic.h> +#include <mach_mpparse.h> +#include <bios_ebda.h> + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __initdata maxcpus = NR_CPUS; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; +int mp_bus_id_to_type [MAX_MP_BUSSES]; +int mp_bus_id_to_node [MAX_MP_BUSSES]; +int mp_bus_id_to_local [MAX_MP_BUSSES]; +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +static int mp_current_pci_id; + +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; + +int pic_mode; +unsigned long mp_lapic_addr; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +unsigned int boot_cpu_logical_apicid = -1U; +/* Internal processor count */ +static unsigned int __initdata num_processors; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +/* + * Intel MP BIOS table parsing routines: + */ + + +/* + * Checksum an MP configuration block. 
+ */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ + +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; + +#ifdef CONFIG_X86_NUMAQ +static int MP_valid_apicid(int apicid, int version) +{ + return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf; +} +#else +static int MP_valid_apicid(int apicid, int version) +{ + if (version >= 0x14) + return apicid < 0xff; + else + return apicid < 0xf; +} +#endif + +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver, apicid; + physid_mask_t tmp; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (m->mpc_featureflag&(1<<0)) + Dprintk(" Floating point unit present.\n"); + if (m->mpc_featureflag&(1<<7)) + Dprintk(" Machine Exception supported.\n"); + if (m->mpc_featureflag&(1<<8)) + Dprintk(" 64 bit compare & exchange supported.\n"); + if (m->mpc_featureflag&(1<<9)) + Dprintk(" Internal APIC present.\n"); + if (m->mpc_featureflag&(1<<11)) + Dprintk(" SEP present.\n"); + if (m->mpc_featureflag&(1<<12)) + Dprintk(" MTRR present.\n"); + if (m->mpc_featureflag&(1<<13)) + Dprintk(" PGE present.\n"); + if (m->mpc_featureflag&(1<<14)) + Dprintk(" MCA present.\n"); + if (m->mpc_featureflag&(1<<15)) + Dprintk(" CMOV present.\n"); + if (m->mpc_featureflag&(1<<16)) + Dprintk(" PAT present.\n"); + if (m->mpc_featureflag&(1<<17)) + Dprintk(" PSE present.\n"); + if (m->mpc_featureflag&(1<<18)) + Dprintk(" PSN present.\n"); + if (m->mpc_featureflag&(1<<19)) + Dprintk(" Cache Line Flush Instruction present.\n"); + /* 20 Reserved */ + if (m->mpc_featureflag&(1<<21)) + Dprintk(" Debug Trace and EMON Store present.\n"); + 
if (m->mpc_featureflag&(1<<22)) + Dprintk(" ACPI Thermal Throttle Registers present.\n"); + if (m->mpc_featureflag&(1<<23)) + Dprintk(" MMX present.\n"); + if (m->mpc_featureflag&(1<<24)) + Dprintk(" FXSR present.\n"); + if (m->mpc_featureflag&(1<<25)) + Dprintk(" XMM present.\n"); + if (m->mpc_featureflag&(1<<26)) + Dprintk(" Willamette New Instructions present.\n"); + if (m->mpc_featureflag&(1<<27)) + Dprintk(" Self Snoop present.\n"); + if (m->mpc_featureflag&(1<<28)) + Dprintk(" HT present.\n"); + if (m->mpc_featureflag&(1<<29)) + Dprintk(" Thermal Monitor present.\n"); + /* 30, 31 Reserved */ + + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_physical_apicid = m->mpc_apicid; + boot_cpu_logical_apicid = apicid; + } + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + num_processors++; + ver = m->mpc_apicver; + + if (!MP_valid_apicid(apicid, ver)) { + printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + --num_processors; + return; + } + + tmp = apicid_to_cpu_present(apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp); + + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. 
(tell your hw vendor)\n", m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; +} + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + + mpc_oem_bus_info(m, str, translation_table[mpc_record]); + + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { + mpc_oem_pci_bus(m, translation_table[mpc_record]); + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; + } else { + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 
3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumptions is false. + * Until then we do not have to add baggage. + */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +#ifdef CONFIG_X86_NUMAQ +static void __init MP_translation_info (struct mpc_config_translation *m) +{ + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ + unsigned short oemsize) +{ + int count = sizeof (*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable)+count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... 
\n", oemtable); + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) + { + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], + oemtable->oem_signature[1], + oemtable->oem_signature[2], + oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) + { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m= + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); + return; + } + } + } +} + +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! May not be a NUMA-Q system!\n"); + if (mpc->mpc_oemptr) + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, + mpc->mpc_oemsize); +} +#endif /* CONFIG_X86_NUMAQ */ + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + char oem[10]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", + *(u32 *)mpc->mpc_signature); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk(KERN_ERR "SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(oem,mpc->mpc_oem,8); + oem[8]=0; + printk(KERN_INFO "OEM ID: %s ",oem); + + 
memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk("Product ID: %s ",str); + + mps_oem_check(mpc, oem, str); + + printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + + /* + * Save the local APIC address (it might be non-default) -- but only + * if we're not using ACPI. + */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + mpc_record = 0; + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + default: + { + count = mpc->mpc_length; + break; + } + } + ++mpc_record; + } + clustered_apic_check(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = 
mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? 
i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk("???\n"); + printk(KERN_ERR "Unknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. 
+ */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI may be used to obtain the entire SMP configuration or just to + * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. 
We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + unsigned long *bp = phys_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + printk("Error: MPF size\n"); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; + printk(KERN_INFO "found SMP MP-table at %08lx\n", + virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) { + /* + * We cannot access to MPC table to compute + * table size yet, as only few megabytes from + * the bottom is mapped now. + * PC-9800's MPC table places on the very last + * of physical memory; so that simply reserving + * PAGE_SIZE from mpg->mpf_physptr yields BUG() + * in reserve_bootmem. + */ + unsigned long size = PAGE_SIZE; + unsigned long end = max_low_pfn * PAGE_SIZE; + if (mpf->mpf_physptr + size > end) + size = end - mpf->mpf_physptr; + reserve_bootmem(mpf->mpf_physptr, size); + } + + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_smp_config (void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... 
+ * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +} + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_BOOT + +void __init mp_register_lapic_address ( + u64 address) +{ + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + + +void __init mp_register_lapic ( + u8 id, + u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? 
CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + + MP_processor_info(&processor); +} + +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_ACPI_BOOT)) + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_base; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + + +static int mp_find_ioapic ( + int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + + +void __init mp_register_ioapic ( + u8 id, + u32 address, + u32 gsi_base) +{ + int idx = 0; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
+ */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); + + return; +} + + +void __init mp_override_legacy_irq ( + u8 bus_irq, + u8 polarity, + u8 trigger, + u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + + return; +} + +int es7000_plat; + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). 
+ */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overriden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } +} + +int mp_register_gsi (u32 gsi, int edge_level, int active_high_low) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + +#ifdef CONFIG_ACPI_BUS + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_fadt.sci_int == gsi) + return gsi; +#endif + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + 
printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + return gsi; + } + + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, + active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + +#endif /*CONFIG_X86_IO_APIC && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT)*/ +#endif /*CONFIG_ACPI_BOOT*/ diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c new file mode 100644 index 000000000000..05d9f8f363a6 --- /dev/null +++ b/arch/i386/kernel/msr.c @@ -0,0 +1,346 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * msr.c + * + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#include <linux/module.h> +#include <linux/config.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/major.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/notifier.h> + +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/system.h> + +static struct class_simple *msr_class; + +/* Note: "err" is handled in a funny way below. Otherwise one version + of gcc or another breaks. 
*/ + +static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) +{ + int err; + + asm volatile ("1: wrmsr\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %4,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" " .long 1b,3b\n" ".previous":"=&bDS" (err) + :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); + + return err; +} + +static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) +{ + int err; + + asm volatile ("1: rdmsr\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %4,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,3b\n" + ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) + :"c"(reg), "i"(-EIO), "0"(0)); + + return err; +} + +#ifdef CONFIG_SMP + +struct msr_command { + int cpu; + int err; + u32 reg; + u32 data[2]; +}; + +static void msr_smp_wrmsr(void *cmd_block) +{ + struct msr_command *cmd = (struct msr_command *)cmd_block; + + if (cmd->cpu == smp_processor_id()) + cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); +} + +static void msr_smp_rdmsr(void *cmd_block) +{ + struct msr_command *cmd = (struct msr_command *)cmd_block; + + if (cmd->cpu == smp_processor_id()) + cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); +} + +static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) +{ + struct msr_command cmd; + int ret; + + preempt_disable(); + if (cpu == smp_processor_id()) { + ret = wrmsr_eio(reg, eax, edx); + } else { + cmd.cpu = cpu; + cmd.reg = reg; + cmd.data[0] = eax; + cmd.data[1] = edx; + + smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); + ret = cmd.err; + } + preempt_enable(); + return ret; +} + +static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) +{ + struct msr_command cmd; + int ret; + + preempt_disable(); + if (cpu == smp_processor_id()) { + ret = rdmsr_eio(reg, eax, edx); + } else { + cmd.cpu = cpu; + cmd.reg = reg; + + smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); + + *eax = cmd.data[0]; + *edx = cmd.data[1]; + + ret = 
cmd.err; + } + preempt_enable(); + return ret; +} + +#else /* ! CONFIG_SMP */ + +static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) +{ + return wrmsr_eio(reg, eax, edx); +} + +static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) +{ + return rdmsr_eio(reg, eax, edx); +} + +#endif /* ! CONFIG_SMP */ + +static loff_t msr_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret = -EINVAL; + + lock_kernel(); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + } + unlock_kernel(); + return ret; +} + +static ssize_t msr_read(struct file *file, char __user * buf, + size_t count, loff_t * ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + size_t rv; + u32 reg = *ppos; + int cpu = iminor(file->f_dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (rv = 0; count; count -= 8) { + err = do_rdmsr(cpu, reg, &data[0], &data[1]); + if (err) + return err; + if (copy_to_user(tmp, &data, 8)) + return -EFAULT; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static ssize_t msr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + size_t rv; + u32 reg = *ppos; + int cpu = iminor(file->f_dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (rv = 0; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) + return -EFAULT; + err = do_wrmsr(cpu, reg, data[0], data[1]); + if (err) + return err; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static int msr_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file->f_dentry->d_inode); + struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + + if (cpu >= NR_CPUS || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + if (!cpu_has(c, X86_FEATURE_MSR)) + return -EIO; /* MSR not supported 
*/ + + return 0; +} + +/* + * File operations we support + */ +static struct file_operations msr_fops = { + .owner = THIS_MODULE, + .llseek = msr_seek, + .read = msr_read, + .write = msr_write, + .open = msr_open, +}; + +static int msr_class_simple_device_add(int i) +{ + int err = 0; + struct class_device *class_err; + + class_err = class_simple_device_add(msr_class, MKDEV(MSR_MAJOR, i), NULL, "msr%d",i); + if (IS_ERR(class_err)) + err = PTR_ERR(class_err); + return err; +} + +static int __devinit msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + msr_class_simple_device_add(cpu); + break; + case CPU_DEAD: + class_simple_device_remove(MKDEV(MSR_MAJOR, cpu)); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block msr_class_cpu_notifier = +{ + .notifier_call = msr_class_cpu_callback, +}; + +static int __init msr_init(void) +{ + int i, err = 0; + i = 0; + + if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + printk(KERN_ERR "msr: unable to get major %d for msr\n", + MSR_MAJOR); + err = -EBUSY; + goto out; + } + msr_class = class_simple_create(THIS_MODULE, "msr"); + if (IS_ERR(msr_class)) { + err = PTR_ERR(msr_class); + goto out_chrdev; + } + for_each_online_cpu(i) { + err = msr_class_simple_device_add(i); + if (err != 0) + goto out_class; + } + register_cpu_notifier(&msr_class_cpu_notifier); + + err = 0; + goto out; + +out_class: + i = 0; + for_each_online_cpu(i) + class_simple_device_remove(MKDEV(MSR_MAJOR, i)); + class_simple_destroy(msr_class); +out_chrdev: + unregister_chrdev(MSR_MAJOR, "cpu/msr"); +out: + return err; +} + +static void __exit msr_exit(void) +{ + int cpu = 0; + for_each_online_cpu(cpu) + class_simple_device_remove(MKDEV(MSR_MAJOR, cpu)); + class_simple_destroy(msr_class); + unregister_chrdev(MSR_MAJOR, "cpu/msr"); + unregister_cpu_notifier(&msr_class_cpu_notifier); +} + +module_init(msr_init); 
+module_exit(msr_exit) + +MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); +MODULE_DESCRIPTION("x86 generic MSR driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c new file mode 100644 index 000000000000..f5b0c5081bd6 --- /dev/null +++ b/arch/i386/kernel/nmi.c @@ -0,0 +1,570 @@ +/* + * linux/arch/i386/nmi.c + * + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar <mingo@redhat.com> + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/kernel_stat.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/sysdev.h> +#include <linux/sysctl.h> + +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/nmi.h> + +#include "mach_traps.h" + +unsigned int nmi_watchdog = NMI_NONE; +extern int unknown_nmi_panic; +static unsigned int nmi_hz = HZ; +static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +static unsigned int nmi_p4_cccr_val; +extern void show_registers(struct pt_regs *regs); + +/* + * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: + * - it may be reserved by some other driver, or not + * - when not reserved by some other driver, it may be used for + * the NMI watchdog, or not + * + * This is maintained separately from nmi_active because the NMI + * watchdog may also be driven from the I/O APIC timer. 
+ */ +static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static unsigned int lapic_nmi_owner; +#define LAPIC_NMI_WATCHDOG (1<<0) +#define LAPIC_NMI_RESERVED (1<<1) + +/* nmi_active: + * +1: the lapic NMI watchdog is active, but can be disabled + * 0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * -1: the lapic NMI watchdog is disabled, but can be enabled + */ +int nmi_active; + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + +#define MSR_P4_MISC_ENABLE 0x1A0 +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) +#define MSR_P4_PERFCTR0 0x300 +#define MSR_P4_CCCR0 0x360 +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. 
[IA32-Vol3, Section 14.9.9] */ +#define MSR_P4_IQ_COUNTER0 0x30C +#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) +#define P4_NMI_IQ_CCCR0 \ + (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ + P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) + +int __init check_nmi_watchdog (void) +{ + unsigned int prev_nmi_count[NR_CPUS]; + int cpu; + + printk(KERN_INFO "testing NMI watchdog ... "); + + for (cpu = 0; cpu < NR_CPUS; cpu++) + prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; + local_irq_enable(); + mdelay((10*1000)/nmi_hz); // wait 10 ticks + + /* FIXME: Only boot CPU is online at this stage. Check CPUs + as they come up. */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_SMP + /* Check cpu_callin_map here because that is set + after the timer is started. */ + if (!cpu_isset(cpu, cpu_callin_map)) + continue; +#endif + if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + printk("CPU#%d: NMI appears to be stuck!\n", cpu); + nmi_active = 0; + lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; + return -1; + } + } + printk("OK.\n"); + + /* now that we know it works we can reduce NMI frequency to + something more reasonable; makes a difference in some configs */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = 1; + + return 0; +} + +static int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + get_option(&str, &nmi); + + if (nmi >= NMI_INVALID) + return 0; + if (nmi == NMI_NONE) + nmi_watchdog = nmi; + /* + * If any other x86 CPU has a local APIC, then + * please test the NMI stuff there and send me the + * missing bits. Right now Intel P6/P4 and AMD K7 only. 
+ */ + if ((nmi == NMI_LOCAL_APIC) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15)) + nmi_watchdog = nmi; + if ((nmi == NMI_LOCAL_APIC) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15)) + nmi_watchdog = nmi; + /* + * We can enable the IO-APIC watchdog + * unconditionally. + */ + if (nmi == NMI_IO_APIC) { + nmi_active = 1; + nmi_watchdog = nmi; + } + return 1; +} + +__setup("nmi_watchdog=", setup_nmi_watchdog); + +static void disable_lapic_nmi_watchdog(void) +{ + if (nmi_active <= 0) + return; + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + wrmsr(MSR_K7_EVNTSEL0, 0, 0); + break; + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + break; + + wrmsr(MSR_P6_EVNTSEL0, 0, 0); + break; + case 15: + if (boot_cpu_data.x86_model > 0x3) + break; + + wrmsr(MSR_P4_IQ_CCCR0, 0, 0); + wrmsr(MSR_P4_CRU_ESCR0, 0, 0); + break; + } + break; + } + nmi_active = -1; + /* tell do_nmi() and others that we're not active any more */ + nmi_watchdog = 0; +} + +static void enable_lapic_nmi_watchdog(void) +{ + if (nmi_active < 0) { + nmi_watchdog = NMI_LOCAL_APIC; + setup_apic_nmi_watchdog(); + } +} + +int reserve_lapic_nmi(void) +{ + unsigned int old_owner; + + spin_lock(&lapic_nmi_owner_lock); + old_owner = lapic_nmi_owner; + lapic_nmi_owner |= LAPIC_NMI_RESERVED; + spin_unlock(&lapic_nmi_owner_lock); + if (old_owner & LAPIC_NMI_RESERVED) + return -EBUSY; + if (old_owner & LAPIC_NMI_WATCHDOG) + disable_lapic_nmi_watchdog(); + return 0; +} + +void release_lapic_nmi(void) +{ + unsigned int new_owner; + + spin_lock(&lapic_nmi_owner_lock); + new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; + lapic_nmi_owner = new_owner; + spin_unlock(&lapic_nmi_owner_lock); + if (new_owner & LAPIC_NMI_WATCHDOG) + enable_lapic_nmi_watchdog(); +} + +void disable_timer_nmi_watchdog(void) +{ + if ((nmi_watchdog != NMI_IO_APIC) || 
(nmi_active <= 0)) + return; + + unset_nmi_callback(); + nmi_active = -1; + nmi_watchdog = NMI_NONE; +} + +void enable_timer_nmi_watchdog(void) +{ + if (nmi_active < 0) { + nmi_watchdog = NMI_IO_APIC; + touch_nmi_watchdog(); + nmi_active = 1; + } +} + +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, u32 state) +{ + nmi_pm_active = nmi_active; + disable_lapic_nmi_watchdog(); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + if (nmi_pm_active > 0) + enable_lapic_nmi_watchdog(); + return 0; +} + + +static struct sysdev_class nmi_sysclass = { + set_kset_name("lapic_nmi"), + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} +/* must come after the local APIC's device_initcall() */ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +/* + * Activate the NMI watchdog via the local APIC. + * Original code written by Keith Owens. 
+ */ + +static void clear_msr_range(unsigned int base, unsigned int n) +{ + unsigned int i; + + for(i = 0; i < n; ++i) + wrmsr(base+i, 0, 0); +} + +static void setup_k7_watchdog(void) +{ + unsigned int evntsel; + + nmi_perfctr_msr = MSR_K7_PERFCTR0; + + clear_msr_range(MSR_K7_EVNTSEL0, 4); + clear_msr_range(MSR_K7_PERFCTR0, 4); + + evntsel = K7_EVNTSEL_INT + | K7_EVNTSEL_OS + | K7_EVNTSEL_USR + | K7_NMI_EVENT; + + wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); + wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); +} + +static void setup_p6_watchdog(void) +{ + unsigned int evntsel; + + nmi_perfctr_msr = MSR_P6_PERFCTR0; + + clear_msr_range(MSR_P6_EVNTSEL0, 2); + clear_msr_range(MSR_P6_PERFCTR0, 2); + + evntsel = P6_EVNTSEL_INT + | P6_EVNTSEL_OS + | P6_EVNTSEL_USR + | P6_NMI_EVENT; + + wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); + Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); + wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= P6_EVNTSEL0_ENABLE; + wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); +} + +static int setup_p4_watchdog(void) +{ + unsigned int misc_enable, dummy; + + rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) + return 0; + + nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; + nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) + nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; +#endif + + if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) + clear_msr_range(0x3F1, 2); + /* MSR 0x3F0 seems to have a default value of 0xFC00, but current + docs doesn't fully define it, so leave it alone for now. 
*/ + if (boot_cpu_data.x86_model >= 0x3) { + /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ + clear_msr_range(0x3A0, 26); + clear_msr_range(0x3BC, 3); + } else { + clear_msr_range(0x3A0, 31); + } + clear_msr_range(0x3C0, 6); + clear_msr_range(0x3C8, 6); + clear_msr_range(0x3E0, 2); + clear_msr_range(MSR_P4_CCCR0, 18); + clear_msr_range(MSR_P4_PERFCTR0, 18); + + wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); + wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); + Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); + wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); + apic_write(APIC_LVTPC, APIC_DM_NMI); + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + return 1; +} + +void setup_apic_nmi_watchdog (void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) + return; + setup_k7_watchdog(); + break; + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + return; + + setup_p6_watchdog(); + break; + case 15: + if (boot_cpu_data.x86_model > 0x3) + return; + + if (!setup_p4_watchdog()) + return; + break; + default: + return; + } + break; + default: + return; + } + lapic_nmi_owner = LAPIC_NMI_WATCHDOG; + nmi_active = 1; +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up + * here too!] 
+ */ + +static unsigned int + last_irq_sums [NR_CPUS], + alert_counter [NR_CPUS]; + +void touch_nmi_watchdog (void) +{ + int i; + + /* + * Just reset the alert counters, (other CPUs might be + * spinning on locks we hold): + */ + for (i = 0; i < NR_CPUS; i++) + alert_counter[i] = 0; +} + +extern void die_nmi(struct pt_regs *, const char *msg); + +void nmi_watchdog_tick (struct pt_regs * regs) +{ + + /* + * Since current_thread_info()-> is always on the stack, and we + * always switch the stack NMI-atomically, it's safe to use + * smp_processor_id(). + */ + int sum, cpu = smp_processor_id(); + + sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + + if (last_irq_sums[cpu] == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + alert_counter[cpu]++; + if (alert_counter[cpu] == 5*nmi_hz) + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } else { + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } + if (nmi_perfctr_msr) { + if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. 
+ */ + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { + /* Only P6 based Pentium M need to re-unmask + * the apic vector but it doesn't hurt + * other P6 variant */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); + } +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + if (!(reason & 0xc0)) { + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(regs, buf); + } + return 0; +} + +/* + * proc handler for /proc/sys/kernel/unknown_nmi_panic + */ +int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + old_state = unknown_nmi_panic; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!unknown_nmi_panic) + return 0; + + if (unknown_nmi_panic) { + if (reserve_lapic_nmi() < 0) { + unknown_nmi_panic = 0; + return -EBUSY; + } else { + set_nmi_callback(unknown_nmi_panic_callback); + } + } else { + release_lapic_nmi(); + unset_nmi_callback(); + } + return 0; +} + +#endif + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(reserve_lapic_nmi); +EXPORT_SYMBOL(release_lapic_nmi); +EXPORT_SYMBOL(disable_timer_nmi_watchdog); +EXPORT_SYMBOL(enable_timer_nmi_watchdog); diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c new file mode 100644 index 000000000000..e51edf0a6564 --- /dev/null +++ b/arch/i386/kernel/numaq.c @@ -0,0 +1,79 @@ +/* + * Written by: Patricia Gaughen, IBM Corporation + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to <gone@us.ibm.com> + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/bootmem.h> +#include <linux/mmzone.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <asm/numaq.h> +#include <asm/topology.h> + +#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) + +/* + * Function: smp_dump_qct() + * + * Description: gets memory layout from the quad config table. This + * function also updates node_online_map with the nodes (quads) present. 
+ */ +static void __init smp_dump_qct(void) +{ + int node; + struct eachquadmem *eq; + struct sys_cfg_data *scd = + (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR); + + nodes_clear(node_online_map); + for_each_node(node) { + if (scd->quads_present31_0 & (1 << node)) { + node_set_online(node); + eq = &scd->eq[node]; + /* Convert to pages */ + node_start_pfn[node] = MB_TO_PAGES( + eq->hi_shrd_mem_start - eq->priv_mem_size); + node_end_pfn[node] = MB_TO_PAGES( + eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); + + memory_present(node, + node_start_pfn[node], node_end_pfn[node]); + node_remap_size[node] = node_memmap_size_bytes(node, + node_start_pfn[node], + node_end_pfn[node]); + } + } +} + +/* + * Unlike Summit, we don't really care to let the NUMA-Q + * fall back to flat mode. Don't compile for NUMA-Q + * unless you really need it! + */ +int __init get_memcfg_numaq(void) +{ + smp_dump_qct(); + return 1; +} diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c new file mode 100644 index 000000000000..4de2e03c7b45 --- /dev/null +++ b/arch/i386/kernel/pci-dma.c @@ -0,0 +1,147 @@ +/* + * Dynamic DMA mapping support. + * + * On i386 there is no hardware dynamic DMA address translation, + * so consistent alloc/free are merely page allocation/freeing. + * The rest of the dynamic DMA mapping interface is implemented + * in asm/pci.h. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <asm/io.h> + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, unsigned int __nocast gfp) +{ + void *ret; + struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; + int order = get_order(size); + /* ignore region specifiers */ + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + ret = mem->virt_base + (page << PAGE_SHIFT); + memset(ret, 0, size); + return ret; + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + return NULL; + } + + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) + gfp |= GFP_DMA; + + ret = (void *)__get_free_pages(gfp, order); + + if (ret != NULL) { + memset(ret, 0, size); + *dma_handle = virt_to_phys(ret); + } + return ret; +} + +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + } else + free_pages((unsigned long)vaddr, order); +} + +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base; + int pages = size >> PAGE_SHIFT; + int bitmap_size = (pages + 31)/32; + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); + dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + memset(dev->dma_mem->bitmap, 0, bitmap_size); + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = 
device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem->bitmap); + out: + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if(!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; + int pos, err; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c new file mode 100644 index 000000000000..c36fedf40e95 --- /dev/null +++ b/arch/i386/kernel/process.c @@ -0,0 +1,848 @@ +/* + * linux/arch/i386/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. 
+ */ + +#include <stdarg.h> + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/user.h> +#include <linux/a.out.h> +#include <linux/interrupt.h> +#include <linux/config.h> +#include <linux/utsname.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/mc146818rtc.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/ptrace.h> +#include <linux/random.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/irq.h> +#include <asm/desc.h> +#ifdef CONFIG_MATH_EMULATION +#include <asm/math_emu.h> +#endif + +#include <linux/irq.h> +#include <linux/err.h> + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); + +static int hlt_counter; + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.esp)[3]; +} + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +void disable_hlt(void) +{ + hlt_counter++; +} + +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + hlt_counter--; +} + +EXPORT_SYMBOL(enable_hlt); + +/* + * We use this if we don't have any better + * idle routine.. + */ +void default_idle(void) +{ + if (!hlt_counter && boot_cpu_data.hlt_works_ok) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + else + local_irq_enable(); + } else { + cpu_relax(); + } +} + +/* + * On SMP it's slightly faster (but much more power-consuming!) 
+ * to poll the ->work.need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle (void) +{ + int oldval; + + local_irq_enable(); + + /* + * Deal with another CPU just having chosen a thread to + * run here: + */ + oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); + + if (!oldval) { + set_thread_flag(TIF_POLLING_NRFLAG); + asm volatile( + "2:" + "testl %0, %1;" + "rep; nop;" + "je 2b;" + : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + + clear_thread_flag(TIF_POLLING_NRFLAG); + } else { + set_need_resched(); + } +} + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle (void) +{ + /* endless idle loop with no priority at all */ + while (1) { + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + + rmb(); + idle = pm_idle; + + if (!idle) + idle = default_idle; + + __get_cpu_var(irq_stat).idle_timestamp = jiffies; + idle(); + } + schedule(); + } +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. 
+ * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + */ +static void mwait_idle(void) +{ + local_irq_enable(); + + if (!need_resched()) { + set_thread_flag(TIF_POLLING_NRFLAG); + do { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (need_resched()) + break; + __mwait(0, 0); + } while (!need_resched()); + clear_thread_flag(TIF_POLLING_NRFLAG); + } +} + +void __init select_idle_routine(const struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_MWAIT)) { + printk("monitor/mwait feature present.\n"); + /* + * Skip, if setup has overridden idle. + * One CPU supports mwait => All CPUs supports mwait + */ + if (!pm_idle) { + printk("using mwait in idle threads.\n"); + pm_idle = mwait_idle; + } + } +} + +static int __init idle_setup (char *str) +{ + if (!strncmp(str, "poll", 4)) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; +#ifdef CONFIG_X86_SMP + if (smp_num_siblings > 1) + printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); +#endif + } else if (!strncmp(str, "halt", 4)) { + printk("using halt in idle threads.\n"); + pm_idle = default_idle; + } + + boot_option_idle_override = 1; + return 1; +} + +__setup("idle=", idle_setup); + +void show_regs(struct pt_regs * regs) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + + printk("\n"); + printk("Pid: %d, comm: %20s\n", current->pid, current->comm); + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); + print_symbol("EIP is at %s\n", regs->eip); + + if (regs->xcs & 3) + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); + printk(" EFLAGS: %08lx %s (%s)\n", + regs->eflags, print_tainted(), system_utsname.release); + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + regs->eax,regs->ebx,regs->ecx,regs->edx); + printk("ESI: %08lx EDI: %08lx EBP: %08lx", + regs->esi, regs->edi, 
regs->ebp); + printk(" DS: %04x ES: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes); + + __asm__("movl %%cr0, %0": "=r" (cr0)); + __asm__("movl %%cr2, %0": "=r" (cr2)); + __asm__("movl %%cr3, %0": "=r" (cr3)); + /* This could fault if %cr4 does not exist */ + __asm__("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (cr4): "0" (0)); + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + show_trace(NULL, ®s->esp); +} + +/* + * This gets run with %ebx containing the + * function to call, and %edx containing + * the "args". + */ +extern void kernel_thread_helper(void); +__asm__(".section .text\n" + ".align 4\n" + "kernel_thread_helper:\n\t" + "movl %edx,%eax\n\t" + "pushl %edx\n\t" + "call *%ebx\n\t" + "pushl %eax\n\t" + "call do_exit\n" + ".previous"); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; + regs.edx = (unsigned long) arg; + + regs.xds = __USER_DS; + regs.xes = __USER_DS; + regs.orig_eax = -1; + regs.eip = (unsigned long) kernel_thread_helper; + regs.xcs = __KERNEL_CS; + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); +} + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + struct task_struct *tsk = current; + struct thread_struct *t = &tsk->thread; + + /* The process may have allocated an io port bitmap... nuke it. 
*/ + if (unlikely(NULL != t->io_bitmap_ptr)) { + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + /* + * Careful, clear this in the TSS too: + */ + memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); + t->io_bitmap_max = 0; + tss->io_bitmap_owner = NULL; + tss->io_bitmap_max = 0; + tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + put_cpu(); + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + // temporary debugging check + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } + + release_vm86_irqs(dead_task); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. 
+ */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + struct pt_regs * childregs; + struct task_struct *tsk; + int err; + + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; + *childregs = *regs; + childregs->eax = 0; + childregs->esp = esp; + + p->thread.esp = (unsigned long) childregs; + p->thread.esp0 = (unsigned long) (childregs+1); + + p->thread.eip = (unsigned long) ret_from_fork; + + savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); + + tsk = current; + if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { + struct desc_struct *desc; + struct user_desc info; + int idx; + + err = -EFAULT; + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) + goto out; + err = -EINVAL; + if (LDT_empty(&info)) + goto out; + + idx = info.entry_number; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + goto out; + + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + + err = 0; + out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +/* + * fill in the user structure for a core dump.. + */ +void dump_thread(struct pt_regs * regs, struct user * dump) +{ + int i; + +/* changed the size calculations - should hopefully work better. 
lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_ssize = 0; + for (i = 0; i < 8; i++) + dump->u_debugreg[i] = current->thread.debugreg[i]; + + if (dump->start_stack < TASK_SIZE) + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; + + dump->regs.ebx = regs->ebx; + dump->regs.ecx = regs->ecx; + dump->regs.edx = regs->edx; + dump->regs.esi = regs->esi; + dump->regs.edi = regs->edi; + dump->regs.ebp = regs->ebp; + dump->regs.eax = regs->eax; + dump->regs.ds = regs->xds; + dump->regs.es = regs->xes; + savesegment(fs,dump->regs.fs); + savesegment(gs,dump->regs.gs); + dump->regs.orig_eax = regs->orig_eax; + dump->regs.eip = regs->eip; + dump->regs.cs = regs->xcs; + dump->regs.eflags = regs->eflags; + dump->regs.esp = regs->esp; + dump->regs.ss = regs->xss; + + dump->u_fpvalid = dump_fpu (regs, &dump->i387); +} + +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs ptregs; + + ptregs = *(struct pt_regs *) + ((unsigned long)tsk->thread_info+THREAD_SIZE - sizeof(ptregs)); + ptregs.xcs &= 0xffff; + ptregs.xds &= 0xffff; + ptregs.xes &= 0xffff; + ptregs.xss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + return 1; +} + +static inline void +handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) +{ + if (!next->io_bitmap_ptr) { + /* + * Disable the bitmap via an invalid offset. 
We still cache + * the previous bitmap owner and the IO bitmap contents: + */ + tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + return; + } + if (likely(next == tss->io_bitmap_owner)) { + /* + * Previous owner of the bitmap (hence the bitmap content) + * matches the next task, we dont have to do anything but + * to set a valid offset in the TSS: + */ + tss->io_bitmap_base = IO_BITMAP_OFFSET; + return; + } + /* + * Lazy TSS's I/O bitmap copy. We set an invalid offset here + * and we let the task to get a GPF in case an I/O instruction + * is performed. The handler of the GPF will verify that the + * faulting task has a valid I/O bitmap and, it true, does the + * real copy and restart the instruction. This will save us + * redundant copies when the currently switched task does not + * perform any I/O during its timeslice. + */ + tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; +} +/* + * This special macro can be used to load a debugging register + */ +#define loaddebug(thread,register) \ + __asm__("movl %0,%%db" #register \ + : /* no output */ \ + :"r" (thread->debugreg[register])) + +/* + * switch_to(x,yn) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. + * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. 
However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %eax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + __unlazy_fpu(prev_p); + + /* + * Reload esp0, LDT and the page table pointer: + */ + load_esp0(tss, next); + + /* + * Load the per-thread Thread-Local Storage descriptor. + */ + load_TLS(next, cpu); + + /* + * Save away %fs and %gs. No need to save %es and %ds, as + * those are always kernel segments while inside the kernel. + */ + asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs)); + asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs)); + + /* + * Restore %fs and %gs if needed. 
+ */ + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); + } + + /* + * Now maybe reload the debug registers + */ + if (unlikely(next->debugreg[7])) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) + handle_io_bitmap(next, tss); + + return prev_p; +} + +asmlinkage int sys_fork(struct pt_regs regs) +{ + return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +asmlinkage int sys_clone(struct pt_regs regs) +{ + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr, *child_tidptr; + + clone_flags = regs.ebx; + newsp = regs.ecx; + parent_tidptr = (int __user *)regs.edx; + child_tidptr = (int __user *)regs.edi; + if (!newsp) + newsp = regs.esp; + return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +asmlinkage int sys_vfork(struct pt_regs regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +/* + * sys_execve() executes a new program. 
+ */ +asmlinkage int sys_execve(struct pt_regs regs) +{ + int error; + char * filename; + + filename = getname((char __user *) regs.ebx); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + goto out; + error = do_execve(filename, + (char __user * __user *) regs.ecx, + (char __user * __user *) regs.edx, + ®s); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + /* Make sure we don't return using sysenter.. */ + set_thread_flag(TIF_IRET); + } + putname(filename); +out: + return error; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ebp, esp, eip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)p->thread_info; + esp = p->thread.esp; + if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + ebp = *(unsigned long *) esp; + do { + if (ebp < stack_page || ebp > top_ebp+stack_page) + return 0; + eip = *(unsigned long *) (ebp+4); + if (!in_sched_functions(eip)) + return eip; + ebp = *(unsigned long *) ebp; + } while (count++ < 16); + return 0; +} + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. 
+ */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(t->tls_array + idx)) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +/* + * Set a given TLS descriptor: + */ +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +{ + struct thread_struct *t = ¤t->thread; + struct user_desc info; + struct desc_struct *desc; + int cpu, idx; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + load_TLS(t, cpu); + + put_cpu(); + + return 0; +} + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +{ + struct user_desc info; + struct desc_struct *desc; + int idx; + + if (get_user(idx, &u_info->entry_number)) + return -EFAULT; + if 
(idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c new file mode 100644 index 000000000000..b2f17640ceff --- /dev/null +++ b/arch/i386/kernel/ptrace.c @@ -0,0 +1,717 @@ +/* ptrace.c */ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/user.h> +#include <linux/security.h> +#include <linux/audit.h> +#include <linux/seccomp.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/debugreg.h> +#include <asm/ldt.h> +#include <asm/desc.h> + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* determines which flags the user has access to. */ +/* 1 = access 0 = no access */ +#define FLAG_MASK 0x00044dd5 + +/* set's the trap flag. */ +#define TRAP_FLAG 0x100 + +/* + * Offset of eflags on child stack.. 
+ */ +#define EFL_OFFSET ((EFL-2)*4-sizeof(struct pt_regs)) + +static inline struct pt_regs *get_child_regs(struct task_struct *task) +{ + void *stack_top = (void *)task->thread.esp0; + return stack_top - sizeof(struct pt_regs); +} + +/* + * this routine will get a word off of the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline int get_stack_long(struct task_struct *task, int offset) +{ + unsigned char *stack; + + stack = (unsigned char *)task->thread.esp0; + stack += offset; + return (*((int *)stack)); +} + +/* + * this routine will put a word on the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline int put_stack_long(struct task_struct *task, int offset, + unsigned long data) +{ + unsigned char * stack; + + stack = (unsigned char *) task->thread.esp0; + stack += offset; + *(unsigned long *) stack = data; + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long regno, unsigned long value) +{ + switch (regno >> 2) { + case FS: + if (value && (value & 3) != 3) + return -EIO; + child->thread.fs = value; + return 0; + case GS: + if (value && (value & 3) != 3) + return -EIO; + child->thread.gs = value; + return 0; + case DS: + case ES: + if (value && (value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case SS: + case CS: + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case EFL: + value &= FLAG_MASK; + value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; + break; + } + if (regno > GS*4) + regno -= 2*4; + put_stack_long(child, regno - sizeof(struct pt_regs), value); + return 0; +} + +static unsigned long getreg(struct task_struct *child, + unsigned long regno) +{ + unsigned long retval = ~0UL; + + switch (regno >> 2) { + 
case FS: + retval = child->thread.fs; + break; + case GS: + retval = child->thread.gs; + break; + case DS: + case ES: + case SS: + case CS: + retval = 0xffff; + /* fall through */ + default: + if (regno > GS*4) + regno -= 2*4; + regno = regno - sizeof(struct pt_regs); + retval &= get_stack_long(child, regno); + } + return retval; +} + +#define LDT_SEGMENT 4 + +static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->eip; + seg = regs->xcs & 0xffff; + if (regs->eflags & VM_MASK) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if (seg & LDT_SEGMENT) { + u32 *desc; + unsigned long base; + + down(&child->mm->context.sem); + desc = child->mm->context.ldt + (seg & ~7); + base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + up(&child->mm->context.sem); + } + return addr; +} + +static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[16]; + unsigned long addr = convert_eip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf */ + case 0x9d: + return 1; + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. 
But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +static void set_singlestep(struct task_struct *child) +{ + struct pt_regs *regs = get_child_regs(child); + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* + * If TF was already set, don't do anything else + */ + if (regs->eflags & TRAP_FLAG) + return; + + /* Set TF on the kernel stack.. */ + regs->eflags |= TRAP_FLAG; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + */ + if (is_at_popf(child, regs)) + return; + + child->ptrace |= PT_DTRACE; +} + +static void clear_singlestep(struct task_struct *child) +{ + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (child->ptrace & PT_DTRACE) { + struct pt_regs *regs = get_child_regs(child); + regs->eflags &= ~TRAP_FLAG; + child->ptrace &= ~PT_DTRACE; + } +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + clear_singlestep(child); +} + +/* + * Perform get_thread_area on behalf of the traced child. 
+ */ +static int +ptrace_get_thread_area(struct task_struct *child, + int idx, struct user_desc __user *user_desc) +{ + struct user_desc info; + struct desc_struct *desc; + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(user_desc, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +/* + * Perform set_thread_area on behalf of the traced child. 
+ */ +static int +ptrace_set_thread_area(struct task_struct *child, + int idx, struct user_desc __user *user_desc) +{ + struct user_desc info; + struct desc_struct *desc; + + if (copy_from_user(&info, user_desc, sizeof(info))) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + + return 0; +} + +asmlinkage int sys_ptrace(long request, long pid, long addr, long data) +{ + struct task_struct *child; + struct user * dummy = NULL; + int i, ret; + unsigned long __user *datap = (unsigned long __user *)data; + + lock_kernel(); + ret = -EPERM; + if (request == PTRACE_TRACEME) { + /* are we already being traced? */ + if (current->ptrace & PT_PTRACED) + goto out; + ret = security_ptrace(current->parent, current); + if (ret) + goto out; + /* set the ptrace bit in the process flags. */ + current->ptrace |= PT_PTRACED; + ret = 0; + goto out; + } + ret = -ESRCH; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); + if (!child) + goto out; + + ret = -EPERM; + if (pid == 1) /* you may not mess with init */ + goto out_tsk; + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + goto out_tsk; + } + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out_tsk; + + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ + case PTRACE_PEEKDATA: { + unsigned long tmp; + int copied; + + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + ret = -EIO; + if (copied != sizeof(tmp)) + break; + ret = put_user(tmp, datap); + break; + } + + /* read the word at location addr in the USER area. 
*/ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + break; + + tmp = 0; /* Default return condition */ + if(addr < FRAME_SIZE*sizeof(long)) + tmp = getreg(child, addr); + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + addr -= (long) &dummy->u_debugreg[0]; + addr = addr >> 2; + tmp = child->thread.debugreg[addr]; + } + ret = put_user(tmp, datap); + break; + } + + /* when I and D space are separate, this will have to be fixed. */ + case PTRACE_POKETEXT: /* write the word at location addr. */ + case PTRACE_POKEDATA: + ret = 0; + if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) + break; + ret = -EIO; + break; + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + break; + + if (addr < FRAME_SIZE*sizeof(long)) { + ret = putreg(child, addr, data); + break; + } + /* We need to be very careful here. We implicitly + want to modify a portion of the task_struct, and we + have to be selective about what portions we allow someone + to modify. */ + + ret = -EIO; + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + + if(addr == (long) &dummy->u_debugreg[4]) break; + if(addr == (long) &dummy->u_debugreg[5]) break; + if(addr < (long) &dummy->u_debugreg[4] && + ((unsigned long) data) >= TASK_SIZE-3) break; + + /* Sanity-check data. Take one half-byte at once with + * check = (val >> (16 + 4*i)) & 0xf. It contains the + * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits + * 2 and 3 are LENi. Given a list of invalid values, + * we do mask |= 1 << invalid_value, so that + * (mask >> check) & 1 is a correct test for invalid + * values. + * + * R/Wi contains the type of the breakpoint / + * watchpoint, LENi contains the length of the watched + * data in the watchpoint case. 
+ * + * The invalid values are: + * - LENi == 0x10 (undefined), so mask |= 0x0f00. + * - R/Wi == 0x10 (break on I/O reads or writes), so + * mask |= 0x4444. + * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= + * 0x1110. + * + * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. + * + * See the Intel Manual "System Programming Guide", + * 15.2.4 + * + * Note that LENi == 0x10 is defined on x86_64 in long + * mode (i.e. even for 32-bit userspace software, but + * 64-bit kernel), so the x86_64 mask value is 0x5454. + * See the AMD manual no. 24593 (AMD64 System + * Programming)*/ + + if(addr == (long) &dummy->u_debugreg[7]) { + data &= ~DR_CONTROL_RESERVED; + for(i=0; i<4; i++) + if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) + goto out_tsk; + } + + addr -= (long) &dummy->u_debugreg; + addr = addr >> 2; + child->thread.debugreg[addr] = data; + ret = 0; + } + break; + + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: /* restart after signal. */ + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; + if (request == PTRACE_SYSCALL) { + set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } + else { + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } + child->exit_code = data; + /* make sure the single step bit is not set. */ + clear_singlestep(child); + wake_up_process(child); + ret = 0; + break; + +/* + * make the child exit. Best I can do is send it a sigkill. + * perhaps it should be put in the status that it wants to + * exit. + */ + case PTRACE_KILL: + ret = 0; + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ + break; + child->exit_code = SIGKILL; + /* make sure the single step bit is not set. */ + clear_singlestep(child); + wake_up_process(child); + break; + + case PTRACE_SINGLESTEP: /* set the trap flag. */ + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_singlestep(child); + child->exit_code = data; + /* give it a chance to run. 
*/ + wake_up_process(child); + ret = 0; + break; + + case PTRACE_DETACH: + /* detach a process that was attached. */ + ret = ptrace_detach(child, data); + break; + + case PTRACE_GETREGS: { /* Get all gp regs from the child. */ + if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) { + ret = -EIO; + break; + } + for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { + __put_user(getreg(child, i), datap); + datap++; + } + ret = 0; + break; + } + + case PTRACE_SETREGS: { /* Set all gp regs in the child. */ + unsigned long tmp; + if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) { + ret = -EIO; + break; + } + for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { + __get_user(tmp, datap); + putreg(child, i, tmp); + datap++; + } + ret = 0; + break; + } + + case PTRACE_GETFPREGS: { /* Get the child FPU state. */ + if (!access_ok(VERIFY_WRITE, datap, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + ret = 0; + if (!tsk_used_math(child)) + init_fpu(child); + get_fpregs((struct user_i387_struct __user *)data, child); + break; + } + + case PTRACE_SETFPREGS: { /* Set the child FPU state. */ + if (!access_ok(VERIFY_READ, datap, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + set_stopped_child_used_math(child); + set_fpregs(child, (struct user_i387_struct __user *)data); + ret = 0; + break; + } + + case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */ + if (!access_ok(VERIFY_WRITE, datap, + sizeof(struct user_fxsr_struct))) { + ret = -EIO; + break; + } + if (!tsk_used_math(child)) + init_fpu(child); + ret = get_fpxregs((struct user_fxsr_struct __user *)data, child); + break; + } + + case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. 
*/ + if (!access_ok(VERIFY_READ, datap, + sizeof(struct user_fxsr_struct))) { + ret = -EIO; + break; + } + set_stopped_child_used_math(child); + ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data); + break; + } + + case PTRACE_GET_THREAD_AREA: + ret = ptrace_get_thread_area(child, addr, + (struct user_desc __user *) data); + break; + + case PTRACE_SET_THREAD_AREA: + ret = ptrace_set_thread_area(child, addr, + (struct user_desc __user *) data); + break; + + default: + ret = ptrace_request(child, request, addr, data); + break; + } +out_tsk: + put_task_struct(child); +out: + unlock_kernel(); + return ret; +} + +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +{ + struct siginfo info; + + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGTRAP; + info.si_code = TRAP_BRKPT; + + /* User-mode eip? */ + info.si_addr = user_mode(regs) ? (void __user *) regs->eip : NULL; + + /* Send us the fakey SIGTRAP */ + force_sig_info(SIGTRAP, &info, tsk); +} + +/* notification of system call entry/exit + * - triggered by current->work.syscall_trace + */ +__attribute__((regparm(3))) +void do_syscall_trace(struct pt_regs *regs, int entryexit) +{ + /* do the secure computing check first */ + secure_computing(regs->orig_eax); + + if (unlikely(current->audit_context)) { + if (!entryexit) + audit_syscall_entry(current, regs->orig_eax, + regs->ebx, regs->ecx, + regs->edx, regs->esi); + else + audit_syscall_exit(current, regs->eax); + } + + if (!(current->ptrace & PT_PTRACED)) + return; + + /* Fake a debug trap */ + if (test_thread_flag(TIF_SINGLESTEP)) + send_sigtrap(current, regs, 0); + + if (!test_thread_flag(TIF_SYSCALL_TRACE)) + return; + + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 
0x80 : 0)); + + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c new file mode 100644 index 000000000000..aaf89cb2bc51 --- /dev/null +++ b/arch/i386/kernel/quirks.c @@ -0,0 +1,52 @@ +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. + */ +#include <linux/config.h> +#include <linux/pci.h> +#include <linux/irq.h> + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config, rev; + u32 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. + */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) + return; + + printk(KERN_INFO "Intel E7520/7320/7525 detected."); + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + config |= 0x2; + pci_write_config_byte(dev, 0xf4, config); + + /* read xTPR register */ + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + + if (!(word & (1 << 13))) { + printk(KERN_INFO "Disabling irq balancing and affinity\n"); +#ifdef CONFIG_IRQBALANCE + irqbalance_disable(""); +#endif + noirqdebug_setup(""); +#ifdef CONFIG_PROC_FS + no_irq_affinity = 1; +#endif + } + + config &= ~0x2; + /* disable access to config space*/ + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, 
quirk_intel_irqbalance); +#endif diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c new file mode 100644 index 000000000000..3d7e994563df --- /dev/null +++ b/arch/i386/kernel/reboot.c @@ -0,0 +1,382 @@ +/* + * linux/arch/i386/kernel/reboot.c + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/efi.h> +#include <linux/dmi.h> +#include <asm/uaccess.h> +#include <asm/apic.h> +#include "mach_reboot.h" + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); + +static int reboot_mode; +static int reboot_thru_bios; + +#ifdef CONFIG_SMP +int reboot_smp = 0; +static int reboot_cpu = -1; +/* shamelessly grabbed from lib/vsprintf.c for readability */ +#define is_digit(c) ((c) >= '0' && (c) <= '9') +#endif +static int __init reboot_setup(char *str) +{ + while(1) { + switch (*str) { + case 'w': /* "warm" reboot (no memory testing etc) */ + reboot_mode = 0x1234; + break; + case 'c': /* "cold" reboot (with memory testing etc) */ + reboot_mode = 0x0; + break; + case 'b': /* "bios" reboot by jumping through the BIOS */ + reboot_thru_bios = 1; + break; + case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */ + reboot_thru_bios = 0; + break; +#ifdef CONFIG_SMP + case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ + reboot_smp = 1; + if (is_digit(*(str+1))) { + reboot_cpu = (int) (*(str+1) - '0'); + if (is_digit(*(str+2))) + reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); + } + /* we will leave sorting out the final value + when we are ready to reboot, since we might not + have set up boot_cpu_id or smp_num_cpu */ + break; +#endif + } + if((str = strchr(str,',')) != NULL) + str++; + else + break; + } + return 1; +} + +__setup("reboot=", reboot_setup); + +/* + * Reboot options and system auto-detection code provided by + * Dell Inc. so their systems "just work". 
:-) + */ + +/* + * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. + */ +static int __init set_bios_reboot(struct dmi_system_id *d) +{ + if (!reboot_thru_bios) { + reboot_thru_bios = 1; + printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); + } + return 0; +} + +/* + * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. + */ +static int __init set_smp_reboot(struct dmi_system_id *d) +{ +#ifdef CONFIG_SMP + if (!reboot_smp) { + reboot_smp = 1; + printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); + } +#endif + return 0; +} + +/* + * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. + */ +static int __init set_smp_bios_reboot(struct dmi_system_id *d) +{ + set_smp_reboot(d); + set_bios_reboot(d); + return 0; +} + +static struct dmi_system_id __initdata reboot_dmi_table[] = { + { /* Handle problems with rebooting on Dell 1300's */ + .callback = set_smp_bios_reboot, + .ident = "Dell PowerEdge 1300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), + }, + }, + { /* Handle problems with rebooting on Dell 300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), + }, + }, + { /* Handle problems with rebooting on Dell 2400's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 2400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), + }, + }, + { } +}; + +static int __init reboot_init(void) +{ + dmi_check_system(reboot_dmi_table); + return 0; +} + +core_initcall(reboot_init); + +/* The following code and data reboots the machine by switching to real + mode and jumping to the BIOS reset 
/*
 * Descriptor table loaded (via real_mode_gdt) by machine_real_restart()
 * just before dropping back to real mode.  Entry 1 is reached through
 * selector 0x0008 (used for the far jump into the copied 16-bit code)
 * and entry 2 through selector 0x0010 (loaded into ds/es/fs/gs/ss).
 */
static unsigned long long
real_mode_gdt_entries [3] =
{
	0x0000000000000000ULL,	/* Null descriptor */
	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
};
/*
 * Switch to real mode and then execute the code
 * specified by the code and length parameters.
 *
 * @code:   real-mode machine code to run after the mode switch.
 * @length: number of bytes at @code.  It is copied to the last 100
 *          bytes of the first page, so it is assumed to always be at
 *          most 100; this is NOT checked here.
 *
 * Interrupts are disabled on entry and the function is not expected to
 * return: it ends with a far jump into the copied 16-bit code.
 */
void machine_real_restart(unsigned char *code, int length)
{
	unsigned long flags;

	local_irq_disable();

	/* Write zero to CMOS register number 0x0f, which the BIOS POST
	   routine will recognize as telling it to do a proper reboot.  (Well
	   that's what this book in front of me says -- it may only apply to
	   the Phoenix BIOS though, it's not clear).  At the same time,
	   disable NMIs by setting the top bit in the CMOS address register,
	   as we're about to do peculiar things to the CPU.  I'm not sure if
	   `outb_p' is needed instead of just `outb'.  Use it to be on the
	   safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
	 */

	spin_lock_irqsave(&rtc_lock, flags);
	CMOS_WRITE(0x00, 0x8f);
	spin_unlock_irqrestore(&rtc_lock, flags);

	/* Remap the kernel at virtual address zero, as well as offset zero
	   from the kernel segment.  This assumes the kernel segment starts at
	   virtual address PAGE_OFFSET.  */

	memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
		sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);

	/*
	 * Use `swapper_pg_dir' as our page directory.
	 */
	load_cr3(swapper_pg_dir);

	/* Write reboot_mode to absolute memory location 0x472.  A value of
	   0x1234 (the "warm" setting) is read by the BIOS on booting to
	   tell it to "Bypass memory test (also warm boot)".  This seems
	   like a fairly standard thing that gets set by REBOOT.COM
	   programs, and the previous reset routine did this too. */

	*((unsigned short *)0x472) = reboot_mode;

	/* For the switch to real mode, copy some code to low memory.  It has
	   to be in the first 64k because it is running in 16-bit mode, and it
	   has to have the same physical and virtual address, because it turns
	   off paging.  Copy it near the end of the first page, out of the way
	   of BIOS variables.

	   real_mode_switch is placed so that it ends exactly at 0x1000 - 100,
	   which is where the caller's code starts: execution runs off the end
	   of real_mode_switch straight into @code. */

	memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
		real_mode_switch, sizeof (real_mode_switch));
	memcpy ((void *) (0x1000 - 100), code, length);

	/* Set up the IDT for real mode. */

	__asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));

	/* Set up a GDT from which we can load segment descriptors for real
	   mode.  The GDT is not used in real mode; it is just needed here to
	   prepare the descriptors. */

	__asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));

	/* Load the data segment registers, and thus the descriptors ready for
	   real mode.  The base address of each segment is 0x100, 16 times the
	   selector value being loaded here.  This is so that the segment
	   registers don't have to be reloaded after switching to real mode:
	   the values are consistent for real mode operation already. */

	__asm__ __volatile__ ("movl $0x0010,%%eax\n"
				"\tmovl %%eax,%%ds\n"
				"\tmovl %%eax,%%es\n"
				"\tmovl %%eax,%%fs\n"
				"\tmovl %%eax,%%gs\n"
				"\tmovl %%eax,%%ss" : : : "eax");

	/* Jump to the 16-bit code that we copied earlier.  It disables paging
	   and the cache, switches to real mode, and jumps to the BIOS reset
	   entry point. */

	__asm__ __volatile__ ("ljmp $0x0008,%0"
				:
				: "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100)));
}
*/ + __asm__ __volatile__("lidt %0": :"m" (no_idt)); + __asm__ __volatile__("int3"); + } + } + if (efi_enabled) + efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL); + + machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); +} + +EXPORT_SYMBOL(machine_restart); + +void machine_halt(void) +{ +} + +EXPORT_SYMBOL(machine_halt); + +void machine_power_off(void) +{ + lapic_shutdown(); + + if (efi_enabled) + efi.reset_system(EFI_RESET_SHUTDOWN, EFI_SUCCESS, 0, NULL); + if (pm_power_off) + pm_power_off(); +} + +EXPORT_SYMBOL(machine_power_off); + diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c new file mode 100644 index 000000000000..69e203a0d330 --- /dev/null +++ b/arch/i386/kernel/scx200.c @@ -0,0 +1,167 @@ +/* linux/arch/i386/kernel/scx200.c + + Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> + + National Semiconductor SCx200 support. */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/pci.h> + +#include <linux/scx200.h> + +/* Verify that the configuration block really is there */ +#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) + +#define NAME "scx200" + +MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>"); +MODULE_DESCRIPTION("NatSemi SCx200 Driver"); +MODULE_LICENSE("GPL"); + +unsigned scx200_gpio_base = 0; +long scx200_gpio_shadow[2]; + +unsigned scx200_cb_base = 0; + +static struct pci_device_id scx200_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, + { }, +}; +MODULE_DEVICE_TABLE(pci,scx200_tbl); + +static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); + +static struct pci_driver scx200_pci_driver = { + .name = "scx200", + .id_table = 
scx200_tbl, + .probe = scx200_probe, +}; + +static DEFINE_SPINLOCK(scx200_gpio_config_lock); + +static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int bank; + unsigned base; + + if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || + pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { + base = pci_resource_start(pdev, 0); + printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); + + if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { + printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); + return -EBUSY; + } + + scx200_gpio_base = base; + + /* read the current values driven on the GPIO signals */ + for (bank = 0; bank < 2; ++bank) + scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); + + } else { + /* find the base of the Configuration Block */ + if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { + scx200_cb_base = SCx200_CB_BASE_FIXED; + } else { + pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); + if (scx200_cb_probe(base)) { + scx200_cb_base = base; + } else { + printk(KERN_WARNING NAME ": Configuration Block not found\n"); + return -ENODEV; + } + } + printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); + } + + return 0; +} + +u32 scx200_gpio_configure(int index, u32 mask, u32 bits) +{ + u32 config, new_config; + unsigned long flags; + + spin_lock_irqsave(&scx200_gpio_config_lock, flags); + + outl(index, scx200_gpio_base + 0x20); + config = inl(scx200_gpio_base + 0x24); + + new_config = (config & mask) | bits; + outl(new_config, scx200_gpio_base + 0x24); + + spin_unlock_irqrestore(&scx200_gpio_config_lock, flags); + + return config; +} + +#if 0 +void scx200_gpio_dump(unsigned index) +{ + u32 config = scx200_gpio_configure(index, ~0, 0); + printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config); + + if (config & 1) + printk(" OE"); /* output enabled */ + else + printk(" TS"); /* tristate */ + if (config & 2) + printk(" PP"); /* push pull */ + else + 
printk(" OD"); /* open drain */ + if (config & 4) + printk(" PUE"); /* pull up enabled */ + else + printk(" PUD"); /* pull up disabled */ + if (config & 8) + printk(" LOCKED"); /* locked */ + if (config & 16) + printk(" LEVEL"); /* level input */ + else + printk(" EDGE"); /* edge input */ + if (config & 32) + printk(" HI"); /* trigger on rising edge */ + else + printk(" LO"); /* trigger on falling edge */ + if (config & 64) + printk(" DEBOUNCE"); /* debounce */ + printk("\n"); +} +#endif /* 0 */ + +static int __init scx200_init(void) +{ + printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); + + return pci_module_init(&scx200_pci_driver); +} + +static void __exit scx200_cleanup(void) +{ + pci_unregister_driver(&scx200_pci_driver); + release_region(scx200_gpio_base, SCx200_GPIO_SIZE); +} + +module_init(scx200_init); +module_exit(scx200_cleanup); + +EXPORT_SYMBOL(scx200_gpio_base); +EXPORT_SYMBOL(scx200_gpio_shadow); +EXPORT_SYMBOL(scx200_gpio_configure); +EXPORT_SYMBOL(scx200_cb_base); + +/* + Local variables: + compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules" + c-basic-offset: 8 + End: +*/ diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c new file mode 100644 index 000000000000..469f496e55c0 --- /dev/null +++ b/arch/i386/kernel/semaphore.c @@ -0,0 +1,297 @@ +/* + * i386 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
/*
 * Contended path of down(): sleep uninterruptibly until the semaphore
 * can be taken.
 *
 * @sem: the semaphore being acquired.
 *
 * The task queues itself as an exclusive waiter, then loops folding the
 * recorded sleeper count back into sem->count.  sem->sleepers and the
 * wait-queue manipulation are protected by sem->wait.lock, which is
 * dropped only around schedule().
 */
static fastcall void __attribute_used__ __sched __down(struct semaphore * sem)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);
	unsigned long flags;

	/* Mark ourselves sleeping before queueing so a wakeup is not lost. */
	tsk->state = TASK_UNINTERRUPTIBLE;
	spin_lock_irqsave(&sem->wait.lock, flags);
	add_wait_queue_exclusive_locked(&sem->wait, &wait);

	sem->sleepers++;
	for (;;) {
		int sleepers = sem->sleepers;

		/*
		 * Add "everybody else" into it. They aren't
		 * playing, because we own the spinlock in
		 * the wait_queue_head.
		 */
		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
			/* Count is non-negative: we own the semaphore. */
			sem->sleepers = 0;
			break;
		}
		sem->sleepers = 1;	/* us - see -1 above */
		spin_unlock_irqrestore(&sem->wait.lock, flags);

		schedule();

		/* Re-take the lock and re-arm the sleep state before retrying. */
		spin_lock_irqsave(&sem->wait.lock, flags);
		tsk->state = TASK_UNINTERRUPTIBLE;
	}
	remove_wait_queue_locked(&sem->wait, &wait);
	/* Pass a wakeup along to any remaining waiter on the queue. */
	wake_up_locked(&sem->wait);
	spin_unlock_irqrestore(&sem->wait.lock, flags);
	tsk->state = TASK_RUNNING;
}
+ */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_INTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + + tsk->state = TASK_RUNNING; + return retval; +} + +/* + * Trylock failed - make sure we correct for + * having decremented the count. + * + * We could have done the trylock with a + * single "cmpxchg" without failure cases, + * but then it wouldn't work on a 386. + */ +static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem) +{ + int sleepers; + unsigned long flags; + + spin_lock_irqsave(&sem->wait.lock, flags); + sleepers = sem->sleepers + 1; + sem->sleepers = 0; + + /* + * Add "everybody else" and us into it. They aren't + * playing, because we own the spinlock in the + * wait_queue_head. + */ + if (!atomic_add_negative(sleepers, &sem->count)) { + wake_up_locked(&sem->wait); + } + + spin_unlock_irqrestore(&sem->wait.lock, flags); + return 1; +} + + +/* + * The semaphore operations have a special calling sequence that + * allow us to do a simpler in-line version of them. These routines + * need to convert that sequence back into the C sequence when + * there is contention on the semaphore. + * + * %eax contains the semaphore pointer on entry. Save the C-clobbered + * registers (%eax, %edx and %ecx) except %eax whish is either a return + * value or just clobbered.. 
/*
 * Out-of-line fallback for a contended read lock (SMP only).
 * The lock word is addressed through %eax.
 *
 * Sequence, as coded below:
 *   - incl: add one back to the lock word (presumably undoing the
 *     decrement made by the inline fast path -- confirm against the
 *     inline read-lock code, which is not in this file);
 *   - spin with "rep; nop" while (lock word - 1) is negative;
 *   - decl: try to take the lock again; if the result is negative,
 *     start over from the top.
 */
asm(
".section .sched.text\n"
".align 4\n"
".globl __read_lock_failed\n"
"__read_lock_failed:\n\t"
	LOCK "incl (%eax)\n"
"1: rep; nop\n\t"
	"cmpl $1,(%eax)\n\t"
	"js 1b\n\t"
	LOCK "decl (%eax)\n\t"
	"js __read_lock_failed\n\t"
	"ret"
);
/*
 * Resource describing the memory occupied by kernel data.
 * start/end are zero placeholders here; presumably they are filled in
 * during boot before the resource is registered -- TODO confirm against
 * the setup code later in this file.
 */
static struct resource data_resource = {
	.name	= "Kernel data",
	.start	= 0,
	.end	= 0,
	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
};
.flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +#define ADAPTER_ROM_RESOURCES \ + (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = 
/*
 * Validate an option-ROM image by checksum.
 *
 * Adds up the first 'length' bytes at 'rom' in an 8-bit accumulator
 * (so the sum is taken modulo 256) and reports whether the result is
 * zero.  Returns 1 for a good checksum, 0 otherwise.  A zero 'length'
 * sums nothing and therefore reports success.
 */
static int __init romchecksum(unsigned char *rom, unsigned long length)
{
	unsigned char total = 0;
	unsigned long idx;

	for (idx = 0; idx < length; idx++)
		total += rom[idx];

	return !total;
}
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +static void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + int i; + + if (efi_enabled) { + for (i = 0; i < memmap.nr_map; i++) { + current_addr = memmap.map[i].phys_addr + + (memmap.map[i].num_pages << 12); + if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) { + if (current_addr >= size) { + memmap.map[i].num_pages -= + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); + memmap.nr_map = i + 1; + return; + } + } + } + } + for (i = 0; i < e820.nr_map; i++) { + if (e820.map[i].type == E820_RAM) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr >= size) { + e820.map[i].size -= current_addr-size; + e820.nr_map = i + 1; + return; + } + } + } +} + +static void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) +{ + int x; + + if (!efi_enabled) { + x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; + } +} /* add_memory_region */ + +#define E820_DEBUG 1 + +static void __init print_memory_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + e820.map[i].addr, + e820.map[i].addr + e820.map[i].size); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %lu\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; +static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; + +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... 
+ + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; i<old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i=0; i < 2*old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i=0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; /* true number of change-points */ + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if <current_addr> > <last_addr>, swap */ + /* or, if current=<start_addr> & 
last=<end_addr>, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; i<overlap_entries; i++) + { + if (overlap_list[i] == change_point[chgidx]->pbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; i<overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space 
left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. 
+ */ + if (type == E820_RAM) { + if (start < 0x100000ULL && end > 0xA0000ULL) { + if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); + if (end <= 0x100000ULL) + continue; + start = 0x100000ULL; + size = end - start; + } + } + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. + * + */ +static inline void copy_edd(void) +{ + memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); + edd.mbr_signature_nr = EDD_MBR_SIG_NR; + edd.edd_info_nr = EDD_NR; +} +#else +static inline void copy_edd(void) +{ +} +#endif + +/* + * Do NOT EVER look at the BIOS memory size location. + * It does not work on many machines. + */ +#define LOWMEMSIZE() (0x9f000) + +static void __init parse_cmdline_early (char ** cmdline_p) +{ + char c = ' ', *to = command_line, *from = saved_command_line; + int len = 0; + int userdef = 0; + + /* Save unparsed command line copy for /proc/cmdline */ + saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; + + for (;;) { + if (c != ' ') + goto next_char; + /* + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to <mem>, overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * <start> to <start>+<mem>, overriding the bios size. 
+ * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] + */ + if (!memcmp(from, "mem=", 4)) { + if (to != command_line) + to--; + if (!memcmp(from+4, "nopentium", 9)) { + from += 9+4; + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long mem_size; + + mem_size = memparse(from+4, &from); + limit_regions(mem_size); + userdef=1; + } + } + + else if (!memcmp(from, "memmap=", 7)) { + if (to != command_line) + to--; + if (!memcmp(from+7, "exactmap", 8)) { + from += 8+7; + e820.nr_map = 0; + userdef = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(from+7, &from); + if (*from == '@') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*from == '#') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*from == '$') { + start_at = memparse(from+1, &from); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + userdef=1; + } + } + } + + else if (!memcmp(from, "noexec=", 7)) + noexec_setup(from + 7); + + +#ifdef CONFIG_X86_SMP + /* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. 
+ */ + else if (!memcmp(from, "maxcpus=", 8)) { + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(from + 8, NULL, 0); + } +#endif + +#ifdef CONFIG_ACPI_BOOT + /* "acpi=off" disables both ACPI table parsing and interpreter */ + else if (!memcmp(from, "acpi=off", 8)) { + disable_acpi(); + } + + /* acpi=force to over-ride black-list */ + else if (!memcmp(from, "acpi=force", 10)) { + acpi_force = 1; + acpi_ht = 1; + acpi_disabled = 0; + } + + /* acpi=strict disables out-of-spec workarounds */ + else if (!memcmp(from, "acpi=strict", 11)) { + acpi_strict = 1; + } + + /* Limit ACPI just to boot-time to enable HT */ + else if (!memcmp(from, "acpi=ht", 7)) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + + /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ + else if (!memcmp(from, "pci=noacpi", 10)) { + acpi_disable_pci(); + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (!memcmp(from, "acpi=noirq", 10)) { + acpi_noirq_set(); + } + + else if (!memcmp(from, "acpi_sci=edge", 13)) + acpi_sci_flags.trigger = 1; + + else if (!memcmp(from, "acpi_sci=level", 14)) + acpi_sci_flags.trigger = 3; + + else if (!memcmp(from, "acpi_sci=high", 13)) + acpi_sci_flags.polarity = 1; + + else if (!memcmp(from, "acpi_sci=low", 12)) + acpi_sci_flags.polarity = 3; + +#ifdef CONFIG_X86_IO_APIC + else if (!memcmp(from, "acpi_skip_timer_override", 24)) + acpi_skip_timer_override = 1; +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + /* disable IO-APIC */ + else if (!memcmp(from, "noapic", 6)) + disable_ioapic_setup(); +#endif /* CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_ACPI_BOOT */ + + /* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ + else if (!memcmp(from, "highmem=", 8)) + highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; + + /* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. 
This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ + else if (!memcmp(from, "vmalloc=", 8)) + __VMALLOC_RESERVE = memparse(from+8, &from); + + next_char: + c = *(from++); + if (!c) + break; + if (COMMAND_LINE_SIZE <= ++len) + break; + *(to++) = c; + } + *to = '\0'; + *cmdline_p = command_line; + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); + } +} + +/* + * Callback for efi_memory_walk. + */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? 
*/ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + } +} + +/* + * Determine low and high memory ranges: + */ +unsigned long __init find_max_low_pfn(void) +{ + unsigned long max_low_pfn; + + max_low_pfn = max_pfn; + if (max_low_pfn > MAXMEM_PFN) { + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", + MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_X86_PAE + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING "Warning only 4GB will be used.\n"); + printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + } +#endif /* !CONFIG_X86_PAE */ +#endif /* !CONFIG_HIGHMEM */ + } else { + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ + printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if 
(highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif + } + return max_low_pfn; +} + +/* + * Free all available memory for boot time allocation. Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= ((max_low_pfn + 1) << PAGE_SHIFT)) + return 0; + if (end >= ((max_low_pfn + 1) << PAGE_SHIFT)) + end = (max_low_pfn + 1) << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. + */ +static void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? 
+ */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +/* + * workaround for Dell systems that neglect to reserve EBDA + */ +static void __init reserve_ebda_region(void) +{ + unsigned int addr; + addr = get_bios_ebda(); + if (addr) + reserve_bootmem(addr, PAGE_SIZE); +} + +#ifndef CONFIG_DISCONTIGMEM +void __init setup_bootmem_allocator(void); +static unsigned long __init setup_memory(void) +{ + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + min_low_pfn = PFN_UP(init_pg_tables_end); + + find_max_pfn(); + + max_low_pfn = find_max_low_pfn(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) { + highstart_pfn = max_low_pfn; + } + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); + + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned int max_dma, low; + + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + low = max_low_pfn; + + if (low < max_dma) + zones_size[ZONE_DMA] = low; + else { + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = low - max_dma; +#ifdef CONFIG_HIGHMEM + zones_size[ZONE_HIGHMEM] = highend_pfn - low; +#endif + } + free_area_init(zones_size); +} +#else +extern unsigned long setup_memory(void); +extern void zone_sizes_init(void); +#endif /* !CONFIG_DISCONTIGMEM */ + +void __init setup_bootmem_allocator(void) +{ + unsigned long bootmap_size; + /* + * Initialize the boot-time allocator (with low memory only): + */ + bootmap_size = init_bootmem(min_low_pfn, max_low_pfn); + + register_bootmem_low_pages(max_low_pfn); + + /* + * Reserve the bootmem bitmap itself as well. 
We do this in two + * steps (first step was init_bootmem()) because this catches + * the (very unlikely) case of us accidentally initializing the + * bootmem allocator with an invalid RAM area. + */ + reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); + + /* + * reserve physical page 0 - it's a special BIOS page on many boxes, + * enabling clean reboots, SMP operation, laptop functions. + */ + reserve_bootmem(0, PAGE_SIZE); + + /* reserve EBDA region, it's a 4K region */ + reserve_ebda_region(); + + /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent + PCI prefetch into it (errata #56). Usually the page is reserved anyways, + unless you have no PS/2 mouse plugged in. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 6) + reserve_bootmem(0xa0000 - 4096, 4096); + +#ifdef CONFIG_SMP + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_bootmem(PAGE_SIZE, PAGE_SIZE); +#endif +#ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. + */ + acpi_reserve_bootmem(); +#endif +#ifdef CONFIG_X86_FIND_SMP_CONFIG + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif + +#ifdef CONFIG_BLK_DEV_INITRD + if (LOADER_TYPE && INITRD_START) { + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { + reserve_bootmem(INITRD_START, INITRD_SIZE); + initrd_start = + INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_end = initrd_start+INITRD_SIZE; + } + else { + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + INITRD_START + INITRD_SIZE, + max_low_pfn << PAGE_SHIFT); + initrd_start = 0; + } + } +#endif +} + +/* + * The node 0 pgdat is initialized before all of these because + * it's needed for bootmem. 
node>0 pgdats have their virtual + * space allocated before the pagetables are in place to access + * them, so they can't be cleared then. + * + * This should all compile down to nothing when NUMA is off. + */ +void __init remapped_pgdat_init(void) +{ + int nid; + + for_each_online_node(nid) { + if (nid != 0) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + } +} + +/* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. + */ +static void __init +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) +{ + int i; + + probe_roms(); + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; + res = alloc_bootmem_low(sizeof(struct resource)); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + request_resource(&iomem_resource, res); + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. 
+ */ + request_resource(res, code_resource); + request_resource(res, data_resource); + } + } +} + +/* + * Request address space for all standard resources + */ +static void __init register_memory(void) +{ + unsigned long gapstart, gapsize; + unsigned long long last; + int i; + + if (efi_enabled) + efi_initialize_iomem_resources(&code_resource, &data_resource); + else + legacy_init_iomem_resources(&code_resource, &data_resource); + + /* EFI systems may still have VGA */ + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < STANDARD_IO_RESOURCES; i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + + /* + * Search for the bigest gap in the low 32 bits of the e820 + * memory space. + */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } + + /* + * Start allocating dynamic PCI memory a bit into the gap, + * aligned up to the nearest megabyte. + * + * Question: should we try to pad it up a bit (do something + * like " + (gapsize >> 3)" in there too?). We now have the + * technology. + */ + pci_mem_start = (gapstart + 0xfffff) & ~0xfffff; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} + +/* Use inline assembly to define this because the nops are defined + as inline assembly strings in the include files and we cannot + get them easily into strings. 
*/ +asm("\t.data\nintelnops: " + GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 + GENERIC_NOP7 GENERIC_NOP8); +asm("\t.data\nk8nops: " + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 + K8_NOP7 K8_NOP8); +asm("\t.data\nk7nops: " + K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 + K7_NOP7 K7_NOP8); + +extern unsigned char intelnops[], k8nops[], k7nops[]; +static unsigned char *intel_nops[ASM_NOP_MAX+1] = { + NULL, + intelnops, + intelnops + 1, + intelnops + 1 + 2, + intelnops + 1 + 2 + 3, + intelnops + 1 + 2 + 3 + 4, + intelnops + 1 + 2 + 3 + 4 + 5, + intelnops + 1 + 2 + 3 + 4 + 5 + 6, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static unsigned char *k7_nops[ASM_NOP_MAX+1] = { + NULL, + k7nops, + k7nops + 1, + k7nops + 1 + 2, + k7nops + 1 + 2 + 3, + k7nops + 1 + 2 + 3 + 4, + k7nops + 1 + 2 + 3 + 4 + 5, + k7nops + 1 + 2 + 3 + 4 + 5 + 6, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +static struct nop { + int cpuid; + unsigned char **noptable; +} noptypes[] = { + { X86_FEATURE_K8, k8_nops }, + { X86_FEATURE_K7, k7_nops }, + { -1, NULL } +}; + +/* Replace instructions with better alternatives for this CPU type. + + This runs before SMP is initialized to avoid SMP problems with + self modifying code. This implies that assymetric systems where + APs have less capabilities than the boot processor are not handled. + In this case boot with "noreplacement". 
*/ +void apply_alternatives(void *start, void *end) +{ + struct alt_instr *a; + int diff, i, k; + unsigned char **noptable = intel_nops; + for (i = 0; noptypes[i].cpuid >= 0; i++) { + if (boot_cpu_has(noptypes[i].cpuid)) { + noptable = noptypes[i].noptable; + break; + } + } + for (a = start; (void *)a < end; a++) { + if (!boot_cpu_has(a->cpuid)) + continue; + BUG_ON(a->replacementlen > a->instrlen); + memcpy(a->instr, a->replacement, a->replacementlen); + diff = a->instrlen - a->replacementlen; + /* Pad the rest with nops */ + for (i = a->replacementlen; diff > 0; diff -= k, i += k) { + k = diff; + if (k > ASM_NOP_MAX) + k = ASM_NOP_MAX; + memcpy(a->instr + i, noptable[k], k); + } + } +} + +static int no_replacement __initdata = 0; + +void __init alternative_instructions(void) +{ + extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + if (no_replacement) + return; + apply_alternatives(__alt_instructions, __alt_instructions_end); +} + +static int __init noreplacement_setup(char *s) +{ + no_replacement = 1; + return 0; +} + +__setup("noreplacement", noreplacement_setup); + +static char * __init machine_specific_memory_setup(void); + +#ifdef CONFIG_MCA +static void set_mca_bus(int x) +{ + MCA_bus = x; +} +#else +static void set_mca_bus(int x) { } +#endif + +/* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures + * for initialization. Note, the efi init code path is determined by the + * global efi_enabled. This allows the same kernel image to be used on existing + * systems (with a traditional BIOS) as well as on EFI systems. + */ +void __init setup_arch(char **cmdline_p) +{ + unsigned long max_low_pfn; + + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + pre_setup_arch_hook(); + early_cpu_init(); + + /* + * FIXME: This isn't an official loader_type right + * now but does currently work with elilo. 
+ * If we were configured as an EFI kernel, check to make + * sure that we were loaded correctly from elilo and that + * the system table is valid. If not, then initialize normally. + */ +#ifdef CONFIG_EFI + if ((LOADER_TYPE == 0x50) && EFI_SYSTAB) + efi_enabled = 1; +#endif + + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); + drive_info = DRIVE_INFO; + screen_info = SCREEN_INFO; + edid_info = EDID_INFO; + apm_info.bios = APM_BIOS_INFO; + ist_info = IST_INFO; + saved_videomode = VIDEO_MODE; + if( SYS_DESC_TABLE.length != 0 ) { + set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2); + machine_id = SYS_DESC_TABLE.table[0]; + machine_submodel_id = SYS_DESC_TABLE.table[1]; + BIOS_revision = SYS_DESC_TABLE.table[2]; + } + bootloader_type = LOADER_TYPE; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); +#endif + ARCH_SETUP + if (efi_enabled) + efi_init(); + else { + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(machine_specific_memory_setup()); + } + + copy_edd(); + + if (!MOUNT_ROOT_RDONLY) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) _text; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; + init_mm.brk = init_pg_tables_end + PAGE_OFFSET; + + code_resource.start = virt_to_phys(_text); + code_resource.end = virt_to_phys(_etext)-1; + data_resource.start = virt_to_phys(_etext); + data_resource.end = virt_to_phys(_edata)-1; + + parse_cmdline_early(cmdline_p); + + max_low_pfn = setup_memory(); + + /* + * NOTE: before this point _nobody_ is allowed to allocate + * any memory using the bootmem allocator. Although the + * alloctor is now initialised only the first 8Mb of the kernel + * virtual address space has been mapped. 
All allocations before + * paging_init() has completed must use the alloc_bootmem_low_pages() + * variant (which allocates DMA'able memory) and care must be taken + * not to exceed the 8Mb limit. + */ + +#ifdef CONFIG_SMP + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ +#endif + paging_init(); + remapped_pgdat_init(); + zone_sizes_init(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + +#ifdef CONFIG_EARLY_PRINTK + { + char *s = strstr(*cmdline_p, "earlyprintk="); + if (s) { + extern void setup_early_printk(char *); + + setup_early_printk(s); + printk("early console enabled\n"); + } + } +#endif + + + dmi_scan_machine(); + +#ifdef CONFIG_X86_GENERICARCH + generic_apic_probe(*cmdline_p); +#endif + if (efi_enabled) + efi_map_memmap(); + + /* + * Parse the ACPI tables for possible boot-time SMP configuration. + */ + acpi_boot_table_init(); + acpi_boot_init(); + +#ifdef CONFIG_X86_LOCAL_APIC + if (smp_found_config) + get_smp_config(); +#endif + + register_memory(); + +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif +} + +#include "setup_arch_post.h" +/* + * Local Variables: + * mode:c + * c-file-style:"k&r" + * c-basic-offset:8 + * End: + */ diff --git a/arch/i386/kernel/sigframe.h b/arch/i386/kernel/sigframe.h new file mode 100644 index 000000000000..d21b14f5c25c --- /dev/null +++ b/arch/i386/kernel/sigframe.h @@ -0,0 +1,21 @@ +struct sigframe +{ + char *pretcode; + int sig; + struct sigcontext sc; + struct _fpstate fpstate; + unsigned long extramask[_NSIG_WORDS-1]; + char retcode[8]; +}; + +struct rt_sigframe +{ + char *pretcode; + int sig; + struct siginfo *pinfo; + void *puc; + struct siginfo info; + struct ucontext uc; + struct _fpstate fpstate; + char retcode[8]; +}; diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c 
new file mode 100644 index 000000000000..ef3602e1c052 --- /dev/null +++ b/arch/i386/kernel/signal.c @@ -0,0 +1,665 @@ +/* + * linux/arch/i386/kernel/signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/personality.h> +#include <linux/suspend.h> +#include <linux/ptrace.h> +#include <linux/elf.h> +#include <asm/processor.h> +#include <asm/ucontext.h> +#include <asm/uaccess.h> +#include <asm/i387.h> +#include "sigframe.h" + +#define DEBUG_SIG 0 + +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +/* + * Atomically swap in the new signal mask, and wait for a signal. + */ +asmlinkage int +sys_sigsuspend(int history0, int history1, old_sigset_t mask) +{ + struct pt_regs * regs = (struct pt_regs *) &history0; + sigset_t saveset; + + mask &= _BLOCKABLE; + spin_lock_irq(¤t->sighand->siglock); + saveset = current->blocked; + siginitset(¤t->blocked, mask); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + regs->eax = -EINTR; + while (1) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + if (do_signal(regs, &saveset)) + return -EINTR; + } +} + +asmlinkage int +sys_rt_sigsuspend(struct pt_regs regs) +{ + sigset_t saveset, newset; + + /* XXX: Don't preclude handling different sized sigset_t's. 
*/ + if (regs.ecx != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&newset, (sigset_t __user *)regs.ebx, sizeof(newset))) + return -EFAULT; + sigdelsetmask(&newset, ~_BLOCKABLE); + + spin_lock_irq(¤t->sighand->siglock); + saveset = current->blocked; + current->blocked = newset; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + regs.eax = -EINTR; + while (1) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + if (do_signal(®s, &saveset)) + return -EINTR; + } +} + +asmlinkage int +sys_sigaction(int sig, const struct old_sigaction __user *act, + struct old_sigaction __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + return -EFAULT; + __get_user(new_ka.sa.sa_flags, &act->sa_flags); + __get_user(mask, &act->sa_mask); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + return -EFAULT; + __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); + } + + return ret; +} + +asmlinkage int +sys_sigaltstack(unsigned long ebx) +{ + /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ + struct pt_regs *regs = (struct pt_regs *)&ebx; + const stack_t __user *uss = (const stack_t __user *)ebx; + stack_t __user *uoss = (stack_t __user *)regs->ecx; + + return do_sigaltstack(uss, uoss, regs->esp); +} + + +/* + * Do a signal return; undo the signal stack. 
+ */ + +static int +restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax) +{ + unsigned int err = 0; + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + +#define COPY(x) err |= __get_user(regs->x, &sc->x) + +#define COPY_SEG(seg) \ + { unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->x##seg = tmp; } + +#define COPY_SEG_STRICT(seg) \ + { unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->x##seg = tmp|3; } + +#define GET_SEG(seg) \ + { unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg,tmp); } + +#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \ + X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ + X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) + + GET_SEG(gs); + GET_SEG(fs); + COPY_SEG(es); + COPY_SEG(ds); + COPY(edi); + COPY(esi); + COPY(ebp); + COPY(esp); + COPY(ebx); + COPY(edx); + COPY(ecx); + COPY(eip); + COPY_SEG_STRICT(cs); + COPY_SEG_STRICT(ss); + + { + unsigned int tmpflags; + err |= __get_user(tmpflags, &sc->eflags); + regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_eax = -1; /* disable syscall checks */ + } + + { + struct _fpstate __user * buf; + err |= __get_user(buf, &sc->fpstate); + if (buf) { + if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) + goto badframe; + err |= restore_i387(buf); + } else { + struct task_struct *me = current; + if (used_math()) { + clear_fpu(me); + clear_used_math(); + } + } + } + + err |= __get_user(*peax, &sc->eax); + return err; + +badframe: + return 1; +} + +asmlinkage int sys_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *) &__unused; + struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); + sigset_t set; + int eax; + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) + || 
(_NSIG_WORDS > 1 + && __copy_from_user(&set.sig[1], &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(¤t->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + if (restore_sigcontext(regs, &frame->sc, &eax)) + goto badframe; + return eax; + +badframe: + force_sig(SIGSEGV, current); + return 0; +} + +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *) &__unused; + struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); + sigset_t set; + int eax; + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(¤t->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + goto badframe; + + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) + goto badframe; + + return eax; + +badframe: + force_sig(SIGSEGV, current); + return 0; +} + +/* + * Set up a signal frame. 
+ */ + +static int +setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + int tmp, err = 0; + + tmp = 0; + __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->fs); + + err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); + err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); + err |= __put_user(regs->edi, &sc->edi); + err |= __put_user(regs->esi, &sc->esi); + err |= __put_user(regs->ebp, &sc->ebp); + err |= __put_user(regs->esp, &sc->esp); + err |= __put_user(regs->ebx, &sc->ebx); + err |= __put_user(regs->edx, &sc->edx); + err |= __put_user(regs->ecx, &sc->ecx); + err |= __put_user(regs->eax, &sc->eax); + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); + err |= __put_user(regs->eip, &sc->eip); + err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->eflags, &sc->eflags); + err |= __put_user(regs->esp, &sc->esp_at_signal); + err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); + + tmp = save_i387(fpstate); + if (tmp < 0) + err = 1; + else + err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + + /* non-iBCS2 extensions.. */ + err |= __put_user(mask, &sc->oldmask); + err |= __put_user(current->thread.cr2, &sc->cr2); + + return err; +} + +/* + * Determine which stack to use.. + */ +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +{ + unsigned long esp; + + /* Default to using normal stack */ + esp = regs->esp; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(esp) == 0) + esp = current->sas_ss_sp + current->sas_ss_size; + } + + /* This is the legacy signal stack switching. 
*/ + else if ((regs->xss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + esp = (unsigned long) ka->sa.sa_restorer; + } + + return (void __user *)((esp - frame_size) & -8ul); +} + +/* These symbols are defined with the addresses in the vsyscall page. + See vsyscall-sigreturn.S. */ +extern void __user __kernel_sigreturn; +extern void __user __kernel_rt_sigreturn; + +static void setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs * regs) +{ + void __user *restorer; + struct sigframe __user *frame; + int err = 0; + int usig; + + frame = get_sigframe(ka, regs, sizeof(*frame)); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto give_sigsegv; + + usig = current_thread_info()->exec_domain + && current_thread_info()->exec_domain->signal_invmap + && sig < 32 + ? current_thread_info()->exec_domain->signal_invmap[sig] + : sig; + + err = __put_user(usig, &frame->sig); + if (err) + goto give_sigsegv; + + err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); + if (err) + goto give_sigsegv; + + if (_NSIG_WORDS > 1) { + err = __copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask)); + if (err) + goto give_sigsegv; + } + + restorer = &__kernel_sigreturn; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + /* Set up to return from userspace. */ + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is popl %eax ; movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. 
+ */ + err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); + err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); + + if (err) + goto give_sigsegv; + + /* Set up registers for signal handler */ + regs->esp = (unsigned long) frame; + regs->eip = (unsigned long) ka->sa.sa_handler; + regs->eax = (unsigned long) sig; + regs->edx = (unsigned long) 0; + regs->ecx = (unsigned long) 0; + + set_fs(USER_DS); + regs->xds = __USER_DS; + regs->xes = __USER_DS; + regs->xss = __USER_DS; + regs->xcs = __USER_CS; + + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->eflags &= ~TF_MASK; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); + +#if DEBUG_SIG + printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + current->comm, current->pid, frame, regs->eip, frame->pretcode); +#endif + + return; + +give_sigsegv: + force_sigsegv(sig, current); +} + +static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs * regs) +{ + void __user *restorer; + struct rt_sigframe __user *frame; + int err = 0; + int usig; + + frame = get_sigframe(ka, regs, sizeof(*frame)); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto give_sigsegv; + + usig = current_thread_info()->exec_domain + && current_thread_info()->exec_domain->signal_invmap + && sig < 32 + ? current_thread_info()->exec_domain->signal_invmap[sig] + : sig; + + err |= __put_user(usig, &frame->sig); + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + goto give_sigsegv; + + /* Create the ucontext. 
*/ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->esp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto give_sigsegv; + + /* Set up to return from userspace. */ + restorer = &__kernel_rt_sigreturn; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); + err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); + + if (err) + goto give_sigsegv; + + /* Set up registers for signal handler */ + regs->esp = (unsigned long) frame; + regs->eip = (unsigned long) ka->sa.sa_handler; + regs->eax = (unsigned long) usig; + regs->edx = (unsigned long) &frame->info; + regs->ecx = (unsigned long) &frame->uc; + + set_fs(USER_DS); + regs->xds = __USER_DS; + regs->xes = __USER_DS; + regs->xss = __USER_DS; + regs->xcs = __USER_CS; + + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. 
+ */ + regs->eflags &= ~TF_MASK; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); + +#if DEBUG_SIG + printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + current->comm, current->pid, frame, regs->eip, frame->pretcode); +#endif + + return; + +give_sigsegv: + force_sigsegv(sig, current); +} + +/* + * OK, we're invoking a handler + */ + +static void +handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, + sigset_t *oldset, struct pt_regs * regs) +{ + /* Are we from a system call? */ + if (regs->orig_eax >= 0) { + /* If so, check system call restarting.. */ + switch (regs->eax) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->eax = -EINTR; + break; + + case -ERESTARTSYS: + if (!(ka->sa.sa_flags & SA_RESTART)) { + regs->eax = -EINTR; + break; + } + /* fallthrough */ + case -ERESTARTNOINTR: + regs->eax = regs->orig_eax; + regs->eip -= 2; + } + } + + /* + * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so + * that register information in the sigcontext is correct. + */ + if (unlikely(regs->eflags & TF_MASK) + && likely(current->ptrace & PT_DTRACE)) { + current->ptrace &= ~PT_DTRACE; + regs->eflags &= ~TF_MASK; + } + + /* Set up the stack frame */ + if (ka->sa.sa_flags & SA_SIGINFO) + setup_rt_frame(sig, ka, info, oldset, regs); + else + setup_frame(sig, ka, oldset, regs); + + if (!(ka->sa.sa_flags & SA_NODEFER)) { + spin_lock_irq(¤t->sighand->siglock); + sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigaddset(¤t->blocked,sig); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + } +} + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. 
+ */ +int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) +{ + siginfo_t info; + int signr; + struct k_sigaction ka; + + /* + * We want the common case to go fast, which + * is why we may in certain cases get here from + * kernel mode. Just return without doing anything + * if so. + */ + if ((regs->xcs & 3) != 3) + return 1; + + if (current->flags & PF_FREEZE) { + refrigerator(0); + goto no_signal; + } + + if (!oldset) + oldset = ¤t->blocked; + + signr = get_signal_to_deliver(&info, &ka, regs, NULL); + if (signr > 0) { + /* Reenable any watchpoints before delivering the + * signal to user space. The processor register will + * have been cleared if the watchpoint triggered + * inside the kernel. + */ + if (unlikely(current->thread.debugreg[7])) { + __asm__("movl %0,%%db7" : : "r" (current->thread.debugreg[7])); + } + + /* Whee! Actually deliver the signal. */ + handle_signal(signr, &info, &ka, oldset, regs); + return 1; + } + + no_signal: + /* Did we come from a system call? */ + if (regs->orig_eax >= 0) { + /* Restart the system call - no handlers present */ + if (regs->eax == -ERESTARTNOHAND || + regs->eax == -ERESTARTSYS || + regs->eax == -ERESTARTNOINTR) { + regs->eax = regs->orig_eax; + regs->eip -= 2; + } + if (regs->eax == -ERESTART_RESTARTBLOCK){ + regs->eax = __NR_restart_syscall; + regs->eip -= 2; + } + } + return 0; +} + +/* + * notification of userspace execution resumption + * - triggered by current->work.notify_resume + */ +__attribute__((regparm(3))) +void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, + __u32 thread_info_flags) +{ + /* Pending single-step? 
*/ + if (thread_info_flags & _TIF_SINGLESTEP) { + regs->eflags |= TF_MASK; + clear_thread_flag(TIF_SINGLESTEP); + } + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs,oldset); + + clear_thread_flag(TIF_IRET); +} diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c new file mode 100644 index 000000000000..6223c33ac91c --- /dev/null +++ b/arch/i386/kernel/smp.c @@ -0,0 +1,612 @@ +/* + * Intel SMP support routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * + * This code is released under the GNU General Public License version 2 or + * later. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/cache.h> +#include <linux/interrupt.h> + +#include <asm/mtrr.h> +#include <asm/tlbflush.h> +#include <mach_apic.h> + +/* + * Some notes on x86 processor bugs affecting SMP operation: + * + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP errata are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP errata are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP + * + * Pentium Pro + * None of 1AP-9AP errata are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. 
symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deassering IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode + * + * Pentium + * There is a marginal case where REP MOVS on 100MHz SMP + * machines with B stepping processors can fail. XXX should provide + * an L1cache=Writethrough or L1cache=off option. + * + * B stepping CPUs may hang. There are hardware work arounds + * for this. We warn about it in case your board doesn't have the work + * arounds. Basically thats so I can tell anyone with a B stepping + * CPU and SMP problems "tough". + * + * Specific items [From Pentium Processor Specification Update] + * + * 1AP. Linux doesn't use remote read + * 2AP. Linux doesn't trust APIC errors + * 3AP. We work around this + * 4AP. Linux never generated 3 interrupts of the same priority + * to cause a lost local interrupt. + * 5AP. Remote read is never used + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware + * 11AP. Linux reads the APIC between writes to avoid this, as per + * the documentation. Make sure you preserve this as it affects + * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. 
not affected - worked around in BIOS + * + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. + */ + +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, }; + +/* + * the following functions deal with sending IPIs between CPUs. + * + * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. + */ + +static inline int __prepare_ICR (unsigned int shortcut, int vector) +{ + return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL; +} + +static inline int __prepare_ICR2 (unsigned int mask) +{ + return SET_APIC_DEST_FIELD(mask); +} + +void __send_IPI_shortcut(unsigned int shortcut, int vector) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); +} + +void fastcall send_IPI_self(int vector) +{ + __send_IPI_shortcut(APIC_DEST_SELF, vector); +} + +/* + * This is only used on smaller machines. + */ +void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long cfg; + unsigned long flags; + + local_irq_save(flags); + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + apic_write_around(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector); + + /* + * Send the IPI. The write to APIC_ICR fires this off. 
+ */ + apic_write_around(APIC_ICR, cfg); + + local_irq_restore(flags); +} + +void send_IPI_mask_sequence(cpumask_t mask, int vector) +{ + unsigned long cfg, flags; + unsigned int query_cpu; + + /* + * Hack. The clustered APIC addressing mode doesn't allow us to send + * to an arbitrary mask, so I do a unicasts to each CPU instead. This + * should be modified to do 1 message per cluster ID - mbligh + */ + + local_irq_save(flags); + + for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { + if (cpu_isset(query_cpu, mask)) { + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu)); + apic_write_around(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); + } + } + local_irq_restore(flags); +} + +#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */ + +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + */ + +static cpumask_t flush_cpumask; +static struct mm_struct * flush_mm; +static unsigned long flush_va; +static DEFINE_SPINLOCK(tlbstate_lock); +#define FLUSH_ALL 0xffffffff + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + * + * We need to reload %cr3 since the page tables may be going + * away from under us.. 
+ */ +static inline void leave_mm (unsigned long cpu) +{ + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superflous + * tlb flush. + * 1a2) set cpu_tlbstate to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu_tlbstate[].active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu_tlbstate[].active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu_tlbstate to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu_tlbstate is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. 
+ */ + +fastcall void smp_invalidate_interrupt(struct pt_regs *regs) +{ + unsigned long cpu; + + cpu = get_cpu(); + + if (!cpu_isset(cpu, flush_cpumask)) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_va == FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(flush_va); + } else + leave_mm(cpu); + } + ack_APIC_irq(); + smp_mb__before_clear_bit(); + cpu_clear(cpu, flush_cpumask); + smp_mb__after_clear_bit(); +out: + put_cpu_no_resched(); +} + +static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, + unsigned long va) +{ + cpumask_t tmp; + /* + * A couple of (to be removed) sanity checks: + * + * - we do not send IPIs to not-yet booted CPUs. + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); + + cpus_and(tmp, cpumask, cpu_online_map); + BUG_ON(!cpus_equal(cpumask, tmp)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + + /* + * i'm not happy about this global shared spinlock in the + * MM hot path, but we'll see how contended it is. + * Temporarily this turns IRQs off, so that lockups are + * detected by the NMI watchdog. + */ + spin_lock(&tlbstate_lock); + + flush_mm = mm; + flush_va = va; +#if NR_CPUS <= BITS_PER_LONG + atomic_set_mask(cpumask, &flush_cpumask); +#else + { + int k; + unsigned long *flush_mask = (unsigned long *)&flush_cpumask; + unsigned long *cpu_mask = (unsigned long *)&cpumask; + for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) + atomic_set_mask(cpu_mask[k], &flush_mask[k]); + } +#endif + /* + * We have to send the IPI only to + * CPUs affected. + */ + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + + while (!cpus_empty(flush_cpumask)) + /* nothing. 
lockup detection does not belong here */ + mb(); + + flush_mm = NULL; + flush_va = 0; + spin_unlock(&tlbstate_lock); +} + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + local_flush_tlb(); + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm (struct mm_struct * mm) +{ + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + cpumask_t cpu_mask; + + preempt_disable(); + cpu_mask = mm->cpu_vm_mask; + cpu_clear(smp_processor_id(), cpu_mask); + + if (current->active_mm == mm) { + if(current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (!cpus_empty(cpu_mask)) + flush_tlb_others(cpu_mask, mm, va); + + preempt_enable(); +} + +static void do_flush_tlb_all(void* info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1, 1); +} + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +void smp_send_reschedule(int cpu) +{ + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); +} + +/* + * Structure and data for smp_call_function(). This is designed to minimise + * static memory requirements. It also looks cleaner. 
+ */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +static struct call_data_struct * call_data; + +/* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. + */ + +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) +/* + * [SUMMARY] Run a function on all other CPUs. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <nonatomic> currently unused. + * <wait> If true, wait (atomically) until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <<func>> or are or have executed. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +{ + struct call_data_struct data; + int cpus = num_online_cpus()-1; + + if (!cpus) + return 0; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + spin_lock(&call_lock); + call_data = &data; + mb(); + + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + spin_unlock(&call_lock); + + return 0; +} + +static void stop_this_cpu (void * dummy) +{ + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + local_irq_disable(); + disable_local_APIC(); + if (cpu_data[smp_processor_id()].hlt_works_ok) + for(;;) __asm__("hlt"); + for (;;); +} + +/* + * this function calls 
the 'stop' function on all other CPUs in the system.
+ * Afterwards the calling CPU disables its own local APIC with interrupts
+ * switched off, and re-enables interrupts before returning.
+ */
+
+void smp_send_stop(void)
+{
+	smp_call_function(stop_this_cpu, NULL, 1, 0);
+
+	local_irq_disable();
+	disable_local_APIC();
+	local_irq_enable();
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+}
+/*
+ * Handler for the 'generic call function' IPI sent by smp_call_function().
+ * func, info and wait are copied out of call_data before ->started is
+ * incremented; func(info) then runs between irq_enter() and irq_exit(),
+ * and ->finished is incremented only when wait is set.
+ */
+fastcall void smp_call_function_interrupt(struct pt_regs *regs)
+{
+	void (*func) (void *info) = call_data->func;
+	void *info = call_data->info;
+	int wait = call_data->wait;
+
+	ack_APIC_irq();
+	/*
+	 * Notify initiating CPU that I've grabbed the data and am
+	 * about to execute the function
+	 */
+	mb();
+	atomic_inc(&call_data->started);
+	/*
+	 * At this point the info structure may be out of scope unless wait==1
+	 * (which is why only the local copies taken above are used below).
+	 */
+	irq_enter();
+	(*func)(info);
+	irq_exit();
+
+	if (wait) {
+		mb();
+		atomic_inc(&call_data->finished);
+	}
+}
+
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
new file mode 100644
index 000000000000..332ee7a1d1a1
--- /dev/null
+++ b/arch/i386/kernel/smpboot.c
@@ -0,0 +1,1145 @@
+/*
+ *	x86 SMP booting functions
+ *
+ *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *	Much of the core SMP work is based on previous work by Thomas Radke, to
+ *	whom a great many thanks are extended.
+ *
+ *	Thanks to Intel for making available several different Pentium,
+ *	Pentium Pro and Pentium-II/Xeon MP machines.
+ *	Original development of Linux SMP code supported by Caldera.
+ *
+ *	This code is released under the GNU General Public License version 2 or
+ *	later.
+ *
+ *	Fixes
+ *		Felix Koop	:	NR_CPUS used properly
+ *		Jose Renau	:	Handle single CPU case.
+ *		Alan Cox	:	By repeated request 8) - Total BogoMIPS report.
+ *		Greg Wright	:	Fix for kernel stacks panic.
+ *		Erich Boleyn	:	MP v1.4 and additional changes.
+ * Matthias Sattler : Changes for 2.1 kernel map. + * Michel Lespinasse : Changes for 2.1 kernel map. + * Michael Chastain : Change trampoline.S to gnu as. + * Alan Cox : Dumb bug: 'B' step PPro's are fine + * Ingo Molnar : Added APIC timers, based on code + * from Jose Renau + * Ingo Molnar : various cleanups and rewrites + * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. + * Maciej W. Rozycki : Bits for genuine 82489DX APICs + * Martin J. Bligh : Added support for multi-quad systems + * Dave Jones : Report invalid combinations of Athlon CPUs. +* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/init.h> +#include <linux/kernel.h> + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/kernel_stat.h> +#include <linux/smp_lock.h> +#include <linux/irq.h> +#include <linux/bootmem.h> + +#include <linux/delay.h> +#include <linux/mc146818rtc.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> +#include <asm/arch_hooks.h> + +#include <mach_apic.h> +#include <mach_wakecpu.h> +#include <smpboot_hooks.h> + +/* Set if we find a B stepping CPU */ +static int __initdata smp_b_stepping; + +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ +EXPORT_SYMBOL(phys_proc_id); + +/* bitmap of online cpus */ +cpumask_t cpu_online_map; + +cpumask_t cpu_callin_map; +cpumask_t cpu_callout_map; +static cpumask_t smp_commenced_mask; + +/* Per CPU bogomips and other parameters */ +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; + +u8 x86_cpu_to_apicid[NR_CPUS] = + { [0 ... NR_CPUS-1] = 0xff }; +EXPORT_SYMBOL(x86_cpu_to_apicid); + +/* + * Trampoline 80x86 program as an array. 
+ */ + +extern unsigned char trampoline_data []; +extern unsigned char trampoline_end []; +static unsigned char *trampoline_base; +static int trampoline_exec; + +static void map_cpu_to_logical_apicid(void); + +/* + * Currently trivial. Write the real->protected mode + * bootstrap into the page concerned. The caller + * has made sure it's suitably aligned. + */ + +static unsigned long __init setup_trampoline(void) +{ + memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); + return virt_to_phys(trampoline_base); +} + +/* + * We are called very early to get the low memory for the + * SMP bootup trampoline page. + */ +void __init smp_alloc_memory(void) +{ + trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + /* + * Has to be in very low memory so we can execute + * real-mode AP code. + */ + if (__pa(trampoline_base) >= 0x9F000) + BUG(); + /* + * Make the SMP trampoline executable: + */ + trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); +} + +/* + * The bootstrap kernel entry code has set these up. Save them for + * a given CPU + */ + +static void __init smp_store_cpu_info(int id) +{ + struct cpuinfo_x86 *c = cpu_data + id; + + *c = boot_cpu_data; + if (id!=0) + identify_cpu(c); + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 5 && + c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_model <= 3) + /* + * Remember we have B step Pentia with bugs + */ + smp_b_stepping = 1; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { + + /* Athlon 660/661 is valid. */ + if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) + goto valid_k7; + + /* Duron 670 is valid */ + if ((c->x86_model==7) && (c->x86_mask==0)) + goto valid_k7; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability bit. 
+ * It's worth noting that the A5 stepping (662) of some Athlon XP's + * have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more. + */ + if (((c->x86_model==6) && (c->x86_mask>=2)) || + ((c->x86_model==7) && (c->x86_mask>=1)) || + (c->x86_model> 7)) + if (cpu_has_mp) + goto valid_k7; + + /* If we get here, it's not a certified SMP capable AMD system. */ + tainted |= TAINT_UNSAFE_SMP; + } + +valid_k7: + ; +} + +/* + * TSC synchronization. + * + * We first check whether all CPUs have their TSC's synchronized, + * then we print a warning if not, and always resync. + */ + +static atomic_t tsc_start_flag = ATOMIC_INIT(0); +static atomic_t tsc_count_start = ATOMIC_INIT(0); +static atomic_t tsc_count_stop = ATOMIC_INIT(0); +static unsigned long long tsc_values[NR_CPUS]; + +#define NR_LOOPS 5 + +static void __init synchronize_tsc_bp (void) +{ + int i; + unsigned long long t0; + unsigned long long sum, avg; + long long delta; + unsigned long one_usec; + int buggy = 0; + + printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); + + /* convert from kcyc/sec to cyc/usec */ + one_usec = cpu_khz / 1000; + + atomic_set(&tsc_start_flag, 1); + wmb(); + + /* + * We loop a few times to get a primed instruction cache, + * then the last pass is more or less synchronized and + * the BP and APs set their cycle counters to zero all at + * once. This reduces the chance of having random offsets + * between the processors, and guarantees that the maximum + * delay between the cycle counters is never bigger than + * the latency of information-passing (cachelines) between + * two CPUs. 
+ */ + for (i = 0; i < NR_LOOPS; i++) { + /* + * all APs synchronize but they loop on '== num_cpus' + */ + while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) + mb(); + atomic_set(&tsc_count_stop, 0); + wmb(); + /* + * this lets the APs save their current TSC: + */ + atomic_inc(&tsc_count_start); + + rdtscll(tsc_values[smp_processor_id()]); + /* + * We clear the TSC in the last loop: + */ + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + /* + * Wait for all APs to leave the synchronization point: + */ + while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) + mb(); + atomic_set(&tsc_count_start, 0); + wmb(); + atomic_inc(&tsc_count_stop); + } + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (cpu_isset(i, cpu_callout_map)) { + t0 = tsc_values[i]; + sum += t0; + } + } + avg = sum; + do_div(avg, num_booting_cpus()); + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + delta = tsc_values[i] - avg; + if (delta < 0) + delta = -delta; + /* + * We report bigger than 2 microseconds clock differences. 
+ */ + if (delta > 2*one_usec) { + long realdelta; + if (!buggy) { + buggy = 1; + printk("\n"); + } + realdelta = delta; + do_div(realdelta, one_usec); + if (tsc_values[i] < avg) + realdelta = -realdelta; + + printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta); + } + + sum += delta; + } + if (!buggy) + printk("passed.\n"); +} + +static void __init synchronize_tsc_ap (void) +{ + int i; + + /* + * Not every cpu is online at the time + * this gets called, so we first wait for the BP to + * finish SMP initialization: + */ + while (!atomic_read(&tsc_start_flag)) mb(); + + for (i = 0; i < NR_LOOPS; i++) { + atomic_inc(&tsc_count_start); + while (atomic_read(&tsc_count_start) != num_booting_cpus()) + mb(); + + rdtscll(tsc_values[smp_processor_id()]); + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + atomic_inc(&tsc_count_stop); + while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + } +} +#undef NR_LOOPS + +extern void calibrate_delay(void); + +static atomic_t init_deasserted; + +static void __init smp_callin(void) +{ + int cpuid, phys_id; + unsigned long timeout; + + /* + * If waken up by an INIT in an 82489DX configuration + * we may get here before an INIT-deassert IPI reaches + * our local APIC. We have to wait for the IPI or we'll + * lock up on an APIC access. + */ + wait_for_init_deassert(&init_deasserted); + + /* + * (This works even if the APIC is not enabled.) + */ + phys_id = GET_APIC_ID(apic_read(APIC_ID)); + cpuid = smp_processor_id(); + if (cpu_isset(cpuid, cpu_callin_map)) { + printk("huh, phys CPU#%d, CPU#%d already present??\n", + phys_id, cpuid); + BUG(); + } + Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + + /* + * STARTUP IPIs are fragile beasts as they might sometimes + * trigger some glue motherboard logic. Complete APIC bus + * silence for 1 second, this overestimates the time the + * boot CPU is spending to send the up to 2 STARTUP IPIs + * by a factor of two. This should be enough. 
+ */ + + /* + * Waiting 2s total for startup (udelay is not yet working) + */ + timeout = jiffies + 2*HZ; + while (time_before(jiffies, timeout)) { + /* + * Has the boot CPU finished it's STARTUP sequence? + */ + if (cpu_isset(cpuid, cpu_callout_map)) + break; + rep_nop(); + } + + if (!time_before(jiffies, timeout)) { + printk("BUG: CPU%d started up but did not get a callout!\n", + cpuid); + BUG(); + } + + /* + * the boot CPU has finished the init stage and is spinning + * on callin_map until we finish. We are free to set up this + * CPU, first the APIC. (this is probably redundant on most + * boards) + */ + + Dprintk("CALLIN, before setup_local_APIC().\n"); + smp_callin_clear_local_apic(); + setup_local_APIC(); + map_cpu_to_logical_apicid(); + + /* + * Get our bogomips. + */ + calibrate_delay(); + Dprintk("Stack at about %p\n",&cpuid); + + /* + * Save our processor parameters + */ + smp_store_cpu_info(cpuid); + + disable_APIC_timer(); + + /* + * Allow the master to continue. + */ + cpu_set(cpuid, cpu_callin_map); + + /* + * Synchronize the TSC with the BP + */ + if (cpu_has_tsc && cpu_khz) + synchronize_tsc_ap(); +} + +static int cpucount; + +/* + * Activate a secondary processor. + */ +static void __init start_secondary(void *unused) +{ + /* + * Dont put anything before smp_callin(), SMP + * booting is too fragile that we want to limit the + * things done here to the most necessary things. + */ + cpu_init(); + smp_callin(); + while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) + rep_nop(); + setup_secondary_APIC_clock(); + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + enable_NMI_through_LVT0(NULL); + enable_8259A_irq(0); + } + enable_APIC_timer(); + /* + * low-memory mappings have been cleared, flush them from + * the local TLBs too. + */ + local_flush_tlb(); + cpu_set(smp_processor_id(), cpu_online_map); + + /* We can take interrupts now: we're officially "up". 
*/ + local_irq_enable(); + + wmb(); + cpu_idle(); +} + +/* + * Everything has been set up for the secondary + * CPUs - they just need to reload everything + * from the task structure + * This function must not return. + */ +void __init initialize_secondary(void) +{ + /* + * We don't actually need to load the full TSS, + * basically just the stack pointer and the eip. + */ + + asm volatile( + "movl %0,%%esp\n\t" + "jmp *%1" + : + :"r" (current->thread.esp),"r" (current->thread.eip)); +} + +extern struct { + void * esp; + unsigned short ss; +} stack_start; + +#ifdef CONFIG_NUMA + +/* which logical CPUs are on which nodes */ +cpumask_t node_2_cpu_mask[MAX_NUMNODES] = + { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; +/* which node each logical CPU is on */ +int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; +EXPORT_SYMBOL(cpu_2_node); + +/* set up a mapping between cpu and node. */ +static inline void map_cpu_to_node(int cpu, int node) +{ + printk("Mapping cpu %d to node %d\n", cpu, node); + cpu_set(cpu, node_2_cpu_mask[node]); + cpu_2_node[cpu] = node; +} + +/* undo a mapping between cpu and node. */ +static inline void unmap_cpu_to_node(int cpu) +{ + int node; + + printk("Unmapping cpu %d from all nodes\n", cpu); + for (node = 0; node < MAX_NUMNODES; node ++) + cpu_clear(cpu, node_2_cpu_mask[node]); + cpu_2_node[cpu] = 0; +} +#else /* !CONFIG_NUMA */ + +#define map_cpu_to_node(cpu, node) ({}) +#define unmap_cpu_to_node(cpu) ({}) + +#endif /* CONFIG_NUMA */ + +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... 
NR_CPUS-1] = BAD_APICID }; + +static void map_cpu_to_logical_apicid(void) +{ + int cpu = smp_processor_id(); + int apicid = logical_smp_processor_id(); + + cpu_2_logical_apicid[cpu] = apicid; + map_cpu_to_node(cpu, apicid_to_node(apicid)); +} + +static void unmap_cpu_to_logical_apicid(int cpu) +{ + cpu_2_logical_apicid[cpu] = BAD_APICID; + unmap_cpu_to_node(cpu); +} + +#if APIC_DEBUG +static inline void __inquire_remote_apic(int apicid) +{ + int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; + char *names[] = { "ID", "VERSION", "SPIV" }; + int timeout, status; + + printk("Inquiring remote APIC #%d...\n", apicid); + + for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { + printk("... APIC #%d %s: ", apicid, names[i]); + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + + timeout = 0; + do { + udelay(100); + status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; + } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); + + switch (status) { + case APIC_ICR_RR_VALID: + status = apic_read(APIC_RRR); + printk("%08x\n", status); + break; + default: + printk("failed\n"); + } + } +} +#endif + +#ifdef WAKE_SECONDARY_VIA_NMI +/* + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this + * won't ... remember to clear down the APIC, etc later. 
+ *
+ * Returns send_status | accept_status, i.e. zero only when the ICR busy bit
+ * cleared during the polling loop and the masked ESR value read back as zero.
+ */
+static int __init
+wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+{
+	unsigned long send_status = 0, accept_status = 0;
+	int timeout, maxlvt;
+
+	/* Target chip */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+
+	/* Boot on the stack */
+	/* Kick the second */
+	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	/*
+	 * Give the other CPU some time to accept the IPI.
+	 */
+	udelay(200);
+	/*
+	 * Due to the Pentium erratum 3AP.
+	 */
+	maxlvt = get_maxlvt();
+	if (maxlvt > 3) {
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+	}
+	accept_status = (apic_read(APIC_ESR) & 0xEF);
+	Dprintk("NMI sent.\n");
+
+	if (send_status)
+		printk("APIC never delivered???\n");
+	if (accept_status)
+		printk("APIC delivery error (%lx).\n", accept_status);
+
+	return (send_status | accept_status);
+}
+#endif	/* WAKE_SECONDARY_VIA_NMI */
+/*
+ * INIT flavour of wakeup_secondary_cpu(): sends an INIT assert followed by
+ * an INIT de-assert to phys_apicid and then, for integrated APICs only, up
+ * to two STARTUP IPIs carrying start_eip >> 12. init_deasserted is set once
+ * the de-assert has been sent. Returns send_status | accept_status, which
+ * is zero when no send or delivery error was observed.
+ */
+#ifdef WAKE_SECONDARY_VIA_INIT
+static int __init
+wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+{
+	unsigned long send_status = 0, accept_status = 0;
+	int maxlvt, timeout, num_starts, j;
+
+	/*
+	 * Be paranoid about clearing APIC errors.
+	 */
+	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+	}
+
+	Dprintk("Asserting INIT.\n");
+
+	/*
+	 * Turn INIT on target chip
+	 */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+	/*
+	 * Send IPI
+	 */
+	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
+				| APIC_DM_INIT);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	mdelay(10);
+
+	Dprintk("Deasserting INIT.\n");
+
+	/* Target chip */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+	/* Send IPI */
+	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	atomic_set(&init_deasserted, 1);
+
+	/*
+	 * Should we send STARTUP IPIs ?
+	 *
+	 * Determine this based on the APIC version.
+	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+	 */
+	if (APIC_INTEGRATED(apic_version[phys_apicid]))
+		num_starts = 2;
+	else
+		num_starts = 0;
+
+	/*
+	 * Run STARTUP IPI loop.
+	 */
+	Dprintk("#startup loops: %d.\n", num_starts);
+
+	maxlvt = get_maxlvt();
+
+	for (j = 1; j <= num_starts; j++) {
+		Dprintk("Sending STARTUP #%d.\n",j);
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+		Dprintk("After apic_write.\n");
+
+		/*
+		 * STARTUP IPI
+		 */
+
+		/* Target chip */
+		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+		/* Boot on the stack */
+		/* Kick the second */
+		apic_write_around(APIC_ICR, APIC_DM_STARTUP
+					| (start_eip >> 12));
+
+		/*
+		 * Give the other CPU some time to accept the IPI.
+		 */
+		udelay(300);
+
+		Dprintk("Startup point 1.\n");
+
+		Dprintk("Waiting for send to finish...\n");
+		timeout = 0;
+		do {
+			Dprintk("+");
+			udelay(100);
+			send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+		} while (send_status && (timeout++ < 1000));
+
+		/*
+		 * Give the other CPU some time to accept the IPI.
+		 */
+		udelay(200);
+		/*
+		 * Due to the Pentium erratum 3AP.
+		 */
+		if (maxlvt > 3) {
+			apic_read_around(APIC_SPIV);
+			apic_write(APIC_ESR, 0);
+		}
+		accept_status = (apic_read(APIC_ESR) & 0xEF);
+		if (send_status || accept_status)
+			break;
+	}
+	Dprintk("After Startup.\n");
+
+	if (send_status)
+		printk("APIC never delivered???\n");
+	if (accept_status)
+		printk("APIC delivery error (%lx).\n", accept_status);
+
+	return (send_status | accept_status);
+}
+#endif	/* WAKE_SECONDARY_VIA_INIT */
+
+extern cpumask_t cpu_initialized;
+
+static int __init do_boot_cpu(int apicid)
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
+ * (It also returns 1 when the wakeup itself succeeded but the CPU never
+ * showed up in cpu_callin_map within the wait loop below.)
+ *
+ * Takes the next logical CPU number from cpucount, forks that CPU's idle
+ * task and points it at start_secondary. On failure the callout and
+ * initialized bits are cleared again and cpucount is decremented.
+ */
+{
+	struct task_struct *idle;
+	unsigned long boot_error;
+	int timeout, cpu;
+	unsigned long start_eip;
+	unsigned short nmi_high = 0, nmi_low = 0;
+
+	cpu = ++cpucount;
+	/*
+	 * We can't use kernel_thread since we must avoid to
+	 * reschedule the child.
+	 */
+	idle = fork_idle(cpu);
+	if (IS_ERR(idle))
+		panic("failed fork for CPU %d", cpu);
+	idle->thread.eip = (unsigned long) start_secondary;
+	/* start_eip had better be page-aligned! */
+	start_eip = setup_trampoline();
+
+	/* So we see what's up   */
+	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+	/* Stack for startup_32 can be just as for start_secondary onwards */
+	stack_start.esp = (void *) idle->thread.esp;
+
+	irq_ctx_init(cpu);
+
+	/*
+	 * This grunge runs the startup process for
+	 * the targeted processor.
+	 */
+
+	atomic_set(&init_deasserted, 0);
+
+	Dprintk("Setting warm reset code and vector.\n");
+
+	store_NMI_vector(&nmi_high, &nmi_low);
+
+	smpboot_setup_warm_reset_vector(start_eip);
+
+	/*
+	 * Starting actual IPI sequence...
+	 */
+	boot_error = wakeup_secondary_cpu(apicid, start_eip);
+
+	if (!boot_error) {
+		/*
+		 * allow APs to start initializing.
+		 */
+		Dprintk("Before Callout %d.\n", cpu);
+		cpu_set(cpu, cpu_callout_map);
+		Dprintk("After Callout %d.\n", cpu);
+
+		/*
+		 * Wait 5s total for a response
+		 */
+		for (timeout = 0; timeout < 50000; timeout++) {
+			if (cpu_isset(cpu, cpu_callin_map))
+				break;	/* It has booted */
+			udelay(100);
+		}
+
+		if (cpu_isset(cpu, cpu_callin_map)) {
+			/* number CPUs logically, starting from 1 (BSP is 0) */
+			Dprintk("OK.\n");
+			printk("CPU%d: ", cpu);
+			print_cpu_info(&cpu_data[cpu]);
+			Dprintk("CPU has booted.\n");
+		} else {
+			boot_error= 1;
+			if (*((volatile unsigned char *)trampoline_base)
+					== 0xA5)
+				/* trampoline started but...? */
+				printk("Stuck ??\n");
+			else
+				/* trampoline code not run */
+				printk("Not responding.\n");
+			inquire_remote_apic(apicid);
+		}
+	}
+	x86_cpu_to_apicid[cpu] = apicid;
+	if (boot_error) {
+		/* Try to put things back the way they were before ... */
+		unmap_cpu_to_logical_apicid(cpu);
+		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+		cpucount--;
+	}
+
+	/* mark "stuck" area as not stuck */
+	*((volatile unsigned long *)trampoline_base) = 0;
+
+	return boot_error;
+}
+/*
+ * NOTE(review): as written, this function only assigns the locals
+ * cachesize and bandwidth; nothing is stored or returned, so it has no
+ * effect beyond reading cpu_khz and boot_cpu_data.x86_cache_size.
+ */
+static void smp_tune_scheduling (void)
+{
+	unsigned long cachesize;       /* kB   */
+	unsigned long bandwidth = 350; /* MB/s */
+	/*
+	 * Rough estimation for SMP scheduling, this is the number of
+	 * cycles it takes for a fully memory-limited process to flush
+	 * the SMP-local cache.
+	 *
+	 * (For a P5 this pretty much means we will choose another idle
+	 *  CPU almost always at wakeup time (this is due to the small
+	 *  L1 cache), on PIIs it's around 50-100 usecs, depending on
+	 *  the cache size)
+	 */
+
+	if (!cpu_khz) {
+		/*
+		 * this basically disables processor-affinity
+		 * scheduling on SMP without a TSC.
+		 */
+		return;
+	} else {
+		cachesize = boot_cpu_data.x86_cache_size;
+		if (cachesize == -1) {
+			cachesize = 16; /* Pentiums, 2x8kB cache */
+			bandwidth = 100;
+		}
+	}
+}
+
+/*
+ * Cycle through the processors sending APIC IPIs to boot each.
+ */
+
+static int boot_cpu_logical_apicid;
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+/*
+ * Bring-up driver run on the boot CPU. Records the boot CPU's info, falls
+ * back to uniprocessor / dummy-APIC operation when no SMP configuration or
+ * local APIC is found or max_cpus is 0, otherwise calls do_boot_cpu() for
+ * each present APIC ID, prints the BogoMIPS total, builds
+ * cpu_sibling_map[] and finally synchronizes the TSCs.
+ */
+static void __init smp_boot_cpus(unsigned int max_cpus)
+{
+	int apicid, cpu, bit, kicked;
+	unsigned long bogosum = 0;
+
+	/*
+	 * Setup boot CPU information
+	 */
+	smp_store_cpu_info(0); /* Final full version of the data */
+	printk("CPU%d: ", 0);
+	print_cpu_info(&cpu_data[0]);
+
+	boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+	boot_cpu_logical_apicid = logical_smp_processor_id();
+	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+	current_thread_info()->cpu = 0;
+	smp_tune_scheduling();
+	cpus_clear(cpu_sibling_map[0]);
+	cpu_set(0, cpu_sibling_map[0]);
+
+	/*
+	 * If we couldn't find an SMP configuration at boot time,
+	 * get out of here now!
+	 */
+	if (!smp_found_config && !acpi_lapic) {
+		printk(KERN_NOTICE "SMP motherboard not detected.\n");
+		smpboot_clear_io_apic_irqs();
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		if (APIC_init_uniprocessor())
+			printk(KERN_NOTICE "Local APIC not detected."
+					   " Using dummy APIC emulation.\n");
+		map_cpu_to_logical_apicid();
+		return;
+	}
+
+	/*
+	 * Should not be necessary because the MP table should list the boot
+	 * CPU too, but we do it for the sake of robustness anyway.
+	 * Makes no sense to do this check in clustered apic mode, so skip it
+	 */
+	if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
+		printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
+				boot_cpu_physical_apicid);
+		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+	}
+
+	/*
+	 * If we couldn't find a local APIC, then get out of here now!
+	 */
+	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
+		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+			boot_cpu_physical_apicid);
+		printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
+		smpboot_clear_io_apic_irqs();
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		return;
+	}
+
+	verify_local_APIC();
+
+	/*
+	 * If SMP should be disabled, then really disable it!
+	 */
+	if (!max_cpus) {
+		smp_found_config = 0;
+		printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+		smpboot_clear_io_apic_irqs();
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		return;
+	}
+
+	connect_bsp_APIC();
+	setup_local_APIC();
+	map_cpu_to_logical_apicid();
+
+
+	setup_portio_remap();
+
+	/*
+	 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
+	 *
+	 * In clustered apic mode, phys_cpu_present_map is constructed thus:
+	 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
+	 * clustered apic ID.
+	 */
+	Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+
+	kicked = 1;
+	for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
+		apicid = cpu_present_to_apicid(bit);
+		/*
+		 * Don't even attempt to start the boot CPU!
+		 */
+		if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
+			continue;
+
+		if (!check_apicid_present(bit))
+			continue;
+		if (max_cpus <= cpucount+1)
+			continue;
+
+		if (do_boot_cpu(apicid))
+			printk("CPU #%d not responding - cannot use it.\n",
+								apicid);
+		else
+			++kicked;
+	}
+
+	/*
+	 * Cleanup possible dangling ends...
+	 */
+	smpboot_restore_warm_reset_vector();
+
+	/*
+	 * Allow the user to impress friends.
+	 */
+	Dprintk("Before bogomips.\n");
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		if (cpu_isset(cpu, cpu_callout_map))
+			bogosum += cpu_data[cpu].loops_per_jiffy;
+	printk(KERN_INFO
+		"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+		cpucount+1,
+		bogosum/(500000/HZ),
+		(bogosum/(5000/HZ))%100);
+
+	Dprintk("Before bogocount - setting activated=1.\n");
+
+	if (smp_b_stepping)
+		printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
+
+	/*
+	 * Don't taint if we are running SMP kernel on a single non-MP
+	 * approved Athlon
+	 */
+	if (tainted & TAINT_UNSAFE_SMP) {
+		if (cpucount)
+			printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
+		else
+			tainted &= ~TAINT_UNSAFE_SMP;
+	}
+
+	Dprintk("Boot done.\n");
+
+	/*
+	 * construct cpu_sibling_map[], so that we can tell sibling CPUs
+	 * efficiently.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpus_clear(cpu_sibling_map[cpu]);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		int siblings = 0;
+		int i;
+		if (!cpu_isset(cpu, cpu_callout_map))
+			continue;
+
+		if (smp_num_siblings > 1) {
+			for (i = 0; i < NR_CPUS; i++) {
+				if (!cpu_isset(i, cpu_callout_map))
+					continue;
+				if (phys_proc_id[cpu] == phys_proc_id[i]) {
+					siblings++;
+					cpu_set(i, cpu_sibling_map[cpu]);
+				}
+			}
+		} else {
+			siblings++;
+			cpu_set(cpu, cpu_sibling_map[cpu]);
+		}
+
+		if (siblings != smp_num_siblings)
+			printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		check_nmi_watchdog();
+
+	smpboot_setup_io_apic();
+
+	setup_boot_APIC_clock();
+
+	/*
+	 * Synchronize the TSC with the AP
+	 */
+	if (cpu_has_tsc && cpucount && cpu_khz)
+		synchronize_tsc_bp();
+}
+
+/* These are wrappers to interface to the new boot process.  Someone
+   who understands all this stuff should rewrite it properly.
--RR 15/Jul/02 */
+/* Boot-process hook: passes max_cpus straight through to smp_boot_cpus(). */
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+	smp_boot_cpus(max_cpus);
+}
+
+/* Set the bit of the CPU we are running on in cpu_online_map and cpu_callout_map. */
+void __devinit smp_prepare_boot_cpu(void)
+{
+	cpu_set(smp_processor_id(), cpu_online_map);
+	cpu_set(smp_processor_id(), cpu_callout_map);
+}
+
+/*
+ * Let a CPU proceed by setting its bit in smp_commenced_mask, then spin
+ * until it shows up in cpu_online_map.
+ *
+ * Returns -ENOSYS if the CPU's bit is already set in smp_commenced_mask,
+ * -EIO if the CPU is not in cpu_callin_map, and 0 otherwise.
+ * local_irq_enable() is called on every path before returning.
+ */
+int __devinit __cpu_up(unsigned int cpu)
+{
+	/* This only works at boot for x86.  See "rewrite" above. */
+	if (cpu_isset(cpu, smp_commenced_mask)) {
+		local_irq_enable();
+		return -ENOSYS;
+	}
+
+	/* In case one didn't come up */
+	if (!cpu_isset(cpu, cpu_callin_map)) {
+		local_irq_enable();
+		return -EIO;
+	}
+
+	local_irq_enable();
+	/* Unleash the CPU! */
+	cpu_set(cpu, smp_commenced_mask);
+	/* Busy-wait, with a memory barrier per iteration, for the online bit. */
+	while (!cpu_isset(cpu, cpu_online_map))
+		mb();
+	return 0;
+}
+
+/*
+ * Final SMP bring-up hook: sets IO-APIC destinations (when configured),
+ * zaps the low mappings and updates the trampoline's exec permission.
+ * max_cpus is not used here.
+ */
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_IO_APIC
+	setup_ioapic_dest();
+#endif
+	zap_low_mappings();
+	/*
+	 * Disable executability of the SMP trampoline:
+	 */
+	set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+}
+
+/* Install the interrupt gates for IRQ0 and for the three SMP IPI vectors. */
+void __init smp_intr_init(void)
+{
+	/*
+	 * IRQ0 must be given a fixed assignment and initialized,
+	 * because it's used before the IO-APIC is set up.
+	 */
+	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPI for invalidation */
+	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+	/* IPI for generic function call */
+	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+}
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
new file mode 100644
index 000000000000..7b3b27d64409
--- /dev/null
+++ b/arch/i386/kernel/srat.c
@@ -0,0 +1,456 @@
+/*
+ * Some of the code in this file has been gleaned from the 64 bit
+ * discontigmem support code base.
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to Pat Gaughen <gone@us.ibm.com>
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/acpi.h>
+#include <linux/nodemask.h>
+#include <asm/srat.h>
+#include <asm/topology.h>
+
+/*
+ * proximity macros and definitions
+ */
+#define NODE_ARRAY_INDEX(x)	((x) / 8)	/* 8 bits/char */
+#define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */
+#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
+#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
+#define MAX_PXM_DOMAINS		256	/* 1 byte and no promises about values */
+/* bitmap length; _PXM is at most 255 */
+#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
+static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
+
+#define MAX_CHUNKS_PER_NODE	4
+#define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
+struct node_memory_chunk_s {
+	unsigned long	start_pfn;
+	unsigned long	end_pfn;
+	u8	pxm;		// proximity domain of node
+	u8	nid;		// which cnode contains this chunk?
+	u8	bank;		// which mem bank on this node
+};
+static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
+
+static int num_memory_chunks;		/* total number of memory chunks */
+static int zholes_size_init;
+static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
+
+extern void * boot_ioremap(unsigned long, unsigned long);
+
+/*
+ * Identify CPU proximity domains.
+ *
+ * p points at one SRAT processor-affinity entry.  Disabled entries are
+ * ignored; for enabled ones the proximity domain is recorded in
+ * pxm_bitmap and the APIC id / domain pair is logged.
+ */
+static void __init parse_cpu_affinity_structure(char *p)
+{
+	struct acpi_table_processor_affinity *cpu_affinity =
+				(struct acpi_table_processor_affinity *) p;
+
+	if (!cpu_affinity->flags.enabled)
+		return;		/* empty entry */
+
+	/* mark this node as "seen" in node bitmap */
+	BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain);
+
+	printk("CPU 0x%02X in proximity domain 0x%02X\n",
+		cpu_affinity->apic_id, cpu_affinity->proximity_domain);
+}
+
+/*
+ * Identify memory proximity domains and hot-remove capabilities.
+ * Fill node memory chunk list structure.
+ *
+ * sratp points at one SRAT memory-affinity entry.  Enabled entries are
+ * inserted into node_memory_chunk[] so that the array stays sorted by
+ * start_pfn; entries beyond MAXCHUNKS are reported and dropped.
+ */
+static void __init parse_memory_affinity_structure (char *sratp)
+{
+	unsigned long long paddr, size;
+	unsigned long start_pfn, end_pfn;
+	u8 pxm;
+	struct node_memory_chunk_s *p, *pend;
+	struct acpi_table_memory_affinity *memory_affinity =
+			(struct acpi_table_memory_affinity *) sratp;
+
+	if (!memory_affinity->flags.enabled)
+		return;		/* empty entry */
+
+	/* mark this node as "seen" in node bitmap */
+	BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain);
+
+	/* calculate info for memory chunk structure */
+	paddr = memory_affinity->base_addr_hi;
+	paddr = (paddr << 32) | memory_affinity->base_addr_lo;
+	size = memory_affinity->length_hi;
+	size = (size << 32) | memory_affinity->length_lo;
+
+	start_pfn = paddr >> PAGE_SHIFT;
+	end_pfn = (paddr + size) >> PAGE_SHIFT;
+
+	pxm = memory_affinity->proximity_domain;
+
+	if (num_memory_chunks >= MAXCHUNKS) {
+		printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
+			size/(1024*1024), paddr);
+		return;
+	}
+
+	/* Insertion sort based on base address */
+	pend = &node_memory_chunk[num_memory_chunks];
+	for (p = &node_memory_chunk[0]; p < pend; p++) {
+		if (start_pfn < p->start_pfn)
+			break;
+	}
+	/*
+	 * Open a slot at p by moving the (pend - p) used entries up by one.
+	 * Only used entries are moved, so the highest element written is
+	 * *pend, which the MAXCHUNKS check above keeps inside the array.
+	 * (Shifting from pend itself would write pend[1], one past the
+	 * array when it is almost full.)
+	 */
+	if (p < pend)
+		memmove(p + 1, p, (pend - p) * sizeof(*p));
+	p->start_pfn = start_pfn;
+	p->end_pfn = end_pfn;
+	p->pxm = pxm;
+
+	num_memory_chunks++;
+
+	printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
+		start_pfn, end_pfn,
+		memory_affinity->memory_type,
+		memory_affinity->proximity_domain,
+		(memory_affinity->flags.hot_pluggable ?
+		 "enabled and removable" : "enabled" ) );
+}
+
+#if MAX_NR_ZONES != 3
+#error "MAX_NR_ZONES != 3, chunk_to_zone requires review"
+#endif
+/* Take a chunk of pages from page frame cstart to cend and count the number
+ * of pages in each zone, returned via zones[].
+ *
+ * zones[] must have room for MAX_NR_ZONES entries; it is cleared first, so
+ * the result describes this range only and does not accumulate.
+ */
+static __init void chunk_to_zones(unsigned long cstart, unsigned long cend,
+		unsigned long *zones)
+{
+	unsigned long max_dma;
+	extern unsigned long max_low_pfn;
+
+	int z;
+	unsigned long rend;
+
+	/* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
+	 * similarly scoped information and should be handled in a consistent
+	 * manner.
+	 */
+	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+	/* Split the hole into the zones in which it falls.  Repeatedly
+	 * take the segment in which the remaining hole starts, round it
+	 * to the end of that zone.
+	 */
+	memset(zones, 0, MAX_NR_ZONES * sizeof(long));
+	while (cstart < cend) {
+		if (cstart < max_dma) {
+			z = ZONE_DMA;
+			rend = (cend < max_dma)? cend : max_dma;
+
+		} else if (cstart < max_low_pfn) {
+			z = ZONE_NORMAL;
+			rend = (cend < max_low_pfn)? cend : max_low_pfn;
+
+		} else {
+			z = ZONE_HIGHMEM;
+			rend = cend;
+		}
+		zones[z] += rend - cstart;
+		cstart = rend;
+	}
+}
+
+/*
+ * The SRAT table always lists ascending addresses, so can always
+ * assume that the first "start" address that you see is the real
+ * start of the node, and that the current "end" address is after
+ * the previous one.
+ *
+ * Widens node_start_pfn[nid] / node_end_pfn[nid] to cover memory_chunk,
+ * provided the chunk belongs to nid and starts below max_pfn.
+ */
+static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
+{
+	/*
+	 * Only add present memory as told by the e820.
+	 * There is no guarantee from the SRAT that the memory it
+	 * enumerates is present at boot time because it represents
+	 * *possible* memory hotplug areas the same as normal RAM.
+	 */
+	if (memory_chunk->start_pfn >= max_pfn) {
+		printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n",
+			memory_chunk->start_pfn, memory_chunk->end_pfn);
+		return;
+	}
+	if (memory_chunk->nid != nid)
+		return;
+
+	if (!node_has_online_mem(nid))
+		node_start_pfn[nid] = memory_chunk->start_pfn;
+
+	if (node_start_pfn[nid] > memory_chunk->start_pfn)
+		node_start_pfn[nid] = memory_chunk->start_pfn;
+
+	if (node_end_pfn[nid] < memory_chunk->end_pfn)
+		node_end_pfn[nid] = memory_chunk->end_pfn;
+}
+
+/*
+ * Parse the ACPI Static Resource Affinity Table.
+ *
+ * Walks the entries that follow the SRAT header, builds pxm_bitmap and
+ * node_memory_chunk[], assigns sequential node ids to the proximity
+ * domains that were seen and derives the per-node pfn ranges.
+ *
+ * Returns 1 on success, 0 if the table yielded no memory chunks.
+ */
+static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
+{
+	u8 *start, *end, *p;
+	int i, j, nid;
+	u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */
+	u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */
+
+	start = (u8 *)(&(sratp->reserved) + 1);	/* skip header */
+	p = start;
+	end = (u8 *)sratp + sratp->header.length;
+
+	memset(pxm_bitmap, 0, sizeof(pxm_bitmap));	/* init proximity domain bitmap */
+	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
+	memset(zholes_size, 0, sizeof(zholes_size));
+
+	/* -1 in these maps means not available */
+	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
+	memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
+
+	num_memory_chunks = 0;
+	/*
+	 * Every entry starts with a type byte (p[0]) and a length byte
+	 * (p[1]).  Require both to lie inside the table before looking at
+	 * them, and validate the length of the entry we are about to parse
+	 * rather than peeking at whatever follows it.
+	 */
+	while (end - p >= 2) {
+		if (p[1] == 0) {
+			printk("acpi20_parse_srat: Entry length value is zero;"
+				" can't parse any further!\n");
+			break;
+		}
+		switch (*p) {
+		case ACPI_SRAT_PROCESSOR_AFFINITY:
+			parse_cpu_affinity_structure(p);
+			break;
+		case ACPI_SRAT_MEMORY_AFFINITY:
+			parse_memory_affinity_structure(p);
+			break;
+		default:
+			printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
+			break;
+		}
+		p += p[1];
+	}
+
+	if (num_memory_chunks == 0) {
+		printk("could not find any ACPI SRAT memory areas.\n");
+		goto out_fail;
+	}
+
+	/* Calculate total number of nodes in system from PXM bitmap and create
+	 * a set of sequential node IDs starting at zero.  (ACPI doesn't seem
+	 * to specify the range of _PXM values.)
+	 */
+	/*
+	 * MCD - we no longer HAVE to number nodes sequentially.  PXM domain
+	 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
+	 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
+	 * approaches MAX_PXM_DOMAINS for i386.
+	 */
+	/*
+	 * NOTE(review): nothing below limits the number of distinct proximity
+	 * domains to MAX_NUMNODES before indexing nid_to_pxm_map[] and calling
+	 * node_set_online() - confirm firmware cannot report more.
+	 */
+	nodes_clear(node_online_map);
+	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
+		if (BMAP_TEST(pxm_bitmap, i)) {
+			nid = num_online_nodes();
+			pxm_to_nid_map[i] = nid;
+			nid_to_pxm_map[nid] = i;
+			node_set_online(nid);
+		}
+	}
+	BUG_ON(num_online_nodes() == 0);
+
+	/* set cnode id in memory chunk structure */
+	for (i = 0; i < num_memory_chunks; i++)
+		node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm];
+
+	printk("pxm bitmap: ");
+	for (i = 0; i < sizeof(pxm_bitmap); i++) {
+		printk("%02X ", pxm_bitmap[i]);
+	}
+	printk("\n");
+	printk("Number of logical nodes in system = %d\n", num_online_nodes());
+	printk("Number of memory chunks in system = %d\n", num_memory_chunks);
+
+	for (j = 0; j < num_memory_chunks; j++){
+		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
+		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
+		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
+		node_read_chunk(chunk->nid, chunk);
+	}
+
+	for_each_online_node(nid) {
+		unsigned long start = node_start_pfn[nid];
+		unsigned long end = node_end_pfn[nid];
+
+		memory_present(nid, start, end);
+		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
+	}
+	return 1;
+out_fail:
+	return 0;
+}
+
+/*
+ * Locate the ACPI SRAT by walking RSDP -> RSDT -> table entries and hand
+ * it to acpi20_parse_srat().
+ *
+ * Returns the result of acpi20_parse_srat() when an SRAT is found, and 0
+ * (after logging a message) on any failure along the way.
+ */
+int __init get_memcfg_from_srat(void)
+{
+	struct acpi_table_header	*header = NULL;
+	struct acpi_table_rsdp		*rsdp = NULL;
+	struct acpi_table_rsdt		*rsdt = NULL;
+	struct acpi_pointer		rsdp_address;
+	struct acpi_table_rsdt		saved_rsdt;
+	int tables = 0;
+	int i = 0;
+
+	/*
+	 * The lookup result is read back through rsdp_address, so it needs
+	 * real storage (handing over a NULL pointer and dereferencing it
+	 * afterwards cannot work).  Zero it first so that a failed lookup
+	 * leaves a null physical address instead of stack garbage.
+	 */
+	memset(&rsdp_address, 0, sizeof(rsdp_address));
+	acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING, &rsdp_address);
+
+	if (rsdp_address.pointer_type == ACPI_PHYSICAL_POINTER) {
+		printk("%s: assigning address to rsdp\n", __FUNCTION__);
+		rsdp = (struct acpi_table_rsdp *)
+				(u32)rsdp_address.pointer.physical;
+	} else {
+		printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__);
+		goto out_err;
+	}
+	if (!rsdp) {
+		printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
+		goto out_err;
+	}
+
+	printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
+		rsdp->oem_id);
+
+	if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) {
+		printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
+		goto out_err;
+	}
+
+	rsdt = (struct acpi_table_rsdt *)
+	    boot_ioremap(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt));
+
+	if (!rsdt) {
+		printk(KERN_WARNING
+		       "%s: ACPI: Invalid root system description tables (RSDT)\n",
+		       __FUNCTION__);
+		goto out_err;
+	}
+
+	header = & rsdt->header;
+
+	if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) {
+		printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
+		goto out_err;
+	}
+
+	/*
+	 * The number of tables is computed by taking the
+	 * size of all entries (total size of the RSDT minus
+	 * the header size) divided by the size of each entry
+	 * (4-byte table pointers).
+	 */
+	tables = (header->length - sizeof(struct acpi_table_header)) / 4;
+
+	if (!tables)
+		goto out_err;
+
+	memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
+
+	/* Entries beyond the local copy were not saved, so refuse to walk them. */
+	if (saved_rsdt.header.length > sizeof(saved_rsdt)) {
+		printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
+		       saved_rsdt.header.length);
+		goto out_err;
+	}
+
+	printk("Begin SRAT table scan....\n");
+
+	for (i = 0; i < tables; i++) {
+		/* Map in header, then map in full table length. */
+		header = (struct acpi_table_header *)
+			boot_ioremap(saved_rsdt.entry[i], sizeof(struct acpi_table_header));
+		if (!header)
+			break;
+		header = (struct acpi_table_header *)
+			boot_ioremap(saved_rsdt.entry[i], header->length);
+		if (!header)
+			break;
+
+		if (strncmp((char *) &header->signature, "SRAT", 4))
+			continue;
+
+		/* we've found the srat table. don't need to look at any more tables */
+		return acpi20_parse_srat((struct acpi_table_srat *)header);
+	}
+out_err:
+	printk("failed to get NUMA memory information from SRAT table\n");
+	return 0;
+}
+
+/* For each node run the memory list to determine whether there are
+ * any memory holes.  For each hole determine which ZONE they fall
+ * into.
+ *
+ * NOTE#1: this requires knowledge of the zone boundaries and so
+ * _cannot_ be performed before those are calculated in setup_memory.
+ *
+ * NOTE#2: we rely on the fact that the memory chunks are ordered by
+ * start pfn number during setup.
+ */
+static void __init get_zholes_init(void)
+{
+	int nid;
+	int c;
+	int z;
+	int first;
+	unsigned long end = 0;
+	unsigned long gap[MAX_NR_ZONES];
+
+	for_each_online_node(nid) {
+		first = 1;
+		for (c = 0; c < num_memory_chunks; c++){
+			if (node_memory_chunk[c].nid != nid)
+				continue;
+
+			if (!first) {
+				/* Record any gap between this chunk
+				 * and the previous chunk on this node
+				 * against the zones it spans.
+				 *
+				 * chunk_to_zones() clears its output, so
+				 * collect each gap in a scratch array and
+				 * add it to the node's running totals.
+				 */
+				chunk_to_zones(end,
+					node_memory_chunk[c].start_pfn,
+					gap);
+				for (z = 0; z < MAX_NR_ZONES; z++)
+					zholes_size[nid * MAX_NR_ZONES + z] += gap[z];
+			}
+
+			/* Track the highest end seen so far, so the next gap
+			 * is measured from the previous chunk and not from
+			 * the first one.
+			 */
+			if (first || node_memory_chunk[c].end_pfn > end)
+				end = node_memory_chunk[c].end_pfn;
+			first = 0;
+		}
+	}
+}
+
+/*
+ * Return the MAX_NR_ZONES-entry hole-size array for node nid, computing
+ * all hole sizes on the first call.
+ *
+ * NOTE(review): an invalid or offline nid is only reported; the address
+ * is still computed from it, so callers must pass a valid node id.
+ */
+unsigned long * __init get_zholes_size(int nid)
+{
+	if (!zholes_size_init) {
+		zholes_size_init++;
+		get_zholes_init();
+	}
+	if (nid >= MAX_NUMNODES || !node_online(nid))
+		printk("%s: nid = %d is invalid/offline. num_online_nodes = %d\n",
+		       __FUNCTION__, nid, num_online_nodes());
+	return &zholes_size[nid * MAX_NR_ZONES];
+}
diff --git a/arch/i386/kernel/summit.c b/arch/i386/kernel/summit.c
new file mode 100644
index 000000000000..d0e01a3acf35
--- /dev/null
+++ b/arch/i386/kernel/summit.c
@@ -0,0 +1,180 @@
+/*
+ * arch/i386/kernel/summit.c - IBM Summit-Specific Code
+ *
+ * Written By: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (c) 2003 IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.
See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/mach-summit/mach_mpparse.h>
+
+static struct rio_table_hdr *rio_table_hdr __initdata;
+static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata;
+
+/*
+ * Assign a node to every PCI bus controlled by the Winnipeg at
+ * rio_devs[wpeg_num], starting at bus number last_bus.
+ *
+ * The node is found by following the Winnipeg's owner_id to a Rio device
+ * and that device's owner_id to a scalability device.  Returns the first
+ * bus number after the ones just assigned, or last_bus unchanged if the
+ * owner chain cannot be resolved or the Winnipeg type is not recognised.
+ */
+static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
+{
+	int twister = 0, node = 0;
+	int i, bus, num_buses;
+
+	for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
+		if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
+			twister = rio_devs[i]->owner_id;
+			break;
+		}
+	}
+	if (i == rio_table_hdr->num_rio_dev){
+		printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
+		return last_bus;
+	}
+
+	for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
+		if (scal_devs[i]->node_id == twister){
+			node = scal_devs[i]->node_id;
+			break;
+		}
+	}
+	if (i == rio_table_hdr->num_scal_dev){
+		printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
+		return last_bus;
+	}
+
+	switch (rio_devs[wpeg_num]->type){
+	case CompatWPEG:
+		/* The Compatibility Winnipeg controls the 2 legacy buses,
+		 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
+		 * a PCI-PCI bridge card is used in either slot: total 5 buses.
+		 */
+		num_buses = 5;
+		break;
+	case AltWPEG:
+		/* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
+		 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
+		 * the "extra" buses for each of those slots: total 7 buses.
+		 */
+		num_buses = 7;
+		break;
+	case LookOutAWPEG:
+	case LookOutBWPEG:
+		/* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
+		 * & the "extra" buses for each of those slots: total 9 buses.
+		 */
+		num_buses = 9;
+		break;
+	default:
+		printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
+		return last_bus;
+	}
+
+	for(bus = last_bus; bus < last_bus + num_buses; bus++)
+		mp_bus_id_to_node[bus] = node;
+	return bus;
+}
+
+/*
+ * Point scal_devs[] and rio_devs[] at the detail records that follow the
+ * Rio Grande table header.  The per-record sizes depend on the table
+ * version (2 or 3).
+ *
+ * Returns 1 on success, 0 if the table version is unknown or the table
+ * describes more devices than the arrays can hold.
+ */
+static int __init build_detail_arrays(void)
+{
+	unsigned long ptr;
+	int i, scal_detail_size, rio_detail_size;
+
+	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
+		printk(KERN_WARNING "%s: MAX_NUMNODES too low!  Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+		return 0;
+	}
+
+	/* rio_devs[] is filled from a firmware-supplied count as well, so it
+	 * needs the same bound check as scal_devs[] above.
+	 */
+	if (rio_table_hdr->num_rio_dev > MAX_NUMNODES*4){
+		printk(KERN_WARNING "%s: Too many Rio devices!  Room for %d, but system has %d.\n", __FUNCTION__, MAX_NUMNODES*4, rio_table_hdr->num_rio_dev);
+		return 0;
+	}
+
+	switch (rio_table_hdr->version){
+	default:
+		printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
+		return 0;
+	case 2:
+		scal_detail_size = 11;
+		rio_detail_size = 13;
+		break;
+	case 3:
+		scal_detail_size = 12;
+		rio_detail_size = 15;
+		break;
+	}
+
+	/* The detail records start 3 bytes after the start of the header. */
+	ptr = (unsigned long)rio_table_hdr + 3;
+	for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
+		scal_devs[i] = (struct scal_detail *)ptr;
+
+	for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
+		rio_devs[i] = (struct rio_detail *)ptr;
+
+	return 1;
+}
+
+/*
+ * Find the Rio Grande table in the EBDA, index its detail records and
+ * map the PCI buses of each Winnipeg, in WP_index order, to nodes.
+ * Returns silently (after logging) if the table cannot be found or used.
+ */
+void __init setup_summit(void)
+{
+	unsigned long		ptr;
+	unsigned short		offset;
+	int			i, next_wpeg, next_bus = 0;
+
+	/* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
+	ptr = *(unsigned short *)phys_to_virt(0x40Eul);
+	ptr = (unsigned long)phys_to_virt(ptr << 4);
+
+	rio_table_hdr = NULL;
+	offset = 0x180;
+	while (offset){
+		/* The block id is stored in the 2nd word */
+		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+			/* set the pointer past the offset & block id */
+			rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+			break;
+		}
+		/* The next offset is stored in the 1st word.  0 means no more */
+		offset = *((unsigned short *)(ptr + offset));
+	}
+	if (!rio_table_hdr){
+		printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
+		return;
+	}
+
+	if (!build_detail_arrays())
+		return;
+
+	/* The first Winnipeg we're looking for has an index of 0 */
+	next_wpeg = 0;
+	do {
+		for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
+			if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
+				/* It's the Winnipeg we're looking for! */
+				next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
+				next_wpeg++;
+				break;
+			}
+		}
+		/*
+		 * If we go through all Rio devices and don't find one with
+		 * the next index, it means we've found all the Winnipegs,
+		 * and thus all the PCI buses.
+		 */
+		if (i == rio_table_hdr->num_rio_dev)
+			next_wpeg = 0;
+	} while (next_wpeg != 0);
+}
diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
new file mode 100644
index 000000000000..a4a61976ecb9
--- /dev/null
+++ b/arch/i386/kernel/sys_i386.c
@@ -0,0 +1,252 @@
+/*
+ * linux/arch/i386/kernel/sys_i386.c
+ *
+ * This file contains various random system calls that
+ * have a non-standard calling sequence on the Linux/i386
+ * platform.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+
+#include <asm/uaccess.h>
+#include <asm/ipc.h>
+
+/*
+ * sys_pipe() is the normal C calling standard for creating
+ * a pipe. It's not the way Unix traditionally does this, though.
+ */
+asmlinkage int sys_pipe(unsigned long __user * fildes)
+{
+	int fd[2];
+	int error;
+
+	error = do_pipe(fd);
+	if (!error) {
+		/* Hand both descriptors back; a failed copy turns into -EFAULT. */
+		if (copy_to_user(fildes, fd, 2*sizeof(int)))
+			error = -EFAULT;
+	}
+	return error;
+}
+
+/*
+ * common code for old and new mmaps
+ *
+ * pgoff is passed to do_mmap_pgoff() unchanged, so callers supply the
+ * offset in pages.  Unless MAP_ANONYMOUS is set, fd is resolved with
+ * fget() and -EBADF is returned if that fails.  MAP_EXECUTABLE and
+ * MAP_DENYWRITE are stripped from flags.  The mapping is made with the
+ * current task's mmap_sem held for writing.
+ */
+static inline long do_mmap2(
+	unsigned long addr, unsigned long len,
+	unsigned long prot, unsigned long flags,
+	unsigned long fd, unsigned long pgoff)
+{
+	int error = -EBADF;
+	struct file * file = NULL;
+
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+	if (!(flags & MAP_ANONYMOUS)) {
+		file = fget(fd);
+		if (!file)
+			goto out;
+	}
+
+	down_write(&current->mm->mmap_sem);
+	error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	up_write(&current->mm->mmap_sem);
+
+	/* Drop the reference taken by fget() above. */
+	if (file)
+		fput(file);
+out:
+	return error;
+}
+
+/* mmap2 system call: a direct wrapper around do_mmap2(). */
+asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
+	unsigned long prot, unsigned long flags,
+	unsigned long fd, unsigned long pgoff)
+{
+	return do_mmap2(addr, len, prot, flags, fd, pgoff);
+}
+
+/*
+ * Perform the select(nd, in, out, ex, tv) and mmap() system
+ * calls. Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing..
+ */
+
+/* User-space argument block read by old_mmap(). */
+struct mmap_arg_struct {
+	unsigned long addr;
+	unsigned long len;
+	unsigned long prot;
+	unsigned long flags;
+	unsigned long fd;
+	unsigned long offset;
+};
+
+/*
+ * mmap() variant taking its arguments as a struct in user memory.
+ * Returns -EFAULT if the block cannot be copied in and -EINVAL if
+ * offset has any bits set outside PAGE_MASK; otherwise offset is
+ * shifted down by PAGE_SHIFT and the call is forwarded to do_mmap2().
+ */
+asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
+{
+	struct mmap_arg_struct a;
+	int err = -EFAULT;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		goto out;
+
+	err = -EINVAL;
+	if (a.offset & ~PAGE_MASK)
+		goto out;
+
+	err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT);
+out:
+	return err;
+}
+
+
+/* User-space argument block read by old_select(). */
+struct sel_arg_struct {
+	unsigned long n;
+	fd_set __user *inp, *outp, *exp;
+	struct timeval __user *tvp;
+};
+
+/*
+ * select() variant taking its arguments as a struct in user memory.
+ * Returns -EFAULT if the block cannot be copied in, otherwise whatever
+ * sys_select() returns.
+ */
+asmlinkage int old_select(struct sel_arg_struct __user *arg)
+{
+	struct sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	/* sys_select() does the appropriate kernel locking */
+	return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
+}
+
+/*
+ * sys_ipc() is the de-multiplexer for the SysV IPC calls..
+ *
+ * This is really horribly ugly.
+ *
+ * The low 16 bits of call select the operation and the high 16 bits
+ * carry a version number that changes how MSGRCV and SHMAT interpret
+ * their arguments.  Unknown operations return -ENOSYS.
+ */
+asmlinkage int sys_ipc (uint call, int first, int second,
+			int third, void __user *ptr, long fifth)
+{
+	int version, ret;
+
+	version = call >> 16; /* hack for backward compatibility */
+	call &= 0xffff;
+
+	switch (call) {
+	case SEMOP:
+		return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
+	case SEMTIMEDOP:
+		return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
+					(const struct timespec __user *)fifth);
+
+	case SEMGET:
+		return sys_semget (first, second, third);
+	case SEMCTL: {
+		union semun fourth;
+		if (!ptr)
+			return -EINVAL;
+		/* ptr points at the semun value; fetch it from user space. */
+		if (get_user(fourth.__pad, (void __user * __user *) ptr))
+			return -EFAULT;
+		return sys_semctl (first, second, third, fourth);
+	}
+
+	case MSGSND:
+		return sys_msgsnd (first, (struct msgbuf __user *) ptr,
+				   second, third);
+	case MSGRCV:
+		switch (version) {
+		case 0: {
+			/* Version 0: ptr is an ipc_kludge holding msgp and msgtyp. */
+			struct ipc_kludge tmp;
+			if (!ptr)
+				return -EINVAL;
+
+			if (copy_from_user(&tmp,
+					   (struct ipc_kludge __user *) ptr,
+					   sizeof (tmp)))
+				return -EFAULT;
+			return sys_msgrcv (first, tmp.msgp, second,
+					   tmp.msgtyp, third);
+		}
+		default:
+			return sys_msgrcv (first,
+					   (struct msgbuf __user *) ptr,
+					   second, fifth, third);
+		}
+	case MSGGET:
+		return sys_msgget ((key_t) first, second);
+	case MSGCTL:
+		return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
+
+	case SHMAT:
+		switch (version) {
+		default: {
+			/* The attach address is written back through third. */
+			ulong raddr;
+			ret = do_shmat (first, (char __user *) ptr, second, &raddr);
+			if (ret)
+				return ret;
+			return put_user (raddr, (ulong __user *) third);
+		}
+		case 1:	/* iBCS2 emulator entry point */
+			if (!segment_eq(get_fs(), get_ds()))
+				return -EINVAL;
+			/* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
+			return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
+		}
+	case SHMDT:
+		return sys_shmdt ((char __user *)ptr);
+	case SHMGET:
+		return sys_shmget (first, second, third);
+	case SHMCTL:
+		return sys_shmctl (first, second,
+				   (struct shmid_ds __user *) ptr);
+	default:
+		return -ENOSYS;
+	}
+}
+
+/*
+ * Old cruft
+ *
+ * Copies sizeof(*name) bytes of system_utsname to user space while
+ * holding uts_sem for reading.  Returns -EFAULT for a NULL pointer or a
+ * failed copy, 0 otherwise.
+ */
+asmlinkage int sys_uname(struct old_utsname __user * name)
+{
+	int err;
+	if (!name)
+		return -EFAULT;
+	down_read(&uts_sem);
+	err=copy_to_user(name, &system_utsname, sizeof (*name));
+	up_read(&uts_sem);
+	return err?-EFAULT:0;
+}
+
+/*
+ * Copies the first __OLD_UTS_LEN bytes of each of the five utsname
+ * fields to user space and stores a 0 at offset __OLD_UTS_LEN of each
+ * field, all under the uts_sem read lock.  Returns -EFAULT for a NULL
+ * or unwritable buffer or if any copy fails, 0 otherwise.
+ */
+asmlinkage int sys_olduname(struct oldold_utsname __user * name)
+{
+	int error;
+
+	if (!name)
+		return -EFAULT;
+	if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+		return -EFAULT;
+
+	down_read(&uts_sem);
+
+	/* Failures are OR-ed together and collapsed to -EFAULT below. */
+	error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+	error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
+	error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+	error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
+	error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+	error |= __put_user(0,name->release+__OLD_UTS_LEN);
+	error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+	error |= __put_user(0,name->version+__OLD_UTS_LEN);
+	error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
+	error |= __put_user(0,name->machine+__OLD_UTS_LEN);
+
+	up_read(&uts_sem);
+
+	error = error ? -EFAULT : 0;
+
+	return error;
+}
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
new file mode 100644
index 000000000000..960d8bd137d0
--- /dev/null
+++ b/arch/i386/kernel/sysenter.c
@@ -0,0 +1,65 @@
+/*
+ * linux/arch/i386/kernel/sysenter.c
+ *
+ * (C) Copyright 2002 Linus Torvalds
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/elf.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include <asm/pgtable.h>
+#include <asm/unistd.h>
+
+extern asmlinkage void sysenter_entry(void);
+
+/*
+ * Program the SYSENTER MSRs on the calling CPU.
+ *
+ * ss1/esp1 of this CPU's init_tss are set first; esp1 is the address
+ * just past the end of the tss_struct.  The CS, ESP and EIP MSRs are
+ * then written with __KERNEL_CS, that esp1 value and the address of
+ * sysenter_entry.  The work runs between get_cpu() and put_cpu().
+ * info is unused.
+ */
+void enable_sep_cpu(void *info)
+{
+	int cpu = get_cpu();
+	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+	tss->ss1 = __KERNEL_CS;
+	tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+	wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0);
+	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
+	put_cpu();
+}
+
+/*
+ * These symbols are defined by vsyscall.o to mark the bounds
+ * of the ELF DSO images included therein.
+ */
+extern const char vsyscall_int80_start, vsyscall_int80_end;
+extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+
+/*
+ * Allocate the vsyscall page, map it at FIX_VSYSCALL and fill it with
+ * the int80 image, or with the sysenter image when the boot CPU has
+ * X86_FEATURE_SEP.  In the sysenter case enable_sep_cpu() is then run
+ * on every CPU.  Always returns 0; panics if the page cannot be
+ * allocated.
+ */
+static int __init sysenter_setup(void)
+{
+	void *page = (void *)get_zeroed_page(GFP_ATOMIC);
+
+	/*
+	 * Without the page there is nothing sane to map or copy into:
+	 * carrying on would feed a NULL pointer to __pa() and memcpy().
+	 * Stop with a clear message instead.
+	 */
+	if (!page)
+		panic("sysenter_setup: unable to allocate the vsyscall page");
+
+	__set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
+
+	if (!boot_cpu_has(X86_FEATURE_SEP)) {
+		memcpy(page,
+		       &vsyscall_int80_start,
+		       &vsyscall_int80_end - &vsyscall_int80_start);
+		return 0;
+	}
+
+	memcpy(page,
+	       &vsyscall_sysenter_start,
+	       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
+
+	on_each_cpu(enable_sep_cpu, NULL, 1, 1);
+	return 0;
+}
+
+__initcall(sysenter_setup);
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
new file mode 100644
index 000000000000..9b55e30e4490
--- /dev/null
+++ b/arch/i386/kernel/time.c
@@ -0,0 +1,476 @@
+/*
+ *  linux/arch/i386/kernel/time.c
+ *
+ *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
+ *
+ * This file contains the PC-specific time handling details:
+ * reading the RTC at bootup, etc..
+ * 1994-07-02    Alan Modra
+ *	fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
+ * 1995-03-26    Markus Kuhn
+ *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
+ *      precision CMOS clock update
+ * 1996-05-03    Ingo Molnar
+ *      fixed time warps in do_[slow|fast]_gettimeoffset()
+ * 1997-09-10	Updated NTP code according to technical memorandum Jan '96
+ *		"A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-09-05    (Various)
+ *	More robust do_fast_gettimeoffset() algorithm implemented
+ *	(works with APM, Cyrix 6x86MX and Centaur C6),
+ *	monotonic gettimeofday() with fast_get_timeoffset(),
+ *	drift-proof precision TSC calibration on boot
+ *	(C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
+ *	Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
+ *	ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
+ * 1998-12-16 Andrea Arcangeli + * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy + * because was not accounting lost_ticks. + * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli + * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/time.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/sysdev.h> +#include <linux/bcd.h> +#include <linux/efi.h> +#include <linux/mca.h> + +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/irq.h> +#include <asm/msr.h> +#include <asm/delay.h> +#include <asm/mpspec.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/timer.h> + +#include "mach_time.h" + +#include <linux/timex.h> +#include <linux/config.h> + +#include <asm/hpet.h> + +#include <asm/arch_hooks.h> + +#include "io_ports.h" + +extern spinlock_t i8259A_lock; +int pit_latch_buggy; /* extern */ + +#include "do_timer.h" + +u64 jiffies_64 = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ + +extern unsigned long wall_jiffies; + +DEFINE_SPINLOCK(rtc_lock); + +DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +struct timer_opts *cur_timer = &timer_none; + +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock = 0; +EXPORT_SYMBOL(cmos_lock); + +/* Routines for accessing the CMOS RAM/RTC. 
*/ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + val = inb_p(RTC_PORT(1)); + lock_cmos_suffix(addr); + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + outb_p(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +/* + * This version of gettimeofday has microsecond resolution + * and better than microsecond precision on fast x86 machines with TSC. + */ +void do_gettimeofday(struct timeval *tv) +{ + unsigned long seq; + unsigned long usec, sec; + unsigned long max_ntp_tick; + + do { + unsigned long lost; + + seq = read_seqbegin(&xtime_lock); + + usec = cur_timer->get_offset(); + lost = jiffies - wall_jiffies; + + /* + * If time_adjust is negative then NTP is slowing the clock + * so make sure not to go into next possible interval. + * Better to lose some accuracy than have time go backwards.. + */ + if (unlikely(time_adjust < 0)) { + max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; + usec = min(usec, max_ntp_tick); + + if (lost) + usec += lost * max_ntp_tick; + } + else if (unlikely(lost)) + usec += lost * (USEC_PER_SEC / HZ); + + sec = xtime.tv_sec; + usec += (xtime.tv_nsec / 1000); + } while (read_seqretry(&xtime_lock, seq)); + + while (usec >= 1000000) { + usec -= 1000000; + sec++; + } + + tv->tv_sec = sec; + tv->tv_usec = usec; +} + +EXPORT_SYMBOL(do_gettimeofday); + +int do_settimeofday(struct timespec *tv) +{ + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irq(&xtime_lock); + /* + * This is revolting. We need to set "xtime" correctly. However, the + * value in this location is the value at the most recent update of + * wall time. Discover what correction gettimeofday() would have + * made, and then undo it! 
+ */ + nsec -= cur_timer->get_offset() * NSEC_PER_USEC; + nsec -= (jiffies - wall_jiffies) * TICK_NSEC; + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + write_sequnlock_irq(&xtime_lock); + clock_was_set(); + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +static int set_rtc_mmss(unsigned long nowtime) +{ + int retval; + + WARN_ON(irqs_disabled()); + + /* gets recalled with irq locally disabled */ + spin_lock_irq(&rtc_lock); + if (efi_enabled) + retval = efi_set_rtc_mmss(nowtime); + else + retval = mach_set_rtc_mmss(nowtime); + spin_unlock_irq(&rtc_lock); + + return retval; +} + + +int timer_ack; + +/* monotonic_clock(): returns # of nanoseconds passed since time_init() + * Note: This function is required to return accurate + * time even in the absence of multiple timer ticks. + */ +unsigned long long monotonic_clock(void) +{ + return cur_timer->monotonic_clock(); +} +EXPORT_SYMBOL(monotonic_clock); + +#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (in_lock_functions(pc)) + return *(unsigned long *)(regs->ebp + 4); + + return pc; +} +EXPORT_SYMBOL(profile_pc); +#endif + +/* + * timer_interrupt() needs to keep up the real-time clock, + * as well as call the "do_timer()" routine every clocktick + */ +static inline void do_timer_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ +#ifdef CONFIG_X86_IO_APIC + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to reset the IRR bit for do_slow_gettimeoffset(). 
+ * This will also deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } +#endif + + do_timer_interrupt_hook(regs); + + + if (MCA_bus) { + /* The PS/2 uses level-triggered interrupts. You can't + turn them off, nor would you want to (any attempt to + enable edge-triggered interrupts usually gets intercepted by a + special hardware circuit). Hence we have to acknowledge + the timer interrupt. Through some incredibly stupid + design idea, the reset for IRQ 0 is done by setting the + high bit of the PPI port B (0x61). Note that some PS/2s, + notably the 55SX, work fine if this is removed. */ + + irq = inb_p( 0x61 ); /* read the current state */ + outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ + } +} + +/* + * This is the same as the above, except we _also_ save the current + * Time Stamp Counter value at the time of the timer interrupt, so that + * we later on can estimate the time of day more exactly. + */ +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + /* + * Here we are in the timer irq handler. We just have irqs locally + * disabled but we don't know if the timer_bh is running on the other + * CPU. We need to avoid to SMP race with it. NOTE: we don' t need + * the irq version of write_lock because as just said we have irq + * locally disabled. 
-arca + */ + write_seqlock(&xtime_lock); + + cur_timer->mark_offset(); + + do_timer_interrupt(irq, NULL, regs); + + write_sequnlock(&xtime_lock); + return IRQ_HANDLED; +} + +/* not static: needed by APM */ +unsigned long get_cmos_time(void) +{ + unsigned long retval; + + spin_lock(&rtc_lock); + + if (efi_enabled) + retval = efi_get_time(); + else + retval = mach_get_cmos_time(); + + spin_unlock(&rtc_lock); + + return retval; +} +static void sync_cmos_clock(unsigned long dummy); + +static struct timer_list sync_cmos_timer = + TIMER_INITIALIZER(sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timeval now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if ((time_status & STA_UNSYNC) != 0) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). 
+ */ + return; + + do_gettimeofday(&now); + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) + fail = set_rtc_mmss(now.tv_sec); + + next.tv_usec = USEC_AFTER - now.tv_usec; + if (next.tv_usec <= 0) + next.tv_usec += USEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_usec >= USEC_PER_SEC) { + next.tv_sec++; + next.tv_usec -= USEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); +} + +void notify_arch_cmos_timer(void) +{ + mod_timer(&sync_cmos_timer, jiffies + 1); +} + +static long clock_cmos_diff, sleep_start; + +static int timer_suspend(struct sys_device *dev, u32 state) +{ + /* + * Estimate time zone so that set_time can update the clock + */ + clock_cmos_diff = -get_cmos_time(); + clock_cmos_diff += get_seconds(); + sleep_start = get_cmos_time(); + return 0; +} + +static int timer_resume(struct sys_device *dev) +{ + unsigned long flags; + unsigned long sec; + unsigned long sleep_length; + +#ifdef CONFIG_HPET_TIMER + if (is_hpet_enabled()) + hpet_reenable(); +#endif + sec = get_cmos_time() + clock_cmos_diff; + sleep_length = (get_cmos_time() - sleep_start) * HZ; + write_seqlock_irqsave(&xtime_lock, flags); + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + jiffies += sleep_length; + wall_jiffies += sleep_length; + return 0; +} + +static struct sysdev_class timer_sysclass = { + .resume = timer_resume, + .suspend = timer_suspend, + set_kset_name("timer"), +}; + + +/* XXX this driverfs stuff should probably go elsewhere later -john */ +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; + +static int time_init_device(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(time_init_device); + +#ifdef CONFIG_HPET_TIMER +extern void 
(*late_time_init)(void); +/* Duplicate of time_init() below, with hpet_enable part added */ +static void __init hpet_time_init(void) +{ + xtime.tv_sec = get_cmos_time(); + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + if (hpet_enable() >= 0) { + printk("Using HPET for base-timer\n"); + } + + cur_timer = select_timer(); + printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); + + time_init_hook(); +} +#endif + +void __init time_init(void) +{ +#ifdef CONFIG_HPET_TIMER + if (is_hpet_capable()) { + /* + * HPET initialization needs to do memory-mapped io. So, let + * us do a late initialization after mem_init(). + */ + late_time_init = hpet_time_init; + return; + } +#endif + xtime.tv_sec = get_cmos_time(); + xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + cur_timer = select_timer(); + printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); + + time_init_hook(); +} diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c new file mode 100644 index 000000000000..244a31b04be7 --- /dev/null +++ b/arch/i386/kernel/time_hpet.c @@ -0,0 +1,458 @@ +/* + * linux/arch/i386/kernel/time_hpet.c + * This code largely copied from arch/x86_64/kernel/time.c + * See that file for credits. 
+ * + * 2003-06-30 Venkatesh Pallipadi - Additional changes for HPET support + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/timer.h> +#include <asm/fixmap.h> +#include <asm/apic.h> + +#include <linux/timex.h> +#include <linux/config.h> + +#include <asm/hpet.h> +#include <linux/hpet.h> + +static unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* hpet clks count per tick */ +unsigned long hpet_address; /* hpet memory map physical address */ + +static int use_hpet; /* can be used for runtime check of hpet */ +static int boot_hpet_disable; /* boottime override for HPET timer */ +static void __iomem * hpet_virt_address; /* hpet kernel virtual address */ + +#define FSEC_TO_USEC (1000000000UL) + +int hpet_readl(unsigned long a) +{ + return readl(hpet_virt_address + a); +} + +static void hpet_writel(unsigned long d, unsigned long a) +{ + writel(d, hpet_virt_address + a); +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * HPET counters dont wrap around on every tick. They just change the + * comparator value and continue. Next tick can be caught by checking + * for a change in the comparator value. Used in apic.c. + */ +static void __init wait_hpet_tick(void) +{ + unsigned int start_cmp_val, end_cmp_val; + + start_cmp_val = hpet_readl(HPET_T0_CMP); + do { + end_cmp_val = hpet_readl(HPET_T0_CMP); + } while (start_cmp_val == end_cmp_val); +} +#endif + +static int hpet_timer_stop_set_go(unsigned long tick) +{ + unsigned int cfg; + + /* + * Stop the timers and reset the main counter. + */ + cfg = hpet_readl(HPET_CFG); + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + + /* + * Set up timer 0, as periodic with first interrupt to happen at + * hpet_tick, and period also hpet_tick. 
+ */ + cfg = hpet_readl(HPET_T0_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | + HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + + /* + * The first write after writing TN_SETVAL to the config register sets + * the counter value, the second write sets the threshold. + */ + hpet_writel(tick, HPET_T0_CMP); + hpet_writel(tick, HPET_T0_CMP); + + /* + * Go! + */ + cfg = hpet_readl(HPET_CFG); + cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + + return 0; +} + +/* + * Check whether HPET was found by ACPI boot parse. If yes setup HPET + * counter 0 for kernel base timer. + */ +int __init hpet_enable(void) +{ + unsigned int id; + unsigned long tick_fsec_low, tick_fsec_high; /* tick in femto sec */ + unsigned long hpet_tick_rem; + + if (boot_hpet_disable) + return -1; + + if (!hpet_address) { + return -1; + } + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + /* + * Read the period, compute tick and quotient. + */ + id = hpet_readl(HPET_ID); + + /* + * We are checking for value '1' or more in number field if + * CONFIG_HPET_EMULATE_RTC is set because we will need an + * additional timer for RTC emulation. + * However, we can do with one timer otherwise using the + * the single HPET timer for system time. 
+ */ + if ( +#ifdef CONFIG_HPET_EMULATE_RTC + !(id & HPET_ID_NUMBER) || +#endif + !(id & HPET_ID_LEGSUP)) + return -1; + + hpet_period = hpet_readl(HPET_PERIOD); + if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) + return -1; + + /* + * 64 bit math + * First changing tick into fsec + * Then 64 bit div to find number of hpet clk per tick + */ + ASM_MUL64_REG(tick_fsec_low, tick_fsec_high, + KERNEL_TICK_USEC, FSEC_TO_USEC); + ASM_DIV64_REG(hpet_tick, hpet_tick_rem, + hpet_period, tick_fsec_low, tick_fsec_high); + + if (hpet_tick_rem > (hpet_period >> 1)) + hpet_tick++; /* rounding the result */ + + if (hpet_timer_stop_set_go(hpet_tick)) + return -1; + + use_hpet = 1; + +#ifdef CONFIG_HPET + { + struct hpet_data hd; + unsigned int ntimer; + + memset(&hd, 0, sizeof (hd)); + + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; + + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. + */ + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet __iomem *hpet; + struct hpet_timer __iomem *timer; + int i; + + hpet = hpet_virt_address; + + for (i = 2, timer = &hpet->hpet_timers[2]; i < ntimer; + timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + } + + hpet_alloc(&hd); + } +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + wait_timer_tick = wait_hpet_tick; +#endif + return 0; +} + +int hpet_reenable(void) +{ + return hpet_timer_stop_set_go(hpet_tick); +} + +int is_hpet_enabled(void) +{ + return use_hpet; +} + +int is_hpet_capable(void) +{ + if (!boot_hpet_disable && hpet_address) + return 1; + return 0; +} + +static int __init hpet_setup(char* str) +{ + if 
(str) { + if (!strncmp("disable", str, 7)) + boot_hpet_disable = 1; + } + return 1; +} + +__setup("hpet=", hpet_setup); + +#ifdef CONFIG_HPET_EMULATE_RTC +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include <linux/mc146818rtc.h> +#include <linux/rtc.h> + +extern irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs); + +#define DEFAULT_RTC_INT_FREQ 64 +#define RTC_NUM_INTS 1 + +static unsigned long UIE_on; +static unsigned long prev_update_sec; + +static unsigned long AIE_on; +static struct rtc_time alarm_time; + +static unsigned long PIE_on; +static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; +static unsigned long PIE_count; + +static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ + +/* + * Timer 1 for RTC, we do not use periodic interrupt feature, + * even if HPET supports periodic interrupts on Timer 1. + * The reason being, to set up a periodic interrupt in HPET, we need to + * stop the main counter. And if we do that everytime someone diables/enables + * RTC, we will have adverse effect on main kernel timer running on Timer 0. + * So, for the time being, simulate the periodic interrupt in software. 
+ * + * hpet_rtc_timer_init() is called for the first time and during subsequent + * interuppts reinit happens through hpet_rtc_timer_reinit(). + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + /* + * Set the counter 1 and enable the interrupts. + */ + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); + cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); + hpet_writel(cnt, HPET_T1_CMP); + local_irq_restore(flags); + + cfg = hpet_readl(HPET_T1_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + return 1; +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int cfg, cnt; + + if (!(PIE_on | AIE_on | UIE_on)) + return; + + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + /* It is more accurate to use the comparator value than current count.*/ + cnt = hpet_readl(HPET_T1_CMP); + cnt += hpet_tick*HZ/hpet_rtc_int_freq; + hpet_writel(cnt, HPET_T1_CMP); + + cfg = hpet_readl(HPET_T1_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + return; +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. 
+ */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + if (bit_mask & RTC_UIE) + UIE_on = 0; + if (bit_mask & RTC_PIE) + PIE_on = 0; + if (bit_mask & RTC_AIE) + AIE_on = 0; + + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + int timer_init_reqd = 0; + + if (!is_hpet_enabled()) + return 0; + + if (!(PIE_on | AIE_on | UIE_on)) + timer_init_reqd = 1; + + if (bit_mask & RTC_UIE) { + UIE_on = 1; + } + if (bit_mask & RTC_PIE) { + PIE_on = 1; + PIE_count = 0; + } + if (bit_mask & RTC_AIE) { + AIE_on = 1; + } + + if (timer_init_reqd) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + alarm_time.tm_hour = hrs; + alarm_time.tm_min = min; + alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + if (!is_hpet_enabled()) + return 0; + + PIE_freq = freq; + PIE_count = 0; + + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + if (!is_hpet_enabled()) + return 0; + + return 1; +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + int call_rtc_interrupt = 0; + + hpet_rtc_timer_reinit(); + + if (UIE_on | AIE_on) { + rtc_get_rtc_time(&curr_time); + } + if (UIE_on) { + if (curr_time.tm_sec != prev_update_sec) { + /* Set update int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag = RTC_UF; + prev_update_sec = curr_time.tm_sec; + } + } + if (PIE_on) { + PIE_count++; + if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { + /* Set periodic int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_PF; + PIE_count = 0; + } + } + if (AIE_on) { + if ((curr_time.tm_sec == alarm_time.tm_sec) && + (curr_time.tm_min == alarm_time.tm_min) && + (curr_time.tm_hour == alarm_time.tm_hour)) { + /* Set alarm int info, call real rtc int 
routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_AF; + } + } + if (call_rtc_interrupt) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id, regs); + } + return IRQ_HANDLED; +} +#endif + diff --git a/arch/i386/kernel/timers/Makefile b/arch/i386/kernel/timers/Makefile new file mode 100644 index 000000000000..8fa12be658dd --- /dev/null +++ b/arch/i386/kernel/timers/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for x86 timers +# + +obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o + +obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o +obj-$(CONFIG_HPET_TIMER) += timer_hpet.o +obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c new file mode 100644 index 000000000000..f7f90005e22e --- /dev/null +++ b/arch/i386/kernel/timers/common.c @@ -0,0 +1,160 @@ +/* + * Common functions used across the timers go here + */ + +#include <linux/init.h> +#include <linux/timex.h> +#include <linux/errno.h> +#include <linux/jiffies.h> + +#include <asm/io.h> +#include <asm/timer.h> +#include <asm/hpet.h> + +#include "mach_timer.h" + +/* ------ Calibrate the TSC ------- + * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). + * Too much 64-bit arithmetic here to do this cleanly in C, and for + * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) + * output busy loop as low as possible. We avoid reading the CTC registers + * directly because of the awkward 8-bit access mechanism of the 82C54 + * device. 
+ */ + +#define CALIBRATE_TIME (5 * 1000020/HZ) + +unsigned long __init calibrate_tsc(void) +{ + mach_prepare_counter(); + + { + unsigned long startlow, starthigh; + unsigned long endlow, endhigh; + unsigned long count; + + rdtsc(startlow,starthigh); + mach_countup(&count); + rdtsc(endlow,endhigh); + + + /* Error: ECTCNEVERSET */ + if (count <= 1) + goto bad_ctc; + + /* 64-bit subtract - gcc just messes up with long longs */ + __asm__("subl %2,%0\n\t" + "sbbl %3,%1" + :"=a" (endlow), "=d" (endhigh) + :"g" (startlow), "g" (starthigh), + "0" (endlow), "1" (endhigh)); + + /* Error: ECPUTOOFAST */ + if (endhigh) + goto bad_ctc; + + /* Error: ECPUTOOSLOW */ + if (endlow <= CALIBRATE_TIME) + goto bad_ctc; + + __asm__("divl %2" + :"=a" (endlow), "=d" (endhigh) + :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); + + return endlow; + } + + /* + * The CTC wasn't reliable: we got a hit on the very first read, + * or the CPU was so fast/slow that the quotient wouldn't fit in + * 32 bits.. + */ +bad_ctc: + return 0; +} + +#ifdef CONFIG_HPET_TIMER +/* ------ Calibrate the TSC using HPET ------- + * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. + * Second output is parameter 1 (when non NULL) + * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). + * calibrate_tsc() calibrates the processor TSC by comparing + * it to the HPET timer of known frequency. 
+ * Too much 64-bit arithmetic here to do this cleanly in C + */ +#define CALIBRATE_CNT_HPET (5 * hpet_tick) +#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) + +unsigned long __init calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) +{ + unsigned long tsc_startlow, tsc_starthigh; + unsigned long tsc_endlow, tsc_endhigh; + unsigned long hpet_start, hpet_end; + unsigned long result, remain; + + hpet_start = hpet_readl(HPET_COUNTER); + rdtsc(tsc_startlow, tsc_starthigh); + do { + hpet_end = hpet_readl(HPET_COUNTER); + } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); + rdtsc(tsc_endlow, tsc_endhigh); + + /* 64-bit subtract - gcc just messes up with long longs */ + __asm__("subl %2,%0\n\t" + "sbbl %3,%1" + :"=a" (tsc_endlow), "=d" (tsc_endhigh) + :"g" (tsc_startlow), "g" (tsc_starthigh), + "0" (tsc_endlow), "1" (tsc_endhigh)); + + /* Error: ECPUTOOFAST */ + if (tsc_endhigh) + goto bad_calibration; + + /* Error: ECPUTOOSLOW */ + if (tsc_endlow <= CALIBRATE_TIME_HPET) + goto bad_calibration; + + ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); + if (remain > (tsc_endlow >> 1)) + result++; /* rounding the result */ + + if (tsc_hpet_quotient_ptr) { + unsigned long tsc_hpet_quotient; + + ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, + CALIBRATE_CNT_HPET); + if (remain > (tsc_endlow >> 1)) + tsc_hpet_quotient++; /* rounding the result */ + *tsc_hpet_quotient_ptr = tsc_hpet_quotient; + } + + return result; +bad_calibration: + /* + * the CPU was so fast/slow that the quotient wouldn't fit in + * 32 bits.. + */ + return 0; +} +#endif + +/* calculate cpu_khz */ +void __init init_cpu_khz(void) +{ + if (cpu_has_tsc) { + unsigned long tsc_quotient = calibrate_tsc(); + if (tsc_quotient) { + /* report CPU clock rate in Hz. + * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = + * clock/second. Our precision is about 100 ppm. 
+ */ + { unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (cpu_khz), "=d" (edx) + :"r" (tsc_quotient), + "0" (eax), "1" (edx)); + printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + } + } + } +} diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c new file mode 100644 index 000000000000..a3d6a288088b --- /dev/null +++ b/arch/i386/kernel/timers/timer.c @@ -0,0 +1,66 @@ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <asm/timer.h> + +#ifdef CONFIG_HPET_TIMER +/* + * HPET memory read is slower than tsc reads, but is more dependable as it + * always runs at constant frequency and reduces complexity due to + * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use + * timer_pit when HPET is active. So, we default to timer_tsc. + */ +#endif +/* list of timers, ordered by preference, NULL terminated */ +static struct init_timer_opts* __initdata timers[] = { +#ifdef CONFIG_X86_CYCLONE_TIMER + &timer_cyclone_init, +#endif +#ifdef CONFIG_HPET_TIMER + &timer_hpet_init, +#endif +#ifdef CONFIG_X86_PM_TIMER + &timer_pmtmr_init, +#endif + &timer_tsc_init, + &timer_pit_init, + NULL, +}; + +static char clock_override[10] __initdata; + +static int __init clock_setup(char* str) +{ + if (str) + strlcpy(clock_override, str, sizeof(clock_override)); + return 1; +} +__setup("clock=", clock_setup); + + +/* The chosen timesource has been found to be bad. + * Fall back to a known good timesource (the PIT) + */ +void clock_fallback(void) +{ + cur_timer = &timer_pit; +} + +/* iterates through the list of timers, returning the first + * one that initializes successfully. 
+ */ +struct timer_opts* __init select_timer(void) +{ + int i = 0; + + /* find most preferred working timer */ + while (timers[i]) { + if (timers[i]->init) + if (timers[i]->init(clock_override) == 0) + return timers[i]->opts; + ++i; + } + + panic("select_timer: Cannot find a suitable timer\n"); + return NULL; +} diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c new file mode 100644 index 000000000000..f6f1206a11bb --- /dev/null +++ b/arch/i386/kernel/timers/timer_cyclone.c @@ -0,0 +1,259 @@ +/* Cyclone-timer: + * This code implements timer_ops for the cyclone counter found + * on IBM x440, x360, and other Summit based systems. + * + * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) + */ + + +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/timex.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/jiffies.h> + +#include <asm/timer.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include "io_ports.h" + +extern spinlock_t i8253_lock; + +/* Number of usecs that the last interrupt was delayed */ +static int delay_at_last_interrupt; + +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 +#define CYCLONE_PMCC_OFFSET 0x51A0 +#define CYCLONE_MPMC_OFFSET 0x51D0 +#define CYCLONE_MPCS_OFFSET 0x51A8 +#define CYCLONE_TIMER_FREQ 100000000 +#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ +int use_cyclone = 0; + +static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ +static u32 last_cyclone_low; +static u32 last_cyclone_high; +static unsigned long long monotonic_base; +static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; + +/* helper macro to atomically read both cyclone counter registers */ +#define read_cyclone_counter(low,high) \ + do{ \ + high = cyclone_timer[1]; low = cyclone_timer[0]; \ + } while (high != cyclone_timer[1]); + + +static void mark_offset_cyclone(void) +{ + unsigned long lost, delay; + unsigned long delta = last_cyclone_low; + int 
count; + unsigned long long this_offset, last_offset; + + write_seqlock(&monotonic_lock); + last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; + + spin_lock(&i8253_lock); + read_cyclone_counter(last_cyclone_low,last_cyclone_high); + + /* read values for delay_at_last_interrupt */ + outb_p(0x00, 0x43); /* latch the count ASAP */ + + count = inb_p(0x40); /* read the latched count */ + count |= inb(0x40) << 8; + + /* + * VIA686a test code... reset the latch if count > max + 1 + * from timer_pit.c - cjb + */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + spin_unlock(&i8253_lock); + + /* lost tick compensation */ + delta = last_cyclone_low - delta; + delta /= (CYCLONE_TIMER_FREQ/1000000); + delta += delay_at_last_interrupt; + lost = delta/(1000000/HZ); + delay = delta%(1000000/HZ); + if (lost >= 2) + jiffies_64 += lost-1; + + /* update the monotonic base value */ + this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; + monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; + write_sequnlock(&monotonic_lock); + + /* calculate delay_at_last_interrupt */ + count = ((LATCH-1) - count) * TICK_SIZE; + delay_at_last_interrupt = (count + LATCH/2) / LATCH; + + + /* catch corner case where tick rollover occured + * between cyclone and pit reads (as noted when + * usec delta is > 90% # of usecs/tick) + */ + if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) + jiffies_64++; +} + +static unsigned long get_offset_cyclone(void) +{ + u32 offset; + + if(!cyclone_timer) + return delay_at_last_interrupt; + + /* Read the cyclone timer */ + offset = cyclone_timer[0]; + + /* .. relative to previous jiffy */ + offset = offset - last_cyclone_low; + + /* convert cyclone ticks to microseconds */ + /* XXX slow, can we speed this up? 
*/ + offset = offset/(CYCLONE_TIMER_FREQ/1000000); + + /* our adjusted time offset in microseconds */ + return delay_at_last_interrupt + offset; +} + +static unsigned long long monotonic_clock_cyclone(void) +{ + u32 now_low, now_high; + unsigned long long last_offset, this_offset, base; + unsigned long long ret; + unsigned seq; + + /* atomically read monotonic base & last_offset */ + do { + seq = read_seqbegin(&monotonic_lock); + last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; + base = monotonic_base; + } while (read_seqretry(&monotonic_lock, seq)); + + + /* Read the cyclone counter */ + read_cyclone_counter(now_low,now_high); + this_offset = ((unsigned long long)now_high<<32)|now_low; + + /* convert to nanoseconds */ + ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); + return ret * (1000000000 / CYCLONE_TIMER_FREQ); +} + +static int __init init_cyclone(char* override) +{ + u32* reg; + u32 base; /* saved cyclone base address */ + u32 pageaddr; /* page that contains cyclone_timer register */ + u32 offset; /* offset from pageaddr to cyclone_timer register */ + int i; + + /* check clock override */ + if (override[0] && strncmp(override,"cyclone",7)) + return -ENODEV; + + /*make sure we're on a summit box*/ + if(!use_cyclone) return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address */ + pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; + offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); + set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); + reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + return -ENODEV; + } + base = *reg; + if(!base){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + return -ENODEV; + } + + /* setup PMCC */ + pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; + offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); + set_fixmap_nocache(FIX_CYCLONE_TIMER, 
pageaddr); + reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + return -ENODEV; + } + reg[0] = 0x00000001; + + /* setup MPCS */ + pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; + offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); + set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); + reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + return -ENODEV; + } + reg[0] = 0x00000001; + + /* map in cyclone_timer */ + pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; + offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); + set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); + cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); + if(!cyclone_timer){ + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + return -ENODEV; + } + + /*quick test to make sure its ticking*/ + for(i=0; i<3; i++){ + u32 old = cyclone_timer[0]; + int stall = 100; + while(stall--) barrier(); + if(cyclone_timer[0] == old){ + printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); + cyclone_timer = 0; + return -ENODEV; + } + } + + init_cpu_khz(); + + /* Everything looks good! 
*/ + return 0; +} + + +static void delay_cyclone(unsigned long loops) +{ + unsigned long bclock, now; + if(!cyclone_timer) + return; + bclock = cyclone_timer[0]; + do { + rep_nop(); + now = cyclone_timer[0]; + } while ((now-bclock) < loops); +} +/************************************************************/ + +/* cyclone timer_opts struct */ +static struct timer_opts timer_cyclone = { + .name = "cyclone", + .mark_offset = mark_offset_cyclone, + .get_offset = get_offset_cyclone, + .monotonic_clock = monotonic_clock_cyclone, + .delay = delay_cyclone, +}; + +struct init_timer_opts __initdata timer_cyclone_init = { + .init = init_cyclone, + .opts = &timer_cyclone, +}; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c new file mode 100644 index 000000000000..713134e71844 --- /dev/null +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -0,0 +1,191 @@ +/* + * This code largely moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/timex.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/jiffies.h> + +#include <asm/timer.h> +#include <asm/io.h> +#include <asm/processor.h> + +#include "io_ports.h" +#include "mach_timer.h" +#include <asm/hpet.h> + +static unsigned long hpet_usec_quotient; /* convert hpet clks to usec */ +static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ +static unsigned long hpet_last; /* hpet counter value at last tick*/ +static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ +static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ +static unsigned long long monotonic_base; +static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; + +/* convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_mhz * 10^6)) + * ns = cycles * 
(10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +static unsigned long long monotonic_clock_hpet(void) +{ + unsigned long long last_offset, this_offset, base; + unsigned seq; + + /* atomically read monotonic base & last_offset */ + do { + seq = read_seqbegin(&monotonic_lock); + last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + base = monotonic_base; + } while (read_seqretry(&monotonic_lock, seq)); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return base + cycles_2_ns(this_offset - last_offset); +} + +static unsigned long get_offset_hpet(void) +{ + register unsigned long eax, edx; + + eax = hpet_readl(HPET_COUNTER); + eax -= hpet_last; /* hpet delta */ + + /* + * Time offset = (hpet delta) * ( usecs per HPET clock ) + * = (hpet delta) * ( usecs per tick / HPET clocks per tick) + * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) + * + * Where, + * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick + * + * Using a mull instead of a divl saves some cycles in critical path. 
+ */ + ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); + + /* our adjusted time offset in microseconds */ + return edx; +} + +static void mark_offset_hpet(void) +{ + unsigned long long this_offset, last_offset; + unsigned long offset; + + write_seqlock(&monotonic_lock); + last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + rdtsc(last_tsc_low, last_tsc_high); + + offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) { + int lost_ticks = (offset - hpet_last) / hpet_tick; + jiffies_64 += lost_ticks; + } + hpet_last = offset; + + /* update the monotonic base value */ + this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + monotonic_base += cycles_2_ns(this_offset - last_offset); + write_sequnlock(&monotonic_lock); +} + +static void delay_hpet(unsigned long loops) +{ + unsigned long hpet_start, hpet_end; + unsigned long eax; + + /* loops is the number of cpu cycles. Convert it to hpet clocks */ + ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); + + hpet_start = hpet_readl(HPET_COUNTER); + do { + rep_nop(); + hpet_end = hpet_readl(HPET_COUNTER); + } while ((hpet_end - hpet_start) < (loops)); +} + +static int __init init_hpet(char* override) +{ + unsigned long result, remain; + + /* check clock override */ + if (override[0] && strncmp(override,"hpet",4)) + return -ENODEV; + + if (!is_hpet_enabled()) + return -ENODEV; + + printk("Using HPET for gettimeofday\n"); + if (cpu_has_tsc) { + unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); + if (tsc_quotient) { + /* report CPU clock rate in Hz. + * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = + * clock/second. Our precision is about 100 ppm. 
+ */ + { unsigned long eax=0, edx=1000; + ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, + eax, edx); + printk("Detected %lu.%03lu MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + } + set_cyc2ns_scale(cpu_khz/1000); + } + } + + /* + * Math to calculate hpet to usec multiplier + * Look for the comments at get_offset_hpet() + */ + ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); + if (remain > (hpet_tick >> 1)) + result++; /* rounding the result */ + hpet_usec_quotient = result; + + return 0; +} + +/************************************************************/ + +/* hpet timer_opts struct */ +static struct timer_opts timer_hpet = { + .name = "hpet", + .mark_offset = mark_offset_hpet, + .get_offset = get_offset_hpet, + .monotonic_clock = monotonic_clock_hpet, + .delay = delay_hpet, +}; + +struct init_timer_opts __initdata timer_hpet_init = { + .init = init_hpet, + .opts = &timer_hpet, +}; diff --git a/arch/i386/kernel/timers/timer_none.c b/arch/i386/kernel/timers/timer_none.c new file mode 100644 index 000000000000..4ea2f414dbbd --- /dev/null +++ b/arch/i386/kernel/timers/timer_none.c @@ -0,0 +1,39 @@ +#include <linux/init.h> +#include <asm/timer.h> + +static void mark_offset_none(void) +{ + /* nothing needed */ +} + +static unsigned long get_offset_none(void) +{ + return 0; /* always reports a zero offset */ +} + +static unsigned long long monotonic_clock_none(void) +{ + return 0; /* always reports zero */ +} + +static void delay_none(unsigned long loops) +{ + int d0; /* operand 0 of the asm below: lives in eax and is preloaded with loops */ + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); /* busy loop: decrement the counter until it goes negative */ +} + +/* none timer_opts struct: every hook is one of the stubs above */ +struct timer_opts timer_none = { + .name = "none", + .mark_offset = mark_offset_none, + .get_offset = get_offset_none, + .monotonic_clock = monotonic_clock_none, + .delay = delay_none, +}; diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c new file mode 100644 index 000000000000..967d5453cd0e --- /dev/null +++ 
b/arch/i386/kernel/timers/timer_pit.c @@ -0,0 +1,206 @@ +/* + * This code largely moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/irq.h> +#include <linux/sysdev.h> +#include <linux/timex.h> +#include <asm/delay.h> +#include <asm/mpspec.h> +#include <asm/timer.h> +#include <asm/smp.h> +#include <asm/io.h> +#include <asm/arch_hooks.h> + +extern spinlock_t i8259A_lock; +extern spinlock_t i8253_lock; +#include "do_timer.h" +#include "io_ports.h" + +static int count_p; /* counter in get_offset_pit() */ + +static int __init init_pit(char* override) +{ + /* check clock override */ + if (override[0] && strncmp(override,"pit",3)) + printk(KERN_ERR "Warning: clock= override failed. Defaulting to PIT\n"); + + count_p = LATCH; + return 0; +} + +static void mark_offset_pit(void) +{ + /* nothing needed */ +} + +static unsigned long long monotonic_clock_pit(void) +{ + return 0; +} + +static void delay_pit(unsigned long loops) +{ + int d0; + __asm__ __volatile__( + "\tjmp 1f\n" + ".align 16\n" + "1:\tjmp 2f\n" + ".align 16\n" + "2:\tdecl %0\n\tjns 2b" + :"=&a" (d0) + :"0" (loops)); +} + + +/* This function must be called with xtime_lock held. + * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs + * + * However, the pc-audio speaker driver changes the divisor so that + * it gets interrupted rather more often - it loads 64 into the + * counter rather than 11932! This has an adverse impact on + * do_gettimeoffset() -- it stops working! What is also not + * good is that the interval that our timer function gets called + * is no longer 10.0002 ms, but 9.9767 ms. To get around this + * would require using a different timing source. Maybe someone + * could use the RTC - I know that this can interrupt at frequencies + * ranging from 8192Hz to 2Hz. 
If I had the energy, I'd somehow fix + * it so that at startup, the timer code in sched.c would select + * using either the RTC or the 8253 timer. The decision would be + * based on whether there was any other device around that needed + * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, + * and then do some jiggery to have a version of do_timer that + * advanced the clock by 1/1024 s. Every time that reached over 1/100 + * of a second, then do all the old code. If the time was kept correct + * then do_gettimeoffset could just return 0 - there is no low order + * divider that can be accessed. + * + * Ideally, you would be able to use the RTC for the speaker driver, + * but it appears that the speaker driver really needs interrupt more + * often than every 120 us or so. + * + * Anyway, this needs more thought.... pjsg (1993-08-28) + * + * If you are really that interested, you should be reading + * comp.protocols.time.ntp! + */ + +static unsigned long get_offset_pit(void) +{ + int count; + unsigned long flags; + static unsigned long jiffies_p = 0; + + /* + * cache volatile jiffies temporarily; we have xtime_lock. + */ + unsigned long jiffies_t; + + spin_lock_irqsave(&i8253_lock, flags); + /* timer count may underflow right here */ + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + + count = inb_p(PIT_CH0); /* read the latched count */ + + /* + * We do this guaranteed double memory access instead of a _p + * postfix in the previous port access. Wheee, hackady hack + */ + jiffies_t = jiffies; + + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + + /* + * avoiding timer inconsistencies (they are rare, but they happen)... + * there are two kinds of problems that must be avoided here: + * 1. the timer counter underflows + * 2. 
hardware problem with the timer, not giving us continuous time, + * the counter does small "jumps" upwards on some Pentium systems, + * (see c't 95/10 page 335 for Neptun bug.) + */ + + if( jiffies_t == jiffies_p ) { + if( count > count_p ) { + /* the nutcase */ + count = do_timer_overflow(count); + } + } else + jiffies_p = jiffies_t; + + count_p = count; + + spin_unlock_irqrestore(&i8253_lock, flags); + + count = ((LATCH-1) - count) * TICK_SIZE; + count = (count + LATCH/2) / LATCH; + + return count; +} + + +/* pit timer_opts struct */ +struct timer_opts timer_pit = { + .name = "pit", + .mark_offset = mark_offset_pit, + .get_offset = get_offset_pit, + .monotonic_clock = monotonic_clock_pit, + .delay = delay_pit, +}; + +struct init_timer_opts __initdata timer_pit_init = { + .init = init_pit, + .opts = &timer_pit, +}; + +void setup_pit_timer(void) +{ + extern spinlock_t i8253_lock; + unsigned long flags; + /* program PIT channel 0 with the LATCH reload value, under i8253_lock with irqs saved */ + spin_lock_irqsave(&i8253_lock, flags); + outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + udelay(10); + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static int timer_resume(struct sys_device *dev) +{ + setup_pit_timer(); /* reprogram the PIT; dev is not used */ + return 0; +} + +static struct sysdev_class timer_sysclass = { + set_kset_name("timer_pit"), + .resume = timer_resume, +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; +/* register the sysdev class and, only if that succeeds, the device; returns the first error or 0 */ +static int __init init_timer_sysfs(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(init_timer_sysfs); + diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c new file mode 100644 index 000000000000..d77f22030fe6 --- /dev/null +++ b/arch/i386/kernel/timers/timer_pm.c @@ -0,0 +1,258 @@ +/* + * (C) Dominik Brodowski <linux@brodo.de> 2003 + * + * Driver to use the Power Management Timer 
(PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + */ + + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/init.h> +#include <asm/types.h> +#include <asm/timer.h> +#include <asm/smp.h> +#include <asm/io.h> +#include <asm/arch_hooks.h> + +#include <linux/timex.h> +#include "mach_timer.h" + +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 +#define PMTMR_EXPECTED_RATE \ + ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) + + +/* The I/O port the PMTMR resides at. + * The location is detected during setup_arch(), + * in arch/i386/acpi/boot.c */ +u32 pmtmr_ioport = 0; + + +/* value of the Power timer at last timer interrupt */ +static u32 offset_tick; +static u32 offset_delay; /* usecs left over (delta % tick length) by mark_offset_pmtmr() */ + +static unsigned long long monotonic_base; +static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; + +#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ + +/* + * Helper to safely read the ACPI PM time source: three back-to-back + * port reads are repeated while the loop condition below rejects them; + * the middle read, masked to ACPI_PM_MASK, is returned. + */ +static inline u32 read_pmtmr(void) +{ + u32 v1=0,v2=0,v3=0; + /* It has been reported that on various broken + * chipsets (ICH4, PIIX4 and PIIX4E) the ACPI PM time + * source is not latched, so you must read it multiple + * times to ensure a safe value is read. + */ + do { + v1 = inl(pmtmr_ioport); + v2 = inl(pmtmr_ioport); + v3 = inl(pmtmr_ioport); + } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) + || (v3 > v1 && v3 < v2)); + + /* mask the output to 24 bits */ + return v2 & ACPI_PM_MASK; +} + + +/* + * Some boards have the PMTMR running way too fast. We check + * the PMTMR rate against PIT channel 2 to catch these cases. 
+ */ +static int verify_pmtmr_rate(void) +{ + u32 value1, value2; + unsigned long count, delta; + + mach_prepare_counter(); + value1 = read_pmtmr(); + mach_countup(&count); + value2 = read_pmtmr(); + delta = (value2 - value1) & ACPI_PM_MASK; + + /* Check that the PMTMR delta is within 5% of what we expect */ + if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || + delta > (PMTMR_EXPECTED_RATE * 21) / 20) { + printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); + return -1; + } + + return 0; +} + + +static int init_pmtmr(char* override) +{ + u32 value1, value2; + unsigned int i; + + if (override[0] && strncmp(override,"pmtmr",5)) + return -ENODEV; + + if (!pmtmr_ioport) + return -ENODEV; + + /* we use the TSC for delay_pmtmr, so make sure it exists */ + if (!cpu_has_tsc) + return -ENODEV; + + /* "verify" this timing source */ + value1 = read_pmtmr(); + for (i = 0; i < 10000; i++) { + value2 = read_pmtmr(); + if (value2 == value1) + continue; + if (value2 > value1) + goto pm_good; + if ((value2 < value1) && ((value2) < 0xFFF)) + goto pm_good; + printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); + return -EINVAL; + } + printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); + return -ENODEV; + +pm_good: + if (verify_pmtmr_rate() != 0) + return -ENODEV; + + init_cpu_khz(); + return 0; +} + +static inline u32 cyc2us(u32 cycles) +{ + /* The Power Management Timer ticks at 3.579545 ticks per microsecond. + * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] + * + * Even with HZ = 100, delta is at maximum 35796 ticks, so it can + * easily be multiplied with 286 (=0x11E) without having to fear + * u32 overflows. 
+ */ + cycles *= 286; + return (cycles >> 10); +} + +/* + * this gets called during each timer interrupt + * - Called while holding the writer xtime_lock + * - records the PM timer in offset_tick, advances monotonic_base and + * adds ticks found to be lost to jiffies_64 + */ +static void mark_offset_pmtmr(void) +{ + u32 lost, delta, last_offset; + static int first_run = 1; + last_offset = offset_tick; + + write_seqlock(&monotonic_lock); + + offset_tick = read_pmtmr(); + + /* calculate tick interval */ + delta = (offset_tick - last_offset) & ACPI_PM_MASK; + + /* convert to usecs */ + delta = cyc2us(delta); + + /* update the monotonic base value */ + monotonic_base += delta * NSEC_PER_USEC; + write_sequnlock(&monotonic_lock); + + /* convert to ticks */ + delta += offset_delay; + lost = delta / (USEC_PER_SEC / HZ); + offset_delay = delta % (USEC_PER_SEC / HZ); + + + /* compensate for lost ticks */ + if (lost >= 2) + jiffies_64 += lost - 1; + + /* don't calculate delay for first run, + or if we've got less than a tick */ + if (first_run || (lost < 1)) { + first_run = 0; + offset_delay = 0; + } +} + +/* monotonic time in ns: monotonic_base plus the masked PM-timer delta since offset_tick, converted through cyc2us() */ +static unsigned long long monotonic_clock_pmtmr(void) +{ + u32 last_offset, this_offset; + unsigned long long base, ret; + unsigned seq; + + + /* atomically read monotonic base & last_offset */ + do { + seq = read_seqbegin(&monotonic_lock); + last_offset = offset_tick; + base = monotonic_base; + } while (read_seqretry(&monotonic_lock, seq)); + + /* Read the pmtmr */ + this_offset = read_pmtmr(); + + /* convert to nanoseconds */ + ret = (this_offset - last_offset) & ACPI_PM_MASK; + ret = base + (cyc2us(ret) * NSEC_PER_USEC); + return ret; +} + +static void delay_pmtmr(unsigned long loops) +{ + unsigned long bclock, now; + /* busy-wait on the TSC, not the PM timer, until it has advanced by loops */ + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + + +/* + * get the offset (in microseconds) from the last call to mark_offset() + * - Called holding a reader xtime_lock + */ +static unsigned long get_offset_pmtmr(void) +{ + u32 now, offset, delta = 0; + + offset = offset_tick; + now = read_pmtmr(); + delta = (now - 
offset)&ACPI_PM_MASK; + + return (unsigned long) offset_delay + cyc2us(delta); +} + + +/* acpi timer_opts struct */ +static struct timer_opts timer_pmtmr = { + .name = "pmtmr", + .mark_offset = mark_offset_pmtmr, + .get_offset = get_offset_pmtmr, + .monotonic_clock = monotonic_clock_pmtmr, + .delay = delay_pmtmr, +}; + +struct init_timer_opts __initdata timer_pmtmr_init = { + .init = init_pmtmr, + .opts = &timer_pmtmr, +}; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); +MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c new file mode 100644 index 000000000000..a685994e5c8e --- /dev/null +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -0,0 +1,560 @@ +/* + * This code largely moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + * + * 2004-06-25 Jesper Juhl + * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 + * failing to inline. 
+ */ + +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/timex.h> +#include <linux/errno.h> +#include <linux/cpufreq.h> +#include <linux/string.h> +#include <linux/jiffies.h> + +#include <asm/timer.h> +#include <asm/io.h> +/* processor.h for distable_tsc flag */ +#include <asm/processor.h> + +#include "io_ports.h" +#include "mach_timer.h" + +#include <asm/hpet.h> + +#ifdef CONFIG_HPET_TIMER +static unsigned long hpet_usec_quotient; +static unsigned long hpet_last; +static struct timer_opts timer_tsc; +#endif + +static inline void cpufreq_delayed_get(void); + +int tsc_disable __initdata = 0; + +extern spinlock_t i8253_lock; + +static int use_tsc; +/* Number of usecs that the last interrupt was delayed */ +static int delay_at_last_interrupt; + +static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ +static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ +static unsigned long long monotonic_base; +static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; + +/* convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_mhz * 10^6)) + * ns = cycles * (10^3 / cpu_mhz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^3 * SC / cpu_mhz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
+ */ +static unsigned long cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_mhz) +{ + cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +static int count2; /* counter for mark_offset_tsc() */ + +/* Cached *multiplier* to convert TSC counts to microseconds. + * (see the equation below). + * Equal to 2^32 * (1 / (clocks per usec) ). + * Initialized in time_init. + */ +static unsigned long fast_gettimeoffset_quotient; + +static unsigned long get_offset_tsc(void) +{ + register unsigned long eax, edx; + + /* Read the Time Stamp Counter */ + + rdtsc(eax,edx); + + /* .. relative to previous jiffy (32 bits is enough) */ + eax -= last_tsc_low; /* tsc_low delta */ + + /* + * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient + * = (tsc_low delta) * (usecs_per_clock) + * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) + * + * Using a mull instead of a divl saves up to 31 clock cycles + * in the critical path. + */ + + __asm__("mull %2" + :"=a" (eax), "=d" (edx) + :"rm" (fast_gettimeoffset_quotient), + "0" (eax)); + + /* our adjusted time offset in microseconds */ + return delay_at_last_interrupt + edx; +} + +static unsigned long long monotonic_clock_tsc(void) +{ + unsigned long long last_offset, this_offset, base; + unsigned seq; + + /* atomically read monotonic base & last_offset */ + do { + seq = read_seqbegin(&monotonic_lock); + last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + base = monotonic_base; + } while (read_seqretry(&monotonic_lock, seq)); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return base + cycles_2_ns(this_offset - last_offset); +} + +/* + * Scheduler clock - returns current time in nanosec units. 
+ */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * In the NUMA case we dont use the TSC as they are not + * synchronized across all CPUs. + */ +#ifndef CONFIG_NUMA + if (!use_tsc) +#endif + /* no locking but a rare wrong value is not a big deal */ + return jiffies_64 * (1000000000 / HZ); + + /* Read the Time Stamp Counter */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +#ifdef CONFIG_HPET_TIMER +static void mark_offset_tsc_hpet(void) +{ + unsigned long long this_offset, last_offset; + unsigned long offset, temp, hpet_current; + + write_seqlock(&monotonic_lock); + last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + /* + * It is important that these two operations happen almost at + * the same time. We do the RDTSC stuff first, since it's + * faster. To avoid any inconsistencies, we need interrupts + * disabled locally. + */ + /* + * Interrupts are just disabled locally since the timer irq + * has the SA_INTERRUPT flag set. 
-arca + */ + /* read Pentium cycle counter */ + + hpet_current = hpet_readl(HPET_COUNTER); + rdtsc(last_tsc_low, last_tsc_high); + + /* lost tick compensation */ + offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) { + int lost_ticks = (offset - hpet_last) / hpet_tick; + jiffies_64 += lost_ticks; + } + hpet_last = hpet_current; + + /* update the monotonic base value */ + this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + monotonic_base += cycles_2_ns(this_offset - last_offset); + write_sequnlock(&monotonic_lock); + + /* calculate delay_at_last_interrupt */ + /* + * Time offset = (hpet delta) * ( usecs per HPET clock ) + * = (hpet delta) * ( usecs per tick / HPET clocks per tick) + * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) + * Where, + * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick + */ + delay_at_last_interrupt = hpet_current - offset; + ASM_MUL64_REG(temp, delay_at_last_interrupt, + hpet_usec_quotient, delay_at_last_interrupt); +} +#endif + + +#ifdef CONFIG_CPU_FREQ +#include <linux/workqueue.h> + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static inline void cpufreq_delayed_get(void) +{ + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + printk(KERN_DEBUG "Losing some ticks... 
checking if CPU frequency changed.\n"); + schedule_work(&cpufreq_delayed_get_work); + } +} + +/* If the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. + */ + +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; + +#ifndef CONFIG_SMP +static unsigned long fast_gettimeoffset_ref = 0; +static unsigned long cpu_khz_ref = 0; +#endif + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE) + write_seqlock_irq(&xtime_lock); + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; +#ifndef CONFIG_SMP + fast_gettimeoffset_ref = fast_gettimeoffset_quotient; + cpu_khz_ref = cpu_khz; +#endif + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); +#ifndef CONFIG_SMP + if (cpu_khz) + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + if (use_tsc) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); + set_cyc2ns_scale(cpu_khz/1000); + } + } +#endif + } + + if (val != CPUFREQ_RESUMECHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + + +static int __init cpufreq_tsc(void) +{ + int ret; + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + return ret; +} +core_initcall(cpufreq_tsc); + +#else /* CONFIG_CPU_FREQ */ +static inline 
void cpufreq_delayed_get(void) { return; } +#endif +/* Timer-tick hook of the TSC timesource. Samples the TSC (rdtsc) and then the PIT channel 0 count under i8253_lock, scales the TSC delta since the previous tick with fast_gettimeoffset_quotient, and uses the result to account for lost ticks in jiffies_64. monotonic_lock is write-held from entry until monotonic_base has been advanced. */ +static void mark_offset_tsc(void) +{ + unsigned long lost,delay; + unsigned long delta = last_tsc_low; + int count; + int countmp; + static int count1 = 0; + unsigned long long this_offset, last_offset; + static int lost_count = 0; + + write_seqlock(&monotonic_lock); + last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + /* + * It is important that these two operations happen almost at + * the same time. We do the RDTSC stuff first, since it's + * faster. To avoid any inconsistencies, we need interrupts + * disabled locally. + */ + + /* + * Interrupts are just disabled locally since the timer irq + * has the SA_INTERRUPT flag set. -arca + */ + + /* read Pentium cycle counter */ + + rdtsc(last_tsc_low, last_tsc_high); + + spin_lock(&i8253_lock); + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb(PIT_CH0) << 8; + + /* + * VIA686a test code... reset the latch if count > max + 1 + * from timer_pit.c - cjb + */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + + spin_unlock(&i8253_lock); + + if (pit_latch_buggy) { + /* get center value of the last 3 latched counts (count2, count1, count) */ + if ((count2 >= count && count >= count1) + || (count1 >= count && count >= count2)) { + count2 = count1; count1 = count; + } else if ((count1 >= count2 && count2 >= count) + || (count >= count2 && count2 >= count1)) { + countmp = count;count = count2; + count2 = count1;count1 = countmp; + } else { /* NOTE(review): count = count1 re-reads the value just stored from count, so count is left unchanged in this branch -- confirm that is intended */ + count2 = count1; count1 = count; count = count1; + } + } + + /* lost tick compensation */ + delta = last_tsc_low - delta; + { + register unsigned long eax, edx; + eax = delta; + __asm__("mull %2" + :"=a" (eax), "=d" (edx) + :"rm" (fast_gettimeoffset_quotient), + "0" (eax)); + /* keep the high 32 bits of the 64-bit product */ delta = edx; + } + delta += delay_at_last_interrupt; + /* split into whole ticks (lost) and the remainder within a tick (delay), both in units of 1000000/HZ */ lost = delta/(1000000/HZ); + delay = delta%(1000000/HZ); +
if (lost >= 2) { + jiffies_64 += lost-1; + + /* sanity check to ensure we're not always losing ticks */ + if (lost_count++ > 100) { + printk(KERN_WARNING "Losing too many ticks!\n"); + printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); + printk(KERN_WARNING "Possible reasons for this are:\n"); + printk(KERN_WARNING " You're running with Speedstep,\n"); + printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); + printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); + printk(KERN_WARNING "Falling back to a sane timesource now.\n"); + + clock_fallback(); + } + /* ... but give the TSC a fair chance */ + if (lost_count > 25) + cpufreq_delayed_get(); + } else + lost_count = 0; + /* update the monotonic base value */ + this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + monotonic_base += cycles_2_ns(this_offset - last_offset); + write_sequnlock(&monotonic_lock); + + /* calculate delay_at_last_interrupt: PIT counts elapsed since the reload, scaled by TICK_SIZE / LATCH with rounding */ + count = ((LATCH-1) - count) * TICK_SIZE; + delay_at_last_interrupt = (count + LATCH/2) / LATCH; + + /* catch corner case where tick rollover occurred + * between tsc and pit reads (as noted when + * usec delta is > 90% # of usecs/tick) + */ + if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) + jiffies_64++; +} +/* Probe and set up the TSC timesource. A non-empty clock override that does not start with tsc is refused with -ENODEV, unless HPET is enabled, in which case a warning is printed and setup continues. Returns 0 once a non-zero calibration quotient has been installed, -ENODEV otherwise. */ +static int __init init_tsc(char* override) +{ + + /* check clock override */ + if (override[0] && strncmp(override,"tsc",3)) { +#ifdef CONFIG_HPET_TIMER + if (is_hpet_enabled()) { + printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); + } else +#endif + { + return -ENODEV; + } + } + + /* + * If we have APM enabled or the CPU clock speed is variable + * (CPU stops clock on HLT or slows clock to save power) + * then the TSC timestamps may diverge by up to 1 jiffy from + * 'real time' but nothing will break. + * The most frequent case is that the CPU is "woken" from a halt + * state by the timer interrupt itself, so we get 0 error.
In the + * rare cases where a driver would "wake" the CPU and request a + * timestamp, the maximum error is < 1 jiffy. But timestamps are + * still perfectly ordered. + * Note that the TSC counter will be reset if APM suspends + * to disk; this won't break the kernel, though, 'cuz we're + * smart. See arch/i386/kernel/apm.c. + */ + /* + * Firstly we have to do a CPU check for chips with + * a potentially buggy TSC. At this point we haven't run + * the ident/bugs checks so we must run this hook as it + * may turn off the TSC flag. + * + * NOTE: this doesn't yet handle SMP 486 machines where only + * some CPU's have a TSC. Thats never worked and nobody has + * moaned if you have the only one in the world - you fix it! + */ + + count2 = LATCH; /* initialize counter for mark_offset_tsc(): seeds the PIT count history used when pit_latch_buggy is set */ + + if (cpu_has_tsc) { + unsigned long tsc_quotient; +#ifdef CONFIG_HPET_TIMER + if (is_hpet_enabled()){ + unsigned long result, remain; + printk("Using TSC for gettimeofday\n"); + tsc_quotient = calibrate_tsc_hpet(NULL); + /* with HPET the per-tick hook is switched to the HPET-aware variant */ timer_tsc.mark_offset = &mark_offset_tsc_hpet; + /* + * Math to calculate hpet to usec multiplier + * Look for the comments at get_offset_tsc_hpet() + */ + ASM_DIV64_REG(result, remain, hpet_tick, + 0, KERNEL_TICK_USEC); + if (remain > (hpet_tick >> 1)) + result++; /* rounding the result: bump it when the remainder exceeds hpet_tick / 2 */ + + hpet_usec_quotient = result; + } else +#endif + { + tsc_quotient = calibrate_tsc(); + } + + /* a zero quotient falls through to the -ENODEV return at the end of the function */ if (tsc_quotient) { + fast_gettimeoffset_quotient = tsc_quotient; + use_tsc = 1; + /* + * We could be more selective here I suspect + * and just enable this for the next intel chips ? + */ + /* report CPU clock rate in Hz. + * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = + * clock/second. Our precision is about 100 ppm.
+ */ + { unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (cpu_khz), "=d" (edx) + :"r" (tsc_quotient), + "0" (eax), "1" (edx)); + printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + } + set_cyc2ns_scale(cpu_khz/1000); + return 0; + } + } + return -ENODEV; +} + +#ifndef CONFIG_X86_TSC +/* disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c. Handler for the notsc boot option: sets tsc_disable and returns 1. */ +static int __init tsc_setup(char *str) +{ + tsc_disable = 1; + return 1; +} +#else /* CONFIG_X86_TSC is set: notsc only prints a warning and returns 1 */ +static int __init tsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC.\n"); + return 1; +} +#endif +__setup("notsc", tsc_setup); + + + +/************************************************************/ + +/* tsc timer_opts struct: operations table of the tsc timesource; init_tsc() may redirect .mark_offset to mark_offset_tsc_hpet when HPET is enabled */ +static struct timer_opts timer_tsc = { + .name = "tsc", + .mark_offset = mark_offset_tsc, + .get_offset = get_offset_tsc, + .monotonic_clock = monotonic_clock_tsc, + .delay = delay_tsc, +}; +/* Init descriptor pairing init_tsc() with the timer_tsc operations table. */ +struct init_timer_opts __initdata timer_tsc_init = { + .init = init_tsc, + .opts = &timer_tsc, +}; diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S new file mode 100644 index 000000000000..fcce0e61b0e7 --- /dev/null +++ b/arch/i386/kernel/trampoline.S @@ -0,0 +1,80 @@ +/* + * + * Trampoline.S Derived from Setup.S by Linus Torvalds + * + * 4 Jan 1997 Michael Chastain: changed to gnu as. + * + * This is only used for booting secondary CPUs in SMP machine + * + * Entry: CS:IP point to the start of our code, we are + * in real mode with no stack, but the rest of the + * trampoline page to make our stack and everything else + * is a mystery. + * + * In fact we don't actually need a stack so we don't + * set one up. + * + * We jump into the boot/compressed/head.S code. So you'd + * better be running a compressed kernel image or you + * won't get very far. + * + * On entry to trampoline_data, the processor is in real mode + * with 16-bit addressing and 16-bit data.
CS has some value + * and IP is zero. Thus, data addresses need to be absolute + * (no relocation) and are taken with regard to r_base. + * + * If you work on this file, check the object module with + * objdump --reloc to make sure there are no relocation + * entries except for: + * + * TYPE VALUE + * R_386_32 startup_32_smp + * R_386_32 boot_gdt_table + */ + +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page.h> + +.data + +.code16 + +ENTRY(trampoline_data) +r_base = . + wbinvd # Needed for NUMA-Q should be harmless for others + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + + cli # We should be safe anyway + + movl $0xA5A5A5A5, trampoline_data - r_base + # write marker so the master knows we're running + + /* GDT tables in non default location kernel can be beyond 16MB and + * lgdt will not be able to load the address as in real mode default + * operand size is 16bit. Use lgdtl instead to force operand size + * to 32 bit. + */ + + lidtl boot_idt - r_base # load idt with 0, 0 + lgdtl boot_gdt - r_base # load gdt with whatever is appropriate + + xor %ax, %ax + inc %ax # protected mode (PE) bit + lmsw %ax # into protected mode + # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S + ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET) + + # These need to be in the same 64K segment as the above; + # hence we don't use the boot_gdt_descr defined in head.S +boot_gdt: + .word __BOOT_DS + 7 # gdt limit + .long boot_gdt_table-__PAGE_OFFSET # gdt base + +boot_idt: + .word 0 # idt limit = 0 + .long 0 # idt base = 0L + +.globl trampoline_end +trampoline_end: diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c new file mode 100644 index 000000000000..6c0e383915b6 --- /dev/null +++ b/arch/i386/kernel/traps.c @@ -0,0 +1,1084 @@ +/* + * linux/arch/i386/traps.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + *
'Traps.c' handles hardware traps and faults after we have saved some + * state in 'asm.s'. + */ +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/highmem.h> +#include <linux/kallsyms.h> +#include <linux/ptrace.h> +#include <linux/utsname.h> +#include <linux/kprobes.h> + +#ifdef CONFIG_EISA +#include <linux/ioport.h> +#include <linux/eisa.h> +#endif + +#ifdef CONFIG_MCA +#include <linux/mca.h> +#endif + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/atomic.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/i387.h> +#include <asm/nmi.h> + +#include <asm/smp.h> +#include <asm/arch_hooks.h> +#include <asm/kdebug.h> + +#include <linux/irq.h> +#include <linux/module.h> + +#include "mach_traps.h" + +asmlinkage int system_call(void); +/* Default LDT: five zero-filled descriptors. */ +struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 } }; + +/* Do we ignore FPU interrupts ? Set to 1 by the coprocessor error trap handlers later in this file. */ +char ignore_fpu_irq = 0; + +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this.
+ */ +struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; + +asmlinkage void divide_error(void); +asmlinkage void debug(void); +asmlinkage void nmi(void); +asmlinkage void int3(void); +asmlinkage void overflow(void); +asmlinkage void bounds(void); +asmlinkage void invalid_op(void); +asmlinkage void device_not_available(void); +asmlinkage void coprocessor_segment_overrun(void); +asmlinkage void invalid_TSS(void); +asmlinkage void segment_not_present(void); +asmlinkage void stack_segment(void); +asmlinkage void general_protection(void); +asmlinkage void page_fault(void); +asmlinkage void coprocessor_error(void); +asmlinkage void simd_coprocessor_error(void); +asmlinkage void alignment_check(void); +asmlinkage void spurious_interrupt_bug(void); +asmlinkage void machine_check(void); +/* upper bound on the number of stack words show_stack() dumps before the call trace */ +static int kstack_depth_to_print = 24; +struct notifier_block *i386die_chain; +static DEFINE_SPINLOCK(die_notifier_lock); +/* Add nb to i386die_chain while holding die_notifier_lock with interrupts saved/disabled; returns the result of notifier_chain_register(). */ +int register_die_notifier(struct notifier_block *nb) +{ + int err = 0; + unsigned long flags; + spin_lock_irqsave(&die_notifier_lock, flags); + err = notifier_chain_register(&i386die_chain, nb); + spin_unlock_irqrestore(&die_notifier_lock, flags); + return err; +} +/* Non-zero when p lies strictly above tinfo and below tinfo + THREAD_SIZE - 3. */ +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +{ + return p > (void *)tinfo && + p < (void *)tinfo + THREAD_SIZE - 3; +} +/* Print the return addresses found in one stack context. With CONFIG_FRAME_POINTER the saved-ebp chain is followed (return address read at ebp + 4); otherwise every stack word accepted by __kernel_text_address() is printed. Returns ebp as left by the walk (unchanged in the non-frame-pointer build). */ +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long ebp) +{ + unsigned long addr; + +#ifdef CONFIG_FRAME_POINTER + while (valid_stack_ptr(tinfo, (void *)ebp)) { + addr = *(unsigned long *)(ebp + 4); + printk(" [<%08lx>] ", addr); + print_symbol("%s", addr); + printk("\n"); + ebp = *(unsigned long *)ebp; + } +#else + while (valid_stack_ptr(tinfo, stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) { + printk(" [<%08lx>]", addr); + print_symbol(" %s", addr); + printk("\n"); + } + } +#endif + return ebp; +} + +void show_trace(struct task_struct
*task, unsigned long * stack) +{ + unsigned long ebp; + + if (!task) + task = current; + + if (task == current) { + /* Grab ebp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } else { + /* ebp is the last reg pushed by switch_to */ + ebp = *(unsigned long *) task->thread.esp; + } + + /* walk every stack context, following thread_info->previous_esp until it is NULL */ + while (1) { + struct thread_info *context; + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + ebp = print_context_stack(context, stack, ebp); + stack = (unsigned long*)context->previous_esp; + if (!stack) + break; + printk(" =======================\n"); + } +} + +/* + * Dump up to kstack_depth_to_print stack words starting at esp, then + * print the call trace. A NULL esp selects task->thread.esp, or the + * address of the local esp variable when task is NULL as well. + */ +void show_stack(struct task_struct *task, unsigned long *esp) +{ + unsigned long *stack; + int i; + + if (esp == NULL) { + if (task) + esp = (unsigned long*)task->thread.esp; + else + esp = (unsigned long *)&esp; + } + + stack = esp; + for(i = 0; i < kstack_depth_to_print; i++) { + if (kstack_end(stack)) + break; + if (i && ((i % 8) == 0)) + printk("\n "); + printk("%08lx ", *stack++); + } + printk("\nCall Trace:\n"); + show_trace(task, esp); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long stack; + + show_trace(current, &stack); +} + +EXPORT_SYMBOL(dump_stack); + +/* + * Print the register state saved in regs. For a kernel-mode frame + * (low two bits of xcs clear) esp is taken as the address of the + * regs->esp slot and ss as __KERNEL_DS; for a user-mode frame the + * saved esp/xss values are used. + */ +void show_registers(struct pt_regs *regs) +{ + int i; + int in_kernel = 1; + unsigned long esp; + unsigned short ss; + + esp = (unsigned long) (&regs->esp); + ss = __KERNEL_DS; + if (regs->xcs & 3) { + in_kernel = 0; + esp = regs->esp; + ss = regs->xss & 0xffff; + } + print_modules(); + printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx" + " (%s) \n", + smp_processor_id(), 0xffff & regs->xcs, regs->eip, + print_tainted(), regs->eflags, system_utsname.release); + print_symbol("EIP is at %s\n", regs->eip); + printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->eax, regs->ebx, regs->ecx, regs->edx); + printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printk("ds: %04x es: %04x ss: %04x\n", +
regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk("Process %s (pid: %d, threadinfo=%p task=%p)", + current->comm, current->pid, current_thread_info(), current); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (in_kernel) { + u8 *eip; + + printk("\nStack: "); + show_stack(NULL, (unsigned long*)esp); + + printk("Code: "); + + /* dump 64 code bytes starting 43 bytes before the faulting eip; the byte at eip is bracketed */ eip = (u8 *)regs->eip - 43; + for (i = 0; i < 64; i++, eip++) { + unsigned char c; + + if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { + printk(" Bad EIP value."); + break; + } + if (eip == (u8 *)regs->eip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} +/* If the faulting kernel eip points at the opcode word 0x0b0f (named ud2 here), print the BUG banner using the line number read at eip + 2 and the file name pointer read at eip + 4. Does nothing for user-mode frames or other opcodes. */ +static void handle_BUG(struct pt_regs *regs) +{ + unsigned short ud2; + unsigned short line; + char *file; + char c; + unsigned long eip; + + if (regs->xcs & 3) + goto no_bug; /* Not in kernel */ + + eip = regs->eip; + + if (eip < PAGE_OFFSET) + goto no_bug; + if (__get_user(ud2, (unsigned short *)eip)) + goto no_bug; + if (ud2 != 0x0b0f) + goto no_bug; + if (__get_user(line, (unsigned short *)(eip + 2))) + goto bug; + if (__get_user(file, (char **)(eip + 4)) || + (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + file = "<bad filename>"; + + printk("------------[ cut here ]------------\n"); + printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line); + +no_bug: + return; + + /* Here we know it was a BUG but file-n-line is unavailable */ +bug: + printk("Kernel BUG\n"); +} +/* Oops path. Serialises on die.lock unless the calling CPU already owns it, prints the oops report while the nesting depth stays below 3, then panics (in interrupt context or with panic_on_oops) or ends the current task with do_exit(SIGSEGV). */ +void die(const char * str, struct pt_regs * regs, long err) +{ + static struct { + spinlock_t lock; + u32 lock_owner; + int lock_owner_depth; + } die = { + .lock = SPIN_LOCK_UNLOCKED, + .lock_owner = -1, + .lock_owner_depth = 0 + }; + static int die_counter; + + if (die.lock_owner != _smp_processor_id()) { + console_verbose(); + spin_lock_irq(&die.lock); + die.lock_owner = smp_processor_id(); + die.lock_owner_depth = 0; + bust_spinlocks(1); + } + + if (++die.lock_owner_depth < 3) { + int nl = 0; + handle_BUG(regs); +
printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); + nl = 1; +#endif +#ifdef CONFIG_SMP + printk("SMP "); + nl = 1; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); + nl = 1; +#endif + if (nl) + printk("\n"); + notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); + show_registers(regs); + } else + printk(KERN_ERR "Recursive die() failure, output suppressed\n"); + + bust_spinlocks(0); + die.lock_owner = -1; + spin_unlock_irq(&die.lock); + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) { + printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); + ssleep(5); + panic("Fatal exception"); + } + do_exit(SIGSEGV); +} +/* Call die() only for faults taken in kernel mode: VM_MASK clear in eflags and the low two bits of xcs zero. */ +static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) +{ + if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) + die(str, regs, err); +} +/* Common trap dispatch. vm86-mode faults go to handle_vm86_trap() when vm86 is non-zero (and fall back to a signal if it returns non-zero); kernel-mode faults try fixup_exception() and die() on failure; everything else records trap_no/error_code in the current task and sends signr, with info when one is supplied. */ +static void do_trap(int trapnr, int signr, char *str, int vm86, + struct pt_regs * regs, long error_code, siginfo_t *info) +{ + if (regs->eflags & VM_MASK) { + if (vm86) + goto vm86_trap; + goto trap_signal; + } + + if (!(regs->xcs & 3)) + goto kernel_trap; + + trap_signal: { + struct task_struct *tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + } + + kernel_trap: { + if (!fixup_exception(regs)) + die(str, regs, error_code); + return; + } + + vm86_trap: { + int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); + if (ret) goto trap_signal; + return; + } +} +/* Generators for the do_<name>() trap handlers: each one lets notify_die(DIE_TRAP, ...) veto the trap with NOTIFY_STOP and otherwise hands off to do_trap(). DO_ERROR passes vm86 = 0 and no siginfo. */ +#define DO_ERROR(trapnr, signr, str, name) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ +} + +#define DO_VM86_ERROR(trapnr, signr, str, name) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ +} + +#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +fastcall void do_##name(struct pt_regs * regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ +} + +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +#ifndef CONFIG_KPROBES +DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) +#endif +DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) +DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + +fastcall void do_general_protection(struct pt_regs * regs, long error_code) +{ + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct thread_struct *thread = ¤t->thread; + + /* + * Perform 
the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS + * and we set the offset field correctly. Then we let the CPU to + * restart the faulting instruction. + */ + if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + tss->io_bitmap_max = thread->io_bitmap_max; + tss->io_bitmap_base = IO_BITMAP_OFFSET; + put_cpu(); + return; + } + put_cpu(); + + if (regs->eflags & VM_MASK) + goto gp_in_vm86; + + if (!(regs->xcs & 3)) + goto gp_in_kernel; + + /* remaining case (not vm86, CS privilege bits non-zero): record trap 13 and send SIGSEGV */ current->thread.error_code = error_code; + current->thread.trap_no = 13; + force_sig(SIGSEGV, current); + return; + +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; + +gp_in_kernel: + /* try an exception-table fixup first; die() only if none applies and no notifier stops it */ if (!fixup_exception(regs)) { + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); + } +} +/* NMI reason handler for memory parity errors: logs the event and calls clear_mem_error(reason). regs is unused. */ +static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +{ + printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); + printk("You probably have a hardware problem with your RAM chips\n"); + + /* Clear and disable the memory parity error line.
*/ + clear_mem_error(reason); +} + +static void io_check_error(unsigned char reason, struct pt_regs * regs) +{ + unsigned long i; + + printk("NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & 0xf) | 8; + outb(reason, 0x61); + i = 2000; + while (--i) udelay(1000); + reason &= ~8; + outb(reason, 0x61); +} + +static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +{ +#ifdef CONFIG_MCA + /* Might actually be able to figure out what the guilty party + * is. */ + if( MCA_bus ) { + mca_handle_nmi(); + return; + } +#endif + printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + printk("Dazed and confused, but trying to continue\n"); + printk("Do you have a strange power saving mode enabled?\n"); +} + +static DEFINE_SPINLOCK(nmi_print_lock); + +void die_nmi (struct pt_regs *regs, const char *msg) +{ + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + bust_spinlocks(1); + printk(msg); + printk(" on CPU%d, eip %08lx, registers:\n", + smp_processor_id(), regs->eip); + show_registers(regs); + printk("console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + do_exit(SIGSEGV); +} + +static void default_do_nmi(struct pt_regs * regs) +{ + unsigned char reason = 0; + + /* Only the BSP gets external NMIs from the system. */ + if (!smp_processor_id()) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. 
+ */ + if (nmi_watchdog) { + nmi_watchdog_tick(regs); + return; + } +#endif + unknown_nmi_error(reason, regs); + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) + return; + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered. + */ + reassert_nmi(); +} +/* Placeholder NMI callback: returns 0, which makes do_nmi() fall through to default_do_nmi(). */ +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; +/* NMI entry point: bumps the per-CPU NMI count, offers the event to the installed nmi_callback and runs default_do_nmi() when the callback returns 0. error_code is unused. */ +fastcall void do_nmi(struct pt_regs * regs, long error_code) +{ + int cpu; + + nmi_enter(); + + cpu = smp_processor_id(); + ++nmi_count(cpu); + + if (!nmi_callback(regs, cpu)) + default_do_nmi(regs); + + nmi_exit(); +} +/* Install callback as the NMI callback consulted first by do_nmi(). */ +void set_nmi_callback(nmi_callback_t callback) +{ + nmi_callback = callback; +} +/* Restore the placeholder NMI callback. */ +void unset_nmi_callback(void) +{ + nmi_callback = dummy_nmi_callback; +} + +#ifdef CONFIG_KPROBES +fastcall int do_int3(struct pt_regs *regs, long error_code) +{ + /* returns 1 when a die notifier consumed the breakpoint, 0 after normal trap delivery */ if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return 1; + /* This is an interrupt gate, because kprobes wants interrupts + disabled. Normal trap handlers don't. */ + restore_interrupts(regs); + do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); + return 0; +} +#endif + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks.
+ * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + */ +fastcall void do_debug(struct pt_regs * regs, long error_code) +{ + unsigned int condition; + struct task_struct *tsk = current; + + /* read the debug status register (DR6) into condition before anything else can change it */ __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; + /* It's safe to allow irq's after DR6 has been saved */ + if (regs->eflags & X86_EFLAGS_IF) + local_irq_enable(); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg[7]) + goto clear_dr7; + } + + if (regs->eflags & VM_MASK) + goto debug_vm86; + + /* Save debug status register where ptrace can see it */ + tsk->thread.debugreg[6] = condition; + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ + if (condition & DR_STEP) { + /* + * We already checked v86 mode above, so we can + * check for kernel mode by just checking the CPL + * of CS. + */ + if ((regs->xcs & 3) == 0) + goto clear_TF_reenable; + } + + /* Ok, finally something we can handle */ + send_sigtrap(tsk, regs, error_code); + + /* Disable additional traps. They'll be re-enabled when + * the signal is delivered.
+ */ +clear_dr7: + __asm__("movl %0,%%db7" + : /* no output */ + : "r" (0)); + return; + +debug_vm86: + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + return; + +clear_TF_reenable: + /* kernel-mode single-step: flag the task with TIF_SINGLESTEP and drop TF from the saved eflags */ set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->eflags &= ~TF_MASK; + return; +} + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + * + * Saves the FPU state of the current task, records trap 16 and sends + * SIGFPE with si_addr = eip and an si_code derived from the FPU + * control and status words. + */ +void math_error(void __user *eip) +{ + struct task_struct * task; + siginfo_t info; + unsigned short cwd, swd; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 16; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = eip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + case 0x041: /* Stack Fault */ + case 0x241: /* Stack Fault | Direction */ + info.si_code = FPE_FLTINV; + /* Should we clear the SF or let user space do it ????
*/ + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} +/* Coprocessor error trap handler: sets ignore_fpu_irq and reports the fault at regs->eip through math_error(). error_code is unused. */ +fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) +{ + ignore_fpu_irq = 1; + math_error((void __user *)regs->eip); +} +/* SIMD counterpart of math_error(): saves the FPU state of the current task, records trap 19 and sends SIGFPE with an si_code derived from the unmasked exception bits of MXCSR. */ +static void simd_math_error(void __user *eip) +{ + struct task_struct * task; + siginfo_t info; + unsigned short mxcsr; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 19; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = eip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +fastcall void do_simd_coprocessor_error(struct pt_regs * regs, + long error_code) +{ + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors.
*/ + ignore_fpu_irq = 1; + simd_math_error((void __user *)regs->eip); + } else { + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->eflags & VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, + error_code); + return; + } + die_if_kernel("cache flush denied", regs, error_code); + current->thread.trap_no = 19; + current->thread.error_code = error_code; + force_sig(SIGSEGV, current); + } +} + +/* Handler for the spurious interrupt trap: intentionally does nothing (the warning is compiled out). */ +fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, + long error_code) +{ +#if 0 + /* No need to warn about this any longer. */ + printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif +} + +/* + * Prepare the per-CPU 16-bit stack for a return through it: shifts the + * pt_regs at stk + 8 down to stk, copies the 20-byte iret frame that + * starts at regs->eip onto the 16-bit stack, and fills in the two + * switch pointers (switch16 after pt_regs on the 32-bit stack, + * switch32 in the top 8 bytes of the 16-bit stack). + */ +fastcall void setup_x86_bogus_stack(unsigned char * stk) +{ + unsigned long *switch16_ptr, *switch32_ptr; + struct pt_regs *regs; + unsigned long stack_top, stack_bot; + unsigned short iret_frame16_off; + int cpu = smp_processor_id(); + /* reserve the space on 32bit stack for the magic switch16 pointer */ + memmove(stk, stk + 8, sizeof(struct pt_regs)); + switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); + regs = (struct pt_regs *)stk; + /* now the switch32 on 16bit stack */ + stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; + switch32_ptr = (unsigned long *)(stack_top - 8); + iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; + /* copy iret frame on 16bit stack */ + memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); + /* fill in the switch pointers */ + switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; + switch16_ptr[1] = __ESPFIX_SS; + switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + + 8 - CPU_16BIT_STACK_SIZE; + switch32_ptr[1] = __KERNEL_DS; +} + +/* + * Copy what lies between offset sp and the switch32 slot of the + * per-CPU 16-bit stack back onto the 32-bit stack recorded in + * switch32_ptr[0]; returns the 32-bit address the data was copied to. + */ +fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) +{ + unsigned long *switch32_ptr; + unsigned char *stack16, *stack32; + unsigned long stack_top, stack_bot; + int len; + int cpu = smp_processor_id();
+ stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); + stack_top = stack_bot + CPU_16BIT_STACK_SIZE; + switch32_ptr = (unsigned long *)(stack_top - 8); + /* copy the data from 16bit stack to 32bit stack */ + len = CPU_16BIT_STACK_SIZE - 8 - sp; + stack16 = (unsigned char *)(stack_bot + sp); + stack32 = (unsigned char *) + (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); + memcpy(stack32, stack16, len); + return stack32; +} + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (in this case, + * local interrupts are disabled at the call-site in entry.S). + */ +asmlinkage void math_state_restore(struct pt_regs regs) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + clts(); /* Allow maths ops (or we recurse) */ + if (!tsk_used_math(tsk)) + init_fpu(tsk); + restore_fpu(tsk); + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ +} + +#ifndef CONFIG_MATH_EMULATION + +asmlinkage void math_emulate(long arg) +{ + printk("math-emulation not enabled and no coprocessor found.\n"); + printk("killing %s.\n",current->comm); + force_sig(SIGFPE,current); + schedule(); +} + +#endif /* CONFIG_MATH_EMULATION */ + +#ifdef CONFIG_X86_F00F_BUG +void __init trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. 
+ */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); +} +#endif + +#define _set_gate(gate_addr,type,dpl,addr,seg) \ +do { \ + int __d0, __d1; \ + __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ + "movw %4,%%dx\n\t" \ + "movl %%eax,%0\n\t" \ + "movl %%edx,%1" \ + :"=m" (*((long *) (gate_addr))), \ + "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ + :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ + "3" ((char *) (addr)),"2" ((seg) << 16)); \ +} while (0) + + +/* + * This needs to use 'idt_table' rather than 'idt', and + * thus use the _nonmapped_ version of the IDT, as the + * Pentium F0 0F bugfix can have resulted in the mapped + * IDT being write-protected. + */ +void set_intr_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); +} + +/* + * This routine sets up an interrupt gate at descriptor privilege level 3. + */ +static inline void set_system_intr_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS); +} + +static void __init set_trap_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); +} + +static void __init set_system_gate(unsigned int n, void *addr) +{ + _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); +} + +static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); +} + + +void __init trap_init(void) +{ +#ifdef CONFIG_EISA + void __iomem *p = ioremap(0x0FFFD9, 4); + if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { + EISA_bus = 1; + } + iounmap(p); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + init_apic_mappings(); +#endif + + set_trap_gate(0,&divide_error); + set_intr_gate(1,&debug); + set_intr_gate(2,&nmi); + set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ + set_system_gate(4,&overflow); + set_system_gate(5,&bounds); + set_trap_gate(6,&invalid_op); + set_trap_gate(7,&device_not_available); + 
set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS); + set_trap_gate(9,&coprocessor_segment_overrun); + set_trap_gate(10,&invalid_TSS); + set_trap_gate(11,&segment_not_present); + set_trap_gate(12,&stack_segment); + set_trap_gate(13,&general_protection); + set_intr_gate(14,&page_fault); + set_trap_gate(15,&spurious_interrupt_bug); + set_trap_gate(16,&coprocessor_error); + set_trap_gate(17,&alignment_check); +#ifdef CONFIG_X86_MCE + set_trap_gate(18,&machine_check); +#endif + set_trap_gate(19,&simd_coprocessor_error); + + set_system_gate(SYSCALL_VECTOR,&system_call); + + /* + * Should be a barrier for any external CPU state. + */ + cpu_init(); + + trap_init_hook(); +} + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("kstack=", kstack_setup); diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c new file mode 100644 index 000000000000..2f3d52dacff7 --- /dev/null +++ b/arch/i386/kernel/vm86.c @@ -0,0 +1,804 @@ +/* + * linux/kernel/vm86.c + * + * Copyright (C) 1994 Linus Torvalds + * + * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86 + * stack - Manfred Spraul <manfreds@colorfullife.com> + * + * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle + * them correctly. Now the emulation will be in a + * consistent state after stackfaults - Kasper Dupont + * <kasperd@daimi.au.dk> + * + * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont + * <kasperd@daimi.au.dk> + * + * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault + * caused by Kasper Dupont's changes - Stas Sergeev + * + * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes. + * Kasper Dupont <kasperd@daimi.au.dk> + * + * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault. + * Kasper Dupont <kasperd@daimi.au.dk> + * + * 9 apr 2002 - Changed stack access macros to jump to a label + * instead of returning to userspace. 
This simplifies + * do_int, and is needed by handle_vm86_fault. Kasper + * Dupont <kasperd@daimi.au.dk> + * + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/highmem.h> +#include <linux/ptrace.h> + +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/tlbflush.h> +#include <asm/irq.h> + +/* + * Known problems: + * + * Interrupt handling is not guaranteed: + * - a real x86 will disable all interrupts for one instruction + * after a "mov ss,xx" to make stack handling atomic even without + * the 'lss' instruction. We can't guarantee this in v86 mode, + * as the next instruction might result in a page fault or similar. + * - a real x86 will have interrupts disabled for one instruction + * past the 'sti' that enables them. We don't bother with all the + * details yet. + * + * Let's hope these problems do not actually matter for anything. + */ + + +#define KVM86 ((struct kernel_vm86_struct *)regs) +#define VMPI KVM86->vm86plus + + +/* + * 8- and 16-bit register defines.. 
+ */ +#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->eip)) +#define SP(regs) (*(unsigned short *)&((regs)->esp)) + +/* + * virtual flags (16 and 32-bit versions) + */ +#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) +#define VEFLAGS (current->thread.v86flags) + +#define set_flags(X,new,mask) \ +((X) = ((X) & ~(mask)) | ((new) & (mask))) + +#define SAFE_MASK (0xDD5) +#define RETURN_MASK (0xDFF) + +#define VM86_REGS_PART2 orig_eax +#define VM86_REGS_SIZE1 \ + ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) +#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) + +struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); +struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) +{ + struct tss_struct *tss; + struct pt_regs *ret; + unsigned long tmp; + + /* + * This gets called from entry.S with interrupts disabled, but + * from process context. Enable interrupts here, before trying + * to access user space. 
+ */ + local_irq_enable(); + + if (!current->thread.vm86_info) { + printk("no vm86_info: BAD\n"); + do_exit(SIGSEGV); + } + set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); + tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2, + &regs->VM86_REGS_PART2, VM86_REGS_SIZE2); + tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); + if (tmp) { + printk("vm86: could not access userspace vm86_info\n"); + do_exit(SIGSEGV); + } + + tss = &per_cpu(init_tss, get_cpu()); + current->thread.esp0 = current->thread.saved_esp0; + current->thread.sysenter_cs = __KERNEL_CS; + load_esp0(tss, &current->thread); + current->thread.saved_esp0 = 0; + put_cpu(); + + loadsegment(fs, current->thread.saved_fs); + loadsegment(gs, current->thread.saved_gs); + ret = KVM86->regs32; + return ret; +} + +static void mark_screen_rdonly(struct task_struct * tsk) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *mapped; + int i; + + preempt_disable(); + spin_lock(&tsk->mm->page_table_lock); + pgd = pgd_offset(tsk->mm, 0xA0000); + if (pgd_none_or_clear_bad(pgd)) + goto out; + pud = pud_offset(pgd, 0xA0000); + if (pud_none_or_clear_bad(pud)) + goto out; + pmd = pmd_offset(pud, 0xA0000); + if (pmd_none_or_clear_bad(pmd)) + goto out; + pte = mapped = pte_offset_map(pmd, 0xA0000); + for (i = 0; i < 32; i++) { + if (pte_present(*pte)) + set_pte(pte, pte_wrprotect(*pte)); + pte++; + } + pte_unmap(mapped); +out: + spin_unlock(&tsk->mm->page_table_lock); + preempt_enable(); + flush_tlb(); +} + + + +static int do_vm86_irq_handling(int subfunction, int irqnumber); +static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); + +asmlinkage int sys_vm86old(struct pt_regs regs) +{ + struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; + struct kernel_vm86_struct info; /* declare this _on top_, + * this avoids wasting of stack space. 
+ * This remains on the stack until we + * return to 32 bit user space. + */ + struct task_struct *tsk; + int tmp, ret = -EPERM; + + tsk = current; + if (tsk->thread.saved_esp0) + goto out; + tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); + tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, + (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); + ret = -EFAULT; + if (tmp) + goto out; + memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); + info.regs32 = &regs; + tsk->thread.vm86_info = v86; + do_sys_vm86(&info, tsk); + ret = 0; /* we never return here */ +out: + return ret; +} + + +asmlinkage int sys_vm86(struct pt_regs regs) +{ + struct kernel_vm86_struct info; /* declare this _on top_, + * this avoids wasting of stack space. + * This remains on the stack until we + * return to 32 bit user space. + */ + struct task_struct *tsk; + int tmp, ret; + struct vm86plus_struct __user *v86; + + tsk = current; + switch (regs.ebx) { + case VM86_REQUEST_IRQ: + case VM86_FREE_IRQ: + case VM86_GET_IRQ_BITS: + case VM86_GET_AND_RESET_IRQ: + ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); + goto out; + case VM86_PLUS_INSTALL_CHECK: + /* NOTE: on old vm86 stuff this will return the error + from verify_area(), because the subfunction is + interpreted as (invalid) address to vm86_struct. + So the installation check works. 
+ */ + ret = 0; + goto out; + } + + /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ + ret = -EPERM; + if (tsk->thread.saved_esp0) + goto out; + v86 = (struct vm86plus_struct __user *)regs.ecx; + tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); + tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, + (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); + ret = -EFAULT; + if (tmp) + goto out; + info.regs32 = &regs; + info.vm86plus.is_vm86pus = 1; + tsk->thread.vm86_info = (struct vm86_struct __user *)v86; + do_sys_vm86(&info, tsk); + ret = 0; /* we never return here */ +out: + return ret; +} + + +static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) +{ + struct tss_struct *tss; +/* + * make sure the vm86() system call doesn't try to do anything silly + */ + info->regs.__null_ds = 0; + info->regs.__null_es = 0; + +/* we are clearing fs,gs later just before "jmp resume_userspace", + * because starting with Linux 2.1.x they are no longer saved/restored + */ + +/* + * The eflags register is also special: we cannot trust that the user + * has set it up safely, so this makes sure interrupt etc flags are + * inherited from protected mode. 
+ */ + VEFLAGS = info->regs.eflags; + info->regs.eflags &= SAFE_MASK; + info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; + info->regs.eflags |= VM_MASK; + + switch (info->cpu_type) { + case CPU_286: + tsk->thread.v86mask = 0; + break; + case CPU_386: + tsk->thread.v86mask = NT_MASK | IOPL_MASK; + break; + case CPU_486: + tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK; + break; + default: + tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK; + break; + } + +/* + * Save old state, set default return value (%eax) to 0 + */ + info->regs32->eax = 0; + tsk->thread.saved_esp0 = tsk->thread.esp0; + asm volatile("movl %%fs,%0":"=m" (tsk->thread.saved_fs)); + asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs)); + + tss = &per_cpu(init_tss, get_cpu()); + tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + if (cpu_has_sep) + tsk->thread.sysenter_cs = 0; + load_esp0(tss, &tsk->thread); + put_cpu(); + + tsk->thread.screen_bitmap = info->screen_bitmap; + if (info->flags & VM86_SCREEN_BITMAP) + mark_screen_rdonly(tsk); + __asm__ __volatile__( + "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" + "movl %0,%%esp\n\t" + "movl %1,%%ebp\n\t" + "jmp resume_userspace" + : /* no outputs */ + :"r" (&info->regs), "r" (tsk->thread_info) : "ax"); + /* we never return here */ +} + +static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval) +{ + struct pt_regs * regs32; + + regs32 = save_v86_state(regs16); + regs32->eax = retval; + __asm__ __volatile__("movl %0,%%esp\n\t" + "movl %1,%%ebp\n\t" + "jmp resume_userspace" + : : "r" (regs32), "r" (current_thread_info())); +} + +static inline void set_IF(struct kernel_vm86_regs * regs) +{ + VEFLAGS |= VIF_MASK; + if (VEFLAGS & VIP_MASK) + return_to_32bit(regs, VM86_STI); +} + +static inline void clear_IF(struct kernel_vm86_regs * regs) +{ + VEFLAGS &= ~VIF_MASK; +} + +static inline void clear_TF(struct kernel_vm86_regs * regs) +{ + regs->eflags &= ~TF_MASK; +} + +static inline void 
clear_AC(struct kernel_vm86_regs * regs) +{ + regs->eflags &= ~AC_MASK; +} + +/* It is correct to call set_IF(regs) from the set_vflags_* + * functions. However someone forgot to call clear_IF(regs) + * in the opposite case. + * After the command sequence CLI PUSHF STI POPF you should + * end up with interrups disabled, but you ended up with + * interrupts enabled. + * ( I was testing my own changes, but the only bug I + * could find was in a function I had not changed. ) + * [KD] + */ + +static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) +{ + set_flags(VEFLAGS, eflags, current->thread.v86mask); + set_flags(regs->eflags, eflags, SAFE_MASK); + if (eflags & IF_MASK) + set_IF(regs); + else + clear_IF(regs); +} + +static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) +{ + set_flags(VFLAGS, flags, current->thread.v86mask); + set_flags(regs->eflags, flags, SAFE_MASK); + if (flags & IF_MASK) + set_IF(regs); + else + clear_IF(regs); +} + +static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) +{ + unsigned long flags = regs->eflags & RETURN_MASK; + + if (VEFLAGS & VIF_MASK) + flags |= IF_MASK; + flags |= IOPL_MASK; + return flags | (VEFLAGS & current->thread.v86mask); +} + +static inline int is_revectored(int nr, struct revectored_struct * bitmap) +{ + __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" + :"=r" (nr) + :"m" (*bitmap),"r" (nr)); + return nr; +} + +#define val_byte(val, n) (((__u8 *)&val)[n]) + +#define pushb(base, ptr, val, err_label) \ + do { \ + __u8 __val = val; \ + ptr--; \ + if (put_user(__val, base + ptr) < 0) \ + goto err_label; \ + } while(0) + +#define pushw(base, ptr, val, err_label) \ + do { \ + __u16 __val = val; \ + ptr--; \ + if (put_user(val_byte(__val, 1), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 0), base + ptr) < 0) \ + goto err_label; \ + } while(0) + +#define pushl(base, ptr, val, err_label) \ + do { \ + __u32 __val 
= val; \ + ptr--; \ + if (put_user(val_byte(__val, 3), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 2), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 1), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 0), base + ptr) < 0) \ + goto err_label; \ + } while(0) + +#define popb(base, ptr, err_label) \ + ({ \ + __u8 __res; \ + if (get_user(__res, base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +#define popw(base, ptr, err_label) \ + ({ \ + __u16 __res; \ + if (get_user(val_byte(__res, 0), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 1), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +#define popl(base, ptr, err_label) \ + ({ \ + __u32 __res; \ + if (get_user(val_byte(__res, 0), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 1), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 2), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 3), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +/* There are so many possible reasons for this function to return + * VM86_INTx, so adding another doesn't bother me. We can expect + * userspace programs to be able to handle it. (Getting a problem + * in userspace is always better than an Oops anyway.) 
[KD] + */ +static void do_int(struct kernel_vm86_regs *regs, int i, + unsigned char __user * ssp, unsigned short sp) +{ + unsigned long __user *intr_ptr; + unsigned long segoffs; + + if (regs->cs == BIOSSEG) + goto cannot_handle; + if (is_revectored(i, &KVM86->int_revectored)) + goto cannot_handle; + if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored)) + goto cannot_handle; + intr_ptr = (unsigned long __user *) (i << 2); + if (get_user(segoffs, intr_ptr)) + goto cannot_handle; + if ((segoffs >> 16) == BIOSSEG) + goto cannot_handle; + pushw(ssp, sp, get_vflags(regs), cannot_handle); + pushw(ssp, sp, regs->cs, cannot_handle); + pushw(ssp, sp, IP(regs), cannot_handle); + regs->cs = segoffs >> 16; + SP(regs) -= 6; + IP(regs) = segoffs & 0xffff; + clear_TF(regs); + clear_IF(regs); + clear_AC(regs); + return; + +cannot_handle: + return_to_32bit(regs, VM86_INTx + (i << 8)); +} + +int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno) +{ + if (VMPI.is_vm86pus) { + if ( (trapno==3) || (trapno==1) ) + return_to_32bit(regs, VM86_TRAP + (trapno << 8)); + do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs)); + return 0; + } + if (trapno !=1) + return 1; /* we let the calling routine handle this */ + if (current->ptrace & PT_PTRACED) { + unsigned long flags; + spin_lock_irqsave(&current->sighand->siglock, flags); + sigdelset(&current->blocked, SIGTRAP); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); + } + send_sig(SIGTRAP, current, 1); + current->thread.trap_no = trapno; + current->thread.error_code = error_code; + return 0; +} + +void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) +{ + unsigned char opcode; + unsigned char __user *csp; + unsigned char __user *ssp; + unsigned short ip, sp; + int data32, pref_done; + +#define CHECK_IF_IN_TRAP \ + if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ + newflags |= TF_MASK +#define VM86_FAULT_RETURN do { \ + if 
(VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \ + return_to_32bit(regs, VM86_PICRETURN); \ + return; } while (0) + + csp = (unsigned char __user *) (regs->cs << 4); + ssp = (unsigned char __user *) (regs->ss << 4); + sp = SP(regs); + ip = IP(regs); + + data32 = 0; + pref_done = 0; + do { + switch (opcode = popb(csp, ip, simulate_sigsegv)) { + case 0x66: /* 32-bit data */ data32=1; break; + case 0x67: /* 32-bit address */ break; + case 0x2e: /* CS */ break; + case 0x3e: /* DS */ break; + case 0x26: /* ES */ break; + case 0x36: /* SS */ break; + case 0x65: /* GS */ break; + case 0x64: /* FS */ break; + case 0xf2: /* repnz */ break; + case 0xf3: /* rep */ break; + default: pref_done = 1; + } + } while (!pref_done); + + switch (opcode) { + + /* pushf */ + case 0x9c: + if (data32) { + pushl(ssp, sp, get_vflags(regs), simulate_sigsegv); + SP(regs) -= 4; + } else { + pushw(ssp, sp, get_vflags(regs), simulate_sigsegv); + SP(regs) -= 2; + } + IP(regs) = ip; + VM86_FAULT_RETURN; + + /* popf */ + case 0x9d: + { + unsigned long newflags; + if (data32) { + newflags=popl(ssp, sp, simulate_sigsegv); + SP(regs) += 4; + } else { + newflags = popw(ssp, sp, simulate_sigsegv); + SP(regs) += 2; + } + IP(regs) = ip; + CHECK_IF_IN_TRAP; + if (data32) { + set_vflags_long(newflags, regs); + } else { + set_vflags_short(newflags, regs); + } + VM86_FAULT_RETURN; + } + + /* int xx */ + case 0xcd: { + int intno=popb(csp, ip, simulate_sigsegv); + IP(regs) = ip; + if (VMPI.vm86dbg_active) { + if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] ) + return_to_32bit(regs, VM86_INTx + (intno << 8)); + } + do_int(regs, intno, ssp, sp); + return; + } + + /* iret */ + case 0xcf: + { + unsigned long newip; + unsigned long newcs; + unsigned long newflags; + if (data32) { + newip=popl(ssp, sp, simulate_sigsegv); + newcs=popl(ssp, sp, simulate_sigsegv); + newflags=popl(ssp, sp, simulate_sigsegv); + SP(regs) += 12; + } else { + newip = popw(ssp, sp, simulate_sigsegv); + newcs = 
popw(ssp, sp, simulate_sigsegv); + newflags = popw(ssp, sp, simulate_sigsegv); + SP(regs) += 6; + } + IP(regs) = newip; + regs->cs = newcs; + CHECK_IF_IN_TRAP; + if (data32) { + set_vflags_long(newflags, regs); + } else { + set_vflags_short(newflags, regs); + } + VM86_FAULT_RETURN; + } + + /* cli */ + case 0xfa: + IP(regs) = ip; + clear_IF(regs); + VM86_FAULT_RETURN; + + /* sti */ + /* + * Damn. This is incorrect: the 'sti' instruction should actually + * enable interrupts after the /next/ instruction. Not good. + * + * Probably needs some horsing around with the TF flag. Aiee.. + */ + case 0xfb: + IP(regs) = ip; + set_IF(regs); + VM86_FAULT_RETURN; + + default: + return_to_32bit(regs, VM86_UNKNOWN); + } + + return; + +simulate_sigsegv: + /* FIXME: After a long discussion with Stas we finally + * agreed, that this is wrong. Here we should + * really send a SIGSEGV to the user program. + * But how do we create the correct context? We + * are inside a general protection fault handler + * and has just returned from a page fault handler. + * The correct context for the signal handler + * should be a mixture of the two, but how do we + * get the information? [KD] + */ + return_to_32bit(regs, VM86_UNKNOWN); +} + +/* ---------------- vm86 special IRQ passing stuff ----------------- */ + +#define VM86_IRQNAME "vm86irq" + +static struct vm86_irqs { + struct task_struct *tsk; + int sig; +} vm86_irqs[16]; + +static DEFINE_SPINLOCK(irqbits_lock); +static int irqbits; + +#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \ + | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \ + | (1 << SIGUNUSED) ) + +static irqreturn_t irq_handler(int intno, void *dev_id, struct pt_regs * regs) +{ + int irq_bit; + unsigned long flags; + + spin_lock_irqsave(&irqbits_lock, flags); + irq_bit = 1 << intno; + if ((irqbits & irq_bit) || ! 
vm86_irqs[intno].tsk) + goto out; + irqbits |= irq_bit; + if (vm86_irqs[intno].sig) + send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1); + spin_unlock_irqrestore(&irqbits_lock, flags); + /* + * IRQ will be re-enabled when user asks for the irq (whether + * polling or as a result of the signal) + */ + disable_irq(intno); + return IRQ_HANDLED; + +out: + spin_unlock_irqrestore(&irqbits_lock, flags); + return IRQ_NONE; +} + +static inline void free_vm86_irq(int irqnumber) +{ + unsigned long flags; + + free_irq(irqnumber, NULL); + vm86_irqs[irqnumber].tsk = NULL; + + spin_lock_irqsave(&irqbits_lock, flags); + irqbits &= ~(1 << irqnumber); + spin_unlock_irqrestore(&irqbits_lock, flags); +} + +void release_vm86_irqs(struct task_struct *task) +{ + int i; + for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++) + if (vm86_irqs[i].tsk == task) + free_vm86_irq(i); +} + +static inline int get_and_reset_irq(int irqnumber) +{ + int bit; + unsigned long flags; + + if (invalid_vm86_irq(irqnumber)) return 0; + if (vm86_irqs[irqnumber].tsk != current) return 0; + spin_lock_irqsave(&irqbits_lock, flags); + bit = irqbits & (1 << irqnumber); + irqbits &= ~bit; + spin_unlock_irqrestore(&irqbits_lock, flags); + if (!bit) + return 0; + enable_irq(irqnumber); + return 1; +} + + +static int do_vm86_irq_handling(int subfunction, int irqnumber) +{ + int ret; + switch (subfunction) { + case VM86_GET_AND_RESET_IRQ: { + return get_and_reset_irq(irqnumber); + } + case VM86_GET_IRQ_BITS: { + return irqbits; + } + case VM86_REQUEST_IRQ: { + int sig = irqnumber >> 8; + int irq = irqnumber & 255; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM; + if (invalid_vm86_irq(irq)) return -EPERM; + if (vm86_irqs[irq].tsk) return -EPERM; + ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL); + if (ret) return ret; + vm86_irqs[irq].sig = sig; + vm86_irqs[irq].tsk = current; + return irq; + } + case VM86_FREE_IRQ: { + if (invalid_vm86_irq(irqnumber)) 
return -EPERM; + if (!vm86_irqs[irqnumber].tsk) return 0; + if (vm86_irqs[irqnumber].tsk != current) return -EPERM; + free_vm86_irq(irqnumber); + return 0; + } + } + return -EINVAL; +} + diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S new file mode 100644 index 000000000000..e0512cc8bea7 --- /dev/null +++ b/arch/i386/kernel/vmlinux.lds.S @@ -0,0 +1,134 @@ +/* ld script to make i386 Linux kernel + * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; + */ + +#include <asm-generic/vmlinux.lds.h> +#include <asm/thread_info.h> +#include <asm/page.h> + +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(startup_32) +jiffies = jiffies_64; +SECTIONS +{ + . = __PAGE_OFFSET + 0x100000; + /* read-only */ + _text = .; /* Text and read-only data */ + .text : { + *(.text) + SCHED_TEXT + LOCK_TEXT + *(.fixup) + *(.gnu.warning) + } = 0x9090 + + _etext = .; /* End of text section */ + + . = ALIGN(16); /* Exception table */ + __start___ex_table = .; + __ex_table : { *(__ex_table) } + __stop___ex_table = .; + + RODATA + + /* writeable */ + .data : { /* Data */ + *(.data) + CONSTRUCTORS + } + + . = ALIGN(4096); + __nosave_begin = .; + .data_nosave : { *(.data.nosave) } + . = ALIGN(4096); + __nosave_end = .; + + . = ALIGN(4096); + .data.page_aligned : { *(.data.idt) } + + . = ALIGN(32); + .data.cacheline_aligned : { *(.data.cacheline_aligned) } + + _edata = .; /* End of data section */ + + . = ALIGN(THREAD_SIZE); /* init_task */ + .data.init_task : { *(.data.init_task) } + + /* will be freed after init */ + . = ALIGN(4096); /* Init code and data */ + __init_begin = .; + .init.text : { + _sinittext = .; + *(.init.text) + _einittext = .; + } + .init.data : { *(.init.data) } + . 
= ALIGN(16); + __setup_start = .; + .init.setup : { *(.init.setup) } + __setup_end = .; + __initcall_start = .; + .initcall.init : { + *(.initcall1.init) + *(.initcall2.init) + *(.initcall3.init) + *(.initcall4.init) + *(.initcall5.init) + *(.initcall6.init) + *(.initcall7.init) + } + __initcall_end = .; + __con_initcall_start = .; + .con_initcall.init : { *(.con_initcall.init) } + __con_initcall_end = .; + SECURITY_INIT + . = ALIGN(4); + __alt_instructions = .; + .altinstructions : { *(.altinstructions) } + __alt_instructions_end = .; + .altinstr_replacement : { *(.altinstr_replacement) } + /* .exit.text is discard at runtime, not link time, to deal with references + from .altinstructions and .eh_frame */ + .exit.text : { *(.exit.text) } + .exit.data : { *(.exit.data) } + . = ALIGN(4096); + __initramfs_start = .; + .init.ramfs : { *(.init.ramfs) } + __initramfs_end = .; + . = ALIGN(32); + __per_cpu_start = .; + .data.percpu : { *(.data.percpu) } + __per_cpu_end = .; + . = ALIGN(4096); + __init_end = .; + /* freed after init ends here */ + + __bss_start = .; /* BSS */ + .bss : { + *(.bss.page_aligned) + *(.bss) + } + . = ALIGN(4); + __bss_stop = .; + + _end = . ; + + /* This is where the kernel creates the early boot page tables */ + . = ALIGN(4096); + pg0 = .; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + } + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } +} diff --git a/arch/i386/kernel/vsyscall-int80.S b/arch/i386/kernel/vsyscall-int80.S new file mode 100644 index 000000000000..530d0525e5e2 --- /dev/null +++ b/arch/i386/kernel/vsyscall-int80.S @@ -0,0 +1,53 @@ +/* + * Code for the vsyscall page. This version uses the old int $0x80 method. + * + * NOTE: + * 1) __kernel_vsyscall _must_ be first in this page. 
+ * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S + * for details. + */ + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function +__kernel_vsyscall: +.LSTART_vsyscall: + int $0x80 + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + .align 4 +.LENDFDEDLSI: + .previous + +/* + * Get the common code for the sigreturn entry points. + */ +#include "vsyscall-sigreturn.S" diff --git a/arch/i386/kernel/vsyscall-sigreturn.S b/arch/i386/kernel/vsyscall-sigreturn.S new file mode 100644 index 000000000000..c8fcf75b9be3 --- /dev/null +++ b/arch/i386/kernel/vsyscall-sigreturn.S @@ -0,0 +1,142 @@ +/* + * Common code for the sigreturn entry points on the vsyscall page. + * So far this code is the same for both int80 and sysenter versions. + * This file is #include'd by vsyscall-*.S to define them after the + * vsyscall entry point. The kernel assumes that the addresses of these + * routines are constant for all vsyscall implementations. + */ + +#include <asm/unistd.h> +#include <asm/asm_offsets.h> + + +/* XXX + Should these be named "_sigtramp" or something? 
+*/ + + .text + .org __kernel_vsyscall+32 + .globl __kernel_sigreturn + .type __kernel_sigreturn,@function +__kernel_sigreturn: +.LSTART_sigreturn: + popl %eax /* XXX does this mean it needs unwind info? */ + movl $__NR_sigreturn, %eax + int $0x80 +.LEND_sigreturn: + .size __kernel_sigreturn,.-.LSTART_sigreturn + + .balign 32 + .globl __kernel_rt_sigreturn + .type __kernel_rt_sigreturn,@function +__kernel_rt_sigreturn: +.LSTART_rt_sigreturn: + movl $__NR_rt_sigreturn, %eax + int $0x80 +.LEND_rt_sigreturn: + .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI1: + .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 +.LSTARTCIEDLSI1: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0 /* DW_CFA_nop */ + .align 4 +.LENDCIEDLSI1: + .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ +.LSTARTFDEDLSI1: + .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: The dwarf2 unwind routines will subtract 1 from the + return address to get an address in the middle of the + presumed call instruction. Since we didn't get here via + a call, we need to include the nop before the real start + to make up for it. */ + .long .LSTART_sigreturn-1-. /* PC-relative start address */ + .long .LEND_sigreturn-.LSTART_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + complicated by the fact that the "CFA" is always assumed to + be the value of the stack pointer in the caller. This means + that we must define the CFA of this body of code to be the + saved value of the stack pointer in the sigcontext. 
Which + also means that there is no fixed relation to the other + saved registers, which means that we must use DW_CFA_expression + to compute their addresses. It also means that when we + adjust the stack with the popl, we have to do it all over again. */ + +#define do_cfa_expr(offset) \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ + .byte 0x06; /* DW_OP_deref */ \ +1: + +#define do_expr(regno, offset) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ +1: + + do_cfa_expr(SIGCONTEXT_esp+4) + do_expr(0, SIGCONTEXT_eax+4) + do_expr(1, SIGCONTEXT_ecx+4) + do_expr(2, SIGCONTEXT_edx+4) + do_expr(3, SIGCONTEXT_ebx+4) + do_expr(5, SIGCONTEXT_ebp+4) + do_expr(6, SIGCONTEXT_esi+4) + do_expr(7, SIGCONTEXT_edi+4) + do_expr(8, SIGCONTEXT_eip+4) + + .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ + + do_cfa_expr(SIGCONTEXT_esp) + do_expr(0, SIGCONTEXT_eax) + do_expr(1, SIGCONTEXT_ecx) + do_expr(2, SIGCONTEXT_edx) + do_expr(3, SIGCONTEXT_ebx) + do_expr(5, SIGCONTEXT_ebp) + do_expr(6, SIGCONTEXT_esi) + do_expr(7, SIGCONTEXT_edi) + do_expr(8, SIGCONTEXT_eip) + + .align 4 +.LENDFDEDLSI1: + + .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ +.LSTARTFDEDLSI2: + .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: See above wrt unwind library assumptions. */ + .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ + .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + slightly less complicated than the above, since we don't + modify the stack pointer in the process. 
*/ + + do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp) + do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax) + do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx) + do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx) + do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx) + do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp) + do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi) + do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi) + do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip) + + .align 4 +.LENDFDEDLSI2: + .previous diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S new file mode 100644 index 000000000000..4daefb2ec1b2 --- /dev/null +++ b/arch/i386/kernel/vsyscall-sysenter.S @@ -0,0 +1,104 @@ +/* + * Code for the vsyscall page. This version uses the sysenter instruction. + * + * NOTE: + * 1) __kernel_vsyscall _must_ be first in this page. + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S + * for details. + */ + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function +__kernel_vsyscall: +.LSTART_vsyscall: + push %ecx +.Lpush_ecx: + push %edx +.Lpush_edx: + push %ebp +.Lenter_kernel: + movl %esp,%ebp + sysenter + + /* 7: align return point with nop's to make disassembly easier */ + .space 7,0x90 + + /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ + jmp .Lenter_kernel + /* 16: System call normal return point is here! */ + .globl SYSENTER_RETURN /* Symbol used by entry.S. 
*/ +SYSENTER_RETURN: + pop %ebp +.Lpop_ebp: + pop %edx +.Lpop_edx: + pop %ecx +.Lpop_ecx: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpush_ecx-.LSTART_vsyscall + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpush_edx-.Lpush_ecx + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lenter_kernel-.Lpush_edx + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x10 /* RA at offset 16 now */ + .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ + /* Finally the epilogue. 
*/ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpop_ebp-.Lenter_kernel + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpop_edx-.Lpop_ebp + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpop_ecx-.Lpop_edx + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x04 /* RA at offset 4 now */ + .align 4 +.LENDFDEDLSI: + .previous + +/* + * Get the common code for the sigreturn entry points. + */ +#include "vsyscall-sigreturn.S" diff --git a/arch/i386/kernel/vsyscall.S b/arch/i386/kernel/vsyscall.S new file mode 100644 index 000000000000..b403890fe39b --- /dev/null +++ b/arch/i386/kernel/vsyscall.S @@ -0,0 +1,15 @@ +#include <linux/init.h> + +__INITDATA + + .globl vsyscall_int80_start, vsyscall_int80_end +vsyscall_int80_start: + .incbin "arch/i386/kernel/vsyscall-int80.so" +vsyscall_int80_end: + + .globl vsyscall_sysenter_start, vsyscall_sysenter_end +vsyscall_sysenter_start: + .incbin "arch/i386/kernel/vsyscall-sysenter.so" +vsyscall_sysenter_end: + +__FINIT diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S new file mode 100644 index 000000000000..3a8329d6536e --- /dev/null +++ b/arch/i386/kernel/vsyscall.lds.S @@ -0,0 +1,65 @@ +/* + * Linker script for vsyscall DSO. The vsyscall page is an ELF shared + * object prelinked to its virtual address, and with only one read-only + * segment (that fits in one page). This script controls its layout. + */ +#include <asm/asm_offsets.h> + +SECTIONS +{ + . = VSYSCALL_BASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + /* This linker script is used both with -r and with -shared. 
+ For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ + . = VSYSCALL_BASE + 0x400; + + .text : { *(.text) } :text =0x90909090 + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .dynamic : { *(.dynamic) } :text :dynamic + .useless : { + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + } :text +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + + local: *; + }; +} + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile new file mode 100644 index 000000000000..7b1932d20f96 --- /dev/null +++ b/arch/i386/lib/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for i386-specific library files.. 
+# + + +lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \ + bitops.o + +lib-$(CONFIG_X86_USE_3DNOW) += mmx.o +lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o diff --git a/arch/i386/lib/bitops.c b/arch/i386/lib/bitops.c new file mode 100644 index 000000000000..97db3853dc82 --- /dev/null +++ b/arch/i386/lib/bitops.c @@ -0,0 +1,70 @@ +#include <linux/bitops.h> +#include <linux/module.h> + +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @size: The maximum size to search, in bits + * @offset: The bitnumber to start searching at + * + * Returns the bit number of the first set bit at or after @offset. + * Bits left in the word that contains @offset are scanned inline; the + * following words are handed to find_first_bit() and its result is + * added to the bit number of the word it started from. + */ +int find_next_bit(const unsigned long *addr, int size, int offset) +{ + const unsigned long *p = addr + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for a set bit in the rest of the first word: bsfl gives + * the index of the lowest set bit of (*p >> bit), and the movl + * substitutes 32 when that value is zero. + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - addr)); + return (offset + set + res); +} +EXPORT_SYMBOL(find_next_bit); + +/** + * find_next_zero_bit - find the next zero bit in a memory region + * @addr: The address to base the search on + * @size: The maximum size to search, in bits + * @offset: The bitnumber to start searching at + * + * Same scheme as find_next_bit(), applied to the complement of the + * data. 
The bits shifted in at the top of (*p >> bit) are zero and + * become ones once inverted, so an index >= 32 - bit from the scan + * means the rest of the first word held no zero bit. + */ +int find_next_zero_bit(const unsigned long *addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for zero in the first 32 bits. 
+ */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (~(*p >> bit))); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No zero yet, search remaining full words for a zero + */ + res = find_first_zero_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + set + res); +} +EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/i386/lib/checksum.S b/arch/i386/lib/checksum.S new file mode 100644 index 000000000000..94c7867ddc33 --- /dev/null +++ b/arch/i386/lib/checksum.S @@ -0,0 +1,496 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <asm/errno.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + +.text +.align 4 +.globl csum_partial + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. 
We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: unsigned char *buff + testl $3, %esi # Check alignment. + jz 2f # Jump if alignment is ok. + testl $1, %esi # Check alignment. + jz 10f # Jump if alignment is boundary of 2bytes. + + # buf is odd + dec %ecx + jl 8f + movzbl (%esi), %ebx + adcl %ebx, %eax + roll $8, %eax + inc %esi + testl $2, %esi + jz 2f +10: + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +1: movw (%esi), %bx + addl $2, %esi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, %edx + shrl $5, %ecx + jz 2f + testl %esi, %esi +1: movl (%esi), %ebx + adcl %ebx, %eax + movl 4(%esi), %ebx + adcl %ebx, %eax + movl 8(%esi), %ebx + adcl %ebx, %eax + movl 12(%esi), %ebx + adcl %ebx, %eax + movl 16(%esi), %ebx + adcl %ebx, %eax + movl 20(%esi), %ebx + adcl %ebx, %eax + movl 24(%esi), %ebx + adcl %ebx, %eax + movl 28(%esi), %ebx + adcl %ebx, %eax + lea 32(%esi), %esi + dec %ecx + jne 1b + adcl $0, %eax +2: movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +3: adcl (%esi), %eax + lea 4(%esi), %esi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f + movw (%esi),%cx + leal 2(%esi),%esi + je 6f + shll $16,%ecx +5: movb (%esi),%cl +6: addl %ecx,%eax + adcl $0, %eax +7: + testl $1, 12(%esp) + jz 8f + roll $8, %eax +8: + popl %ebx + popl %esi + ret + +#else + +/* Version for PentiumII/PPro */ + +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: const unsigned char *buf + + 
testl $3, %esi + jnz 25f +10: + movl %ecx, %edx + movl %ecx, %ebx + andl $0x7c, %ebx + shrl $7, %ecx + addl %ebx,%esi + shrl $2, %ebx + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + + # Handle 2-byte-aligned regions +20: addw (%esi), %ax + lea 2(%esi), %esi + adcl $0, %eax + jmp 10b +25: + testl $1, %esi + jz 30f + # buf is odd + dec %ecx + jl 90f + movzbl (%esi), %ebx + addl %ebx, %eax + adcl $0, %eax + roll $8, %eax + inc %esi + testl $2, %esi + jz 10b + +30: subl $2, %ecx + ja 20b + je 32f + addl $2, %ecx + jz 80f + movzbl (%esi),%ebx # csumming 1 byte, 2-aligned + addl %ebx, %eax + adcl $0, %eax + jmp 80f +32: + addw (%esi), %ax # csumming 2 bytes, 2-aligned + adcl $0, %eax + jmp 80f + +40: + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +45: + lea 128(%esi), %esi + adcl $0, %eax + dec %ecx + jge 40b + movl %edx, %ecx +50: andl $3, %ecx + jz 80f + + # Handle the last 1-3 bytes without jumping + notl %ecx # 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff,%ebx # by the shll and shrl instructions + shll $3,%ecx + shrl %cl,%ebx + andl -128(%esi),%ebx # esi is 4-aligned so should be ok + addl %ebx,%eax + adcl $0,%eax +80: + testl $1, 12(%esp) + jz 90f + roll $8, %eax +90: + popl %ebx + popl %esi + ret + +#endif + +/* +unsigned int 
csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +.align 4 +.globl csum_partial_copy_generic + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + +#define ARGBASE 16 +#define FP 12 + +csum_partial_copy_generic: + subl $4,%esp + pushl %edi + pushl %esi + pushl %ebx + movl ARGBASE+16(%esp),%eax # sum + movl ARGBASE+12(%esp),%ecx # len + movl ARGBASE+4(%esp),%esi # src + movl ARGBASE+8(%esp),%edi # dst + + testl $2, %edi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. 
+ jmp 4f +SRC(1: movw (%esi), %bx ) + addl $2, %esi +DST( movw %bx, (%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, FP(%esp) + shrl $5, %ecx + jz 2f + testl %esi, %esi +SRC(1: movl (%esi), %ebx ) +SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + adcl %edx, %eax +DST( movl %edx, 4(%edi) ) + +SRC( movl 8(%esi), %ebx ) +SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 8(%edi) ) + adcl %edx, %eax +DST( movl %edx, 12(%edi) ) + +SRC( movl 16(%esi), %ebx ) +SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 16(%edi) ) + adcl %edx, %eax +DST( movl %edx, 20(%edi) ) + +SRC( movl 24(%esi), %ebx ) +SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 24(%edi) ) + adcl %edx, %eax +DST( movl %edx, 28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi + dec %ecx + jne 1b + adcl $0, %eax +2: movl FP(%esp), %edx + movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx +SRC(5: movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +6: addl %ecx, %eax + adcl $0, %eax +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + + # zero the complete destination - computing the rest + # is too much work + movl ARGBASE+8(%esp), %edi # dst + movl ARGBASE+12(%esp), %ecx # len + xorl %eax,%eax + rep ; stosb + + jmp 5000b + +6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT,(%ebx) + jmp 5000b + +.previous + + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp + ret + +#else + +/* Version for PentiumII/PPro */ + +#define ROUND1(x) \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, 
%eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ROUND(x) \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ARGBASE 12 + +csum_partial_copy_generic: + pushl %ebx + pushl %edi + pushl %esi + movl ARGBASE+4(%esp),%esi #src + movl ARGBASE+8(%esp),%edi #dst + movl ARGBASE+12(%esp),%ecx #len + movl ARGBASE+16(%esp),%eax #sum +# movl %ecx, %edx + movl %ecx, %ebx + movl %esi, %edx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + lea -1(%esi),%edx + andl $-32,%edx + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi + jmp *%ebx +1: addl $64,%esi + addl $64,%edi + SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) + ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) +3: adcl $0,%eax + addl $64, %edx + dec %ecx + jge 1b +4: movl ARGBASE+12(%esp),%edx #len + andl $3, %edx + jz 7f + cmpl $2, %edx + jb 5f +SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +DST( movw %dx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx +5: +SRC( movb (%esi), %dl ) +DST( movb %dl, (%edi) ) +6: addl %edx, %eax + adcl $0, %eax +7: +.section .fixup, "ax" +6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len + xorl %eax,%eax + rep; stosb + jmp 7b +6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 7b +.previous + + popl %esi + popl %edi + popl %ebx + ret + +#undef ROUND +#undef ROUND1 + +#endif diff --git a/arch/i386/lib/dec_and_lock.c b/arch/i386/lib/dec_and_lock.c new file mode 100644 index 000000000000..ab43394dc775 --- /dev/null +++ b/arch/i386/lib/dec_and_lock.c @@ -0,0 +1,40 @@ +/* + * x86 version of "atomic_dec_and_lock()" using + * the atomic "cmpxchg" instruction. 
+ * + * (For CPU's lacking cmpxchg, we use the slow + * generic version, and this one never even gets + * compiled). + */ + +#include <linux/spinlock.h> +#include <asm/atomic.h> + +/* + * Decrement *atomic, taking *lock only if the count would reach zero. + * + * Returns 1 with the lock held when the decrement brought the count + * to zero, and 0 with the lock not held otherwise. + */ +int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +{ + int counter; + int newcount; + +repeat: + counter = atomic_read(atomic); + newcount = counter-1; + + if (!newcount) + goto slow_path; + + /* + * cmpxchg writes the counter, so it is listed as an output as well + * as an input, and the "memory" clobber keeps the compiler from + * caching memory values across the update. + */ + asm volatile("lock; cmpxchgl %2,%1" + :"=a" (newcount), "=m" (atomic->counter) + :"r" (newcount), "m" (atomic->counter), "0" (counter) + :"memory"); + + /* If the above failed, "eax" will have changed */ + if (newcount != counter) + goto repeat; + return 0; + +slow_path: + spin_lock(lock); + if (atomic_dec_and_test(atomic)) + return 1; + spin_unlock(lock); + return 0; +} diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c new file mode 100644 index 000000000000..080639f262b1 --- /dev/null +++ b/arch/i386/lib/delay.c @@ -0,0 +1,49 @@ +/* + * Precise Delay Loops for i386 + * + * Copyright (C) 1993 Linus Torvalds + * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * The __delay function must _NOT_ be inlined as its execution time + * depends wildly on alignment on many x86 processors. The additional + * jump magic is needed to get the timing stable on all the CPU's + * we have to worry about. 
+ */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <asm/processor.h> +#include <asm/delay.h> +#include <asm/timer.h> + +#ifdef CONFIG_SMP +#include <asm/smp.h> +#endif + +/* + * NOTE(review): nothing in this file uses this symbol; __delay() goes + * through cur_timer instead. Confirm whether the declaration is stale. + */ +extern struct timer_opts* timer; + +/* + * Hand @loops to the delay hook of cur_timer. + */ +void __delay(unsigned long loops) +{ + cur_timer->delay(loops); +} + +/* + * Delay for @xloops, a fraction of a second scaled by 2**32 (see the + * constants used by __udelay() and __ndelay()). The mull forms the + * 64-bit product of 4 * @xloops and loops_per_jiffy * (HZ/4); only the + * high 32 bits, left in edx, are kept, and that value plus one is + * passed to __delay(). + */ +inline void __const_udelay(unsigned long xloops) +{ + int d0; + xloops *= 4; + __asm__("mull %0" + :"=d" (xloops), "=&a" (d0) + :"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4))); + __delay(++xloops); +} + +/* + * Delay for @usecs microseconds: scale to the 2**32 fixed-point unit + * expected by __const_udelay(). + */ +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} + +/* + * Delay for @nsecs nanoseconds: scale to the 2**32 fixed-point unit + * expected by __const_udelay(). + */ +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} diff --git a/arch/i386/lib/getuser.S b/arch/i386/lib/getuser.S new file mode 100644 index 000000000000..62d7f178a326 --- /dev/null +++ b/arch/i386/lib/getuser.S @@ -0,0 +1,70 @@ +/* + * __get_user functions. + * + * (C) Copyright 1998 Linus Torvalds + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. 
+ */ +#include <asm/thread_info.h> + + +/* + * __get_user_X + * + * Inputs: %eax contains the address + * + * Outputs: %eax is error code (0 or -EFAULT) + * %edx contains zero-extended value + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. 
+ */ + +.text +.align 4 +.globl __get_user_1 +__get_user_1: + GET_THREAD_INFO(%edx) + cmpl TI_addr_limit(%edx),%eax + jae bad_get_user +1: movzbl (%eax),%edx + xorl %eax,%eax + ret + +.align 4 +.globl __get_user_2 +__get_user_2: + addl $1,%eax + jc bad_get_user + GET_THREAD_INFO(%edx) + cmpl TI_addr_limit(%edx),%eax + jae bad_get_user +2: movzwl -1(%eax),%edx + xorl %eax,%eax + ret + +.align 4 +.globl __get_user_4 +__get_user_4: + addl $3,%eax + jc bad_get_user + GET_THREAD_INFO(%edx) + cmpl TI_addr_limit(%edx),%eax + jae bad_get_user +3: movl -3(%eax),%edx + xorl %eax,%eax + ret + +bad_get_user: + xorl %edx,%edx + movl $-14,%eax /* -EFAULT */ + ret + +.section __ex_table,"a" + .long 1b,bad_get_user + .long 2b,bad_get_user + .long 3b,bad_get_user +.previous diff --git a/arch/i386/lib/memcpy.c b/arch/i386/lib/memcpy.c new file mode 100644 index 000000000000..891b2359d18a --- /dev/null +++ b/arch/i386/lib/memcpy.c @@ -0,0 +1,44 @@ +#include <linux/config.h> +#include <linux/string.h> +#include <linux/module.h> + +#undef memcpy +#undef memset + +/* + * Function version of memcpy (the macro of the same name is removed by + * the #undef above). Forwards to __memcpy3d() when + * CONFIG_X86_USE_3DNOW is set and to __memcpy() otherwise. + */ +void *memcpy(void *to, const void *from, size_t n) +{ +#ifdef CONFIG_X86_USE_3DNOW + return __memcpy3d(to, from, n); +#else + return __memcpy(to, from, n); +#endif +} +EXPORT_SYMBOL(memcpy); + +/* + * Function version of memset; forwards to __memset(). + */ +void *memset(void *s, int c, size_t count) +{ + return __memset(s, c, count); +} +EXPORT_SYMBOL(memset); + +/* + * Copy @n bytes from @src to @dest, where the regions may overlap. + * Returns @dest. + * + * When @dest is below @src the copy is delegated to memcpy(); this + * relies on memcpy() copying in ascending address order. + * NOTE(review): confirm that for the __memcpy3d() path. 
+ * Otherwise the bytes are copied from the last one downwards: std sets + * the direction flag so that rep movsb walks both pointers backwards, + * and cld clears the flag again afterwards. + */ +void *memmove(void *dest, const void *src, size_t n) +{ + int d0, d1, d2; + + if (dest < src) { + memcpy(dest,src,n); + } else { + __asm__ __volatile__( + "std\n\t" + "rep\n\t" + "movsb\n\t" + "cld" + : "=&c" (d0), "=&S" (d1), "=&D" (d2) + :"0" (n), + "1" (n-1+(const char *)src), + "2" (n-1+(char *)dest) + :"memory"); + } + return dest; +} +EXPORT_SYMBOL(memmove); diff --git a/arch/i386/lib/mmx.c b/arch/i386/lib/mmx.c new file mode 100644 index 000000000000..01f8b1a2cc84 --- /dev/null +++ b/arch/i386/lib/mmx.c @@ -0,0 +1,399 @@ +#include <linux/config.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/sched.h> +#include 
<linux/hardirq.h> + +#include <asm/i387.h> + + +/* + * MMX 3DNow! library helper functions + * + * To do: + * We can use MMX just for prefetch in IRQ's. This may be a win. + * (reported so on K6-III) + * We should use a better code neutral filler for the short jump + * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? + * We also want to clobber the filler register so we don't get any + * register forwarding stalls on the filler. + * + * Add *user handling. Checksums are not a win with MMX on any CPU + * tested so far for any MMX solution figured. + * + * 22/09/2000 - Arjan van de Ven + * Improved for non-egineering-sample Athlons + * + */ + +void *_mmx_memcpy(void *to, const void *from, size_t len) +{ + void *p; + int i; + + if (unlikely(in_interrupt())) + return __memcpy(to, from, len); + + p = to; + i = len >> 6; /* len/64 */ + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + "1: prefetch (%0)\n" /* This set is 28 bytes */ + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + "2: \n" + ".section .fixup, \"ax\"\n" + "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from) ); + + + for(; i>5; i--) + { + __asm__ __volatile__ ( + "1: prefetch 320(%0)\n" + "2: movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from), "r" (to) : "memory"); + from+=64; + 
to+=64; + } + + for(; i>0; i--) + { + __asm__ __volatile__ ( + " movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from+=64; + to+=64; + } + /* + * Now do the tail of the block + */ + __memcpy(to, from, len&63); + kernel_fpu_end(); + return p; +} + +#ifdef CONFIG_MK7 + +/* + * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and + * other MMX using processors do not. + */ + +static void fast_clear_page(void *page) +{ + int i; + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + " pxor %%mm0, %%mm0\n" : : + ); + + for(i=0;i<4096/64;i++) + { + __asm__ __volatile__ ( + " movntq %%mm0, (%0)\n" + " movntq %%mm0, 8(%0)\n" + " movntq %%mm0, 16(%0)\n" + " movntq %%mm0, 24(%0)\n" + " movntq %%mm0, 32(%0)\n" + " movntq %%mm0, 40(%0)\n" + " movntq %%mm0, 48(%0)\n" + " movntq %%mm0, 56(%0)\n" + : : "r" (page) : "memory"); + page+=64; + } + /* since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again. + */ + __asm__ __volatile__ ( + " sfence \n" : : + ); + kernel_fpu_end(); +} + +static void fast_copy_page(void *to, void *from) +{ + int i; + + kernel_fpu_begin(); + + /* maybe the prefetch stuff can go before the expensive fnsave... + * but that is for later. 
-AV + */ + __asm__ __volatile__ ( + "1: prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + "2: \n" + ".section .fixup, \"ax\"\n" + "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from) ); + + for(i=0; i<(4096-320)/64; i++) + { + __asm__ __volatile__ ( + "1: prefetch 320(%0)\n" + "2: movq (%0), %%mm0\n" + " movntq %%mm0, (%1)\n" + " movq 8(%0), %%mm1\n" + " movntq %%mm1, 8(%1)\n" + " movq 16(%0), %%mm2\n" + " movntq %%mm2, 16(%1)\n" + " movq 24(%0), %%mm3\n" + " movntq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm4\n" + " movntq %%mm4, 32(%1)\n" + " movq 40(%0), %%mm5\n" + " movntq %%mm5, 40(%1)\n" + " movq 48(%0), %%mm6\n" + " movntq %%mm6, 48(%1)\n" + " movq 56(%0), %%mm7\n" + " movntq %%mm7, 56(%1)\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from), "r" (to) : "memory"); + from+=64; + to+=64; + } + for(i=(4096-320)/64; i<4096/64; i++) + { + __asm__ __volatile__ ( + "2: movq (%0), %%mm0\n" + " movntq %%mm0, (%1)\n" + " movq 8(%0), %%mm1\n" + " movntq %%mm1, 8(%1)\n" + " movq 16(%0), %%mm2\n" + " movntq %%mm2, 16(%1)\n" + " movq 24(%0), %%mm3\n" + " movntq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm4\n" + " movntq %%mm4, 32(%1)\n" + " movq 40(%0), %%mm5\n" + " movntq %%mm5, 40(%1)\n" + " movq 48(%0), %%mm6\n" + " movntq %%mm6, 48(%1)\n" + " movq 56(%0), %%mm7\n" + " movntq %%mm7, 56(%1)\n" + : : "r" (from), "r" (to) : "memory"); + from+=64; + to+=64; + } + /* since movntq is weakly-ordered, a "sfence" is needed to become + * ordered again. 
+ */ + __asm__ __volatile__ ( + " sfence \n" : : + ); + kernel_fpu_end(); +} + +#else + +/* + * Generic MMX implementation without K7 specific streaming + */ + +static void fast_clear_page(void *page) +{ + int i; + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + " pxor %%mm0, %%mm0\n" : : + ); + + for(i=0;i<4096/128;i++) + { + __asm__ __volatile__ ( + " movq %%mm0, (%0)\n" + " movq %%mm0, 8(%0)\n" + " movq %%mm0, 16(%0)\n" + " movq %%mm0, 24(%0)\n" + " movq %%mm0, 32(%0)\n" + " movq %%mm0, 40(%0)\n" + " movq %%mm0, 48(%0)\n" + " movq %%mm0, 56(%0)\n" + " movq %%mm0, 64(%0)\n" + " movq %%mm0, 72(%0)\n" + " movq %%mm0, 80(%0)\n" + " movq %%mm0, 88(%0)\n" + " movq %%mm0, 96(%0)\n" + " movq %%mm0, 104(%0)\n" + " movq %%mm0, 112(%0)\n" + " movq %%mm0, 120(%0)\n" + : : "r" (page) : "memory"); + page+=128; + } + + kernel_fpu_end(); +} + +static void fast_copy_page(void *to, void *from) +{ + int i; + + + kernel_fpu_begin(); + + __asm__ __volatile__ ( + "1: prefetch (%0)\n" + " prefetch 64(%0)\n" + " prefetch 128(%0)\n" + " prefetch 192(%0)\n" + " prefetch 256(%0)\n" + "2: \n" + ".section .fixup, \"ax\"\n" + "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" (from) ); + + for(i=0; i<4096/64; i++) + { + __asm__ __volatile__ ( + "1: prefetch 320(%0)\n" + "2: movq (%0), %%mm0\n" + " movq 8(%0), %%mm1\n" + " movq 16(%0), %%mm2\n" + " movq 24(%0), %%mm3\n" + " movq %%mm0, (%1)\n" + " movq %%mm1, 8(%1)\n" + " movq %%mm2, 16(%1)\n" + " movq %%mm3, 24(%1)\n" + " movq 32(%0), %%mm0\n" + " movq 40(%0), %%mm1\n" + " movq 48(%0), %%mm2\n" + " movq 56(%0), %%mm3\n" + " movq %%mm0, 32(%1)\n" + " movq %%mm1, 40(%1)\n" + " movq %%mm2, 48(%1)\n" + " movq %%mm3, 56(%1)\n" + ".section .fixup, \"ax\"\n" + "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b, 3b\n" + ".previous" + : : "r" 
(from), "r" (to) : "memory"); + from+=64; + to+=64; + } + kernel_fpu_end(); +} + + +#endif + +/* + * Favour MMX for page clear and copy. + */ + +static void slow_zero_page(void * page) +{ + int d0, d1; + __asm__ __volatile__( \ + "cld\n\t" \ + "rep ; stosl" \ + : "=&c" (d0), "=&D" (d1) + :"a" (0),"1" (page),"0" (1024) + :"memory"); +} + +void mmx_clear_page(void * page) +{ + if(unlikely(in_interrupt())) + slow_zero_page(page); + else + fast_clear_page(page); +} + +static void slow_copy_page(void *to, void *from) +{ + int d0, d1, d2; + __asm__ __volatile__( \ + "cld\n\t" \ + "rep ; movsl" \ + : "=&c" (d0), "=&D" (d1), "=&S" (d2) \ + : "0" (1024),"1" ((long) to),"2" ((long) from) \ + : "memory"); +} + + +void mmx_copy_page(void *to, void *from) +{ + if(unlikely(in_interrupt())) + slow_copy_page(to, from); + else + fast_copy_page(to, from); +} diff --git a/arch/i386/lib/putuser.S b/arch/i386/lib/putuser.S new file mode 100644 index 000000000000..a32d9f570f48 --- /dev/null +++ b/arch/i386/lib/putuser.S @@ -0,0 +1,87 @@ +/* + * __put_user functions. + * + * (C) Copyright 2005 Linus Torvalds + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. + */ +#include <asm/thread_info.h> + + +/* + * __put_user_X + * + * Inputs: %eax[:%edx] contains the data + * %ecx contains the address + * + * Outputs: %eax is error code (0 or -EFAULT) + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. 
+ */ + +#define ENTER pushl %ebx ; GET_THREAD_INFO(%ebx) +#define EXIT popl %ebx ; ret + +.text +.align 4 +.globl __put_user_1 +__put_user_1: + ENTER + cmpl TI_addr_limit(%ebx),%ecx + jae bad_put_user +1: movb %al,(%ecx) + xorl %eax,%eax + EXIT + +.align 4 +.globl __put_user_2 +__put_user_2: + ENTER + movl TI_addr_limit(%ebx),%ebx + subl $1,%ebx + cmpl %ebx,%ecx + jae bad_put_user +2: movw %ax,(%ecx) + xorl %eax,%eax + EXIT + +.align 4 +.globl __put_user_4 +__put_user_4: + ENTER + movl TI_addr_limit(%ebx),%ebx + subl $3,%ebx + cmpl %ebx,%ecx + jae bad_put_user +3: movl %eax,(%ecx) + xorl %eax,%eax + EXIT + +.align 4 +.globl __put_user_8 +__put_user_8: + ENTER + movl TI_addr_limit(%ebx),%ebx + subl $7,%ebx + cmpl %ebx,%ecx + jae bad_put_user +4: movl %eax,(%ecx) +5: movl %edx,4(%ecx) + xorl %eax,%eax + EXIT + +bad_put_user: + movl $-14,%eax + EXIT + +.section __ex_table,"a" + .long 1b,bad_put_user + .long 2b,bad_put_user + .long 3b,bad_put_user + .long 4b,bad_put_user + .long 5b,bad_put_user +.previous diff --git a/arch/i386/lib/strstr.c b/arch/i386/lib/strstr.c new file mode 100644 index 000000000000..a3dafbf59dae --- /dev/null +++ b/arch/i386/lib/strstr.c @@ -0,0 +1,31 @@ +#include <linux/string.h> + +char * strstr(const char * cs,const char * ct) +{ +int d0, d1; +register char * __res; +__asm__ __volatile__( + "movl %6,%%edi\n\t" + "repne\n\t" + "scasb\n\t" + "notl %%ecx\n\t" + "decl %%ecx\n\t" /* NOTE! 
This also sets Z if searchstring='' */ + "movl %%ecx,%%edx\n" + "1:\tmovl %6,%%edi\n\t" + "movl %%esi,%%eax\n\t" + "movl %%edx,%%ecx\n\t" + "repe\n\t" + "cmpsb\n\t" + "je 2f\n\t" /* also works for empty string, see above */ + "xchgl %%eax,%%esi\n\t" + "incl %%esi\n\t" + "cmpb $0,-1(%%eax)\n\t" + "jne 1b\n\t" + "xorl %%eax,%%eax\n\t" + "2:" + :"=a" (__res), "=&c" (d0), "=&S" (d1) + :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) + :"dx", "di"); +return __res; +} + diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c new file mode 100644 index 000000000000..51aa2bbb0269 --- /dev/null +++ b/arch/i386/lib/usercopy.c @@ -0,0 +1,636 @@ +/* + * User address space access functions. + * The non inlined parts of asm-i386/uaccess.h are here. + * + * Copyright 1997 Andi Kleen <ak@muc.de> + * Copyright 1997 Linus Torvalds + */ +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/mmx.h> + +static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) +{ +#ifdef CONFIG_X86_INTEL_USERCOPY + if (n >= 64 && ((a1 ^ a2) & movsl_mask.mask)) + return 0; +#endif + return 1; +} +#define movsl_is_ok(a1,a2,n) \ + __movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n)) + +/* + * Copy a null terminated string from userspace. 
+ */ + +#define __do_strncpy_from_user(dst,src,count,res) \ +do { \ + int __d0, __d1, __d2; \ + might_sleep(); \ + __asm__ __volatile__( \ + " testl %1,%1\n" \ + " jz 2f\n" \ + "0: lodsb\n" \ + " stosb\n" \ + " testb %%al,%%al\n" \ + " jz 1f\n" \ + " decl %1\n" \ + " jnz 0b\n" \ + "1: subl %1,%0\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl %5,%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 0b,3b\n" \ + ".previous" \ + : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ + "=&D" (__d2) \ + : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ + : "memory"); \ +} while (0) + +/** + * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @src: Source address, in user space. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from userspace to kernel space. + * Caller must check the specified block with access_ok() before calling + * this function. + * + * On success, returns the length of the string (not including the trailing + * NUL). + * + * If access to userspace fails, returns -EFAULT (some data may have been + * copied). + * + * If @count is smaller than the length of the string, copies @count bytes + * and returns @count. + */ +long +__strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res; + __do_strncpy_from_user(dst, src, count, res); + return res; +} + +/** + * strncpy_from_user: - Copy a NUL terminated string from userspace. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @src: Source address, in user space. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from userspace to kernel space. 
+ * + * On success, returns the length of the string (not including the trailing + * NUL). + * + * If access to userspace fails, returns -EFAULT (some data may have been + * copied). + * + * If @count is smaller than the length of the string, copies @count bytes + * and returns @count. + */ +long +strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res = -EFAULT; + if (access_ok(VERIFY_READ, src, 1)) + __do_strncpy_from_user(dst, src, count, res); + return res; +} + + +/* + * Zero Userspace + */ + +#define __do_clear_user(addr,size) \ +do { \ + int __d0; \ + might_sleep(); \ + __asm__ __volatile__( \ + "0: rep; stosl\n" \ + " movl %2,%0\n" \ + "1: rep; stosb\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: lea 0(%2,%0,4),%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 0b,3b\n" \ + " .long 1b,2b\n" \ + ".previous" \ + : "=&c"(size), "=&D" (__d0) \ + : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ +} while (0) + +/** + * clear_user: - Zero a block of memory in user space. + * @to: Destination address, in user space. + * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ +unsigned long +clear_user(void __user *to, unsigned long n) +{ + might_sleep(); + if (access_ok(VERIFY_WRITE, to, n)) + __do_clear_user(to, n); + return n; +} + +/** + * __clear_user: - Zero a block of memory in user space, with less checking. + * @to: Destination address, in user space. + * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. 
+ */ +unsigned long +__clear_user(void __user *to, unsigned long n) +{ + __do_clear_user(to, n); + return n; +} + +/** + * strnlen_user: - Get the size of a string in user space. + * @s: The string to measure. + * @n: The maximum valid length + * + * Get the size of a NUL-terminated string in user space. + * + * Returns the size of the string INCLUDING the terminating NUL. + * On exception, returns 0. + * If the string is too long, returns a value greater than @n. + */ +long strnlen_user(const char __user *s, long n) +{ + unsigned long mask = -__addr_ok(s); + unsigned long res, tmp; + + might_sleep(); + + __asm__ __volatile__( + " testl %0, %0\n" + " jz 3f\n" + " andl %0,%%ecx\n" + "0: repne; scasb\n" + " setne %%al\n" + " subl %%ecx,%0\n" + " addl %0,%%eax\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: xorl %%eax,%%eax\n" + " jmp 1b\n" + "3: movb $1,%%al\n" + " jmp 1b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 0b,2b\n" + ".previous" + :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp) + :"0" (n), "1" (s), "2" (0), "3" (mask) + :"cc"); + return res & mask; +} + +#ifdef CONFIG_X86_INTEL_USERCOPY +static unsigned long +__copy_user_intel(void __user *to, const void *from, unsigned long size) +{ + int d0, d1; + __asm__ __volatile__( + " .align 2,0x90\n" + "1: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" + " jbe 3f\n" + "2: movl 64(%4), %%eax\n" + " .align 2,0x90\n" + "3: movl 0(%4), %%eax\n" + "4: movl 4(%4), %%edx\n" + "5: movl %%eax, 0(%3)\n" + "6: movl %%edx, 4(%3)\n" + "7: movl 8(%4), %%eax\n" + "8: movl 12(%4),%%edx\n" + "9: movl %%eax, 8(%3)\n" + "10: movl %%edx, 12(%3)\n" + "11: movl 16(%4), %%eax\n" + "12: movl 20(%4), %%edx\n" + "13: movl %%eax, 16(%3)\n" + "14: movl %%edx, 20(%3)\n" + "15: movl 24(%4), %%eax\n" + "16: movl 28(%4), %%edx\n" + "17: movl %%eax, 24(%3)\n" + "18: movl %%edx, 28(%3)\n" + "19: movl 32(%4), %%eax\n" + "20: movl 36(%4), %%edx\n" + "21: movl %%eax, 32(%3)\n" + "22: movl %%edx, 36(%3)\n" + "23: movl 40(%4), 
%%eax\n" + "24: movl 44(%4), %%edx\n" + "25: movl %%eax, 40(%3)\n" + "26: movl %%edx, 44(%3)\n" + "27: movl 48(%4), %%eax\n" + "28: movl 52(%4), %%edx\n" + "29: movl %%eax, 48(%3)\n" + "30: movl %%edx, 52(%3)\n" + "31: movl 56(%4), %%eax\n" + "32: movl 60(%4), %%edx\n" + "33: movl %%eax, 56(%3)\n" + "34: movl %%edx, 60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" + " cmpl $63, %0\n" + " ja 1b\n" + "35: movl %0, %%eax\n" + " shrl $2, %0\n" + " andl $3, %%eax\n" + " cld\n" + "99: rep; movsl\n" + "36: movl %%eax, %0\n" + "37: rep; movsb\n" + "100:\n" + ".section .fixup,\"ax\"\n" + "101: lea 0(%%eax,%0,4),%0\n" + " jmp 100b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,100b\n" + " .long 2b,100b\n" + " .long 3b,100b\n" + " .long 4b,100b\n" + " .long 5b,100b\n" + " .long 6b,100b\n" + " .long 7b,100b\n" + " .long 8b,100b\n" + " .long 9b,100b\n" + " .long 10b,100b\n" + " .long 11b,100b\n" + " .long 12b,100b\n" + " .long 13b,100b\n" + " .long 14b,100b\n" + " .long 15b,100b\n" + " .long 16b,100b\n" + " .long 17b,100b\n" + " .long 18b,100b\n" + " .long 19b,100b\n" + " .long 20b,100b\n" + " .long 21b,100b\n" + " .long 22b,100b\n" + " .long 23b,100b\n" + " .long 24b,100b\n" + " .long 25b,100b\n" + " .long 26b,100b\n" + " .long 27b,100b\n" + " .long 28b,100b\n" + " .long 29b,100b\n" + " .long 30b,100b\n" + " .long 31b,100b\n" + " .long 32b,100b\n" + " .long 33b,100b\n" + " .long 34b,100b\n" + " .long 35b,100b\n" + " .long 36b,100b\n" + " .long 37b,100b\n" + " .long 99b,101b\n" + ".previous" + : "=&c"(size), "=&D" (d0), "=&S" (d1) + : "1"(to), "2"(from), "0"(size) + : "eax", "edx", "memory"); + return size; +} + +static unsigned long +__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) +{ + int d0, d1; + __asm__ __volatile__( + " .align 2,0x90\n" + "0: movl 32(%4), %%eax\n" + " cmpl $67, %0\n" + " jbe 2f\n" + "1: movl 64(%4), %%eax\n" + " .align 2,0x90\n" + "2: movl 0(%4), %%eax\n" + "21: movl 
4(%4), %%edx\n" + " movl %%eax, 0(%3)\n" + " movl %%edx, 4(%3)\n" + "3: movl 8(%4), %%eax\n" + "31: movl 12(%4),%%edx\n" + " movl %%eax, 8(%3)\n" + " movl %%edx, 12(%3)\n" + "4: movl 16(%4), %%eax\n" + "41: movl 20(%4), %%edx\n" + " movl %%eax, 16(%3)\n" + " movl %%edx, 20(%3)\n" + "10: movl 24(%4), %%eax\n" + "51: movl 28(%4), %%edx\n" + " movl %%eax, 24(%3)\n" + " movl %%edx, 28(%3)\n" + "11: movl 32(%4), %%eax\n" + "61: movl 36(%4), %%edx\n" + " movl %%eax, 32(%3)\n" + " movl %%edx, 36(%3)\n" + "12: movl 40(%4), %%eax\n" + "71: movl 44(%4), %%edx\n" + " movl %%eax, 40(%3)\n" + " movl %%edx, 44(%3)\n" + "13: movl 48(%4), %%eax\n" + "81: movl 52(%4), %%edx\n" + " movl %%eax, 48(%3)\n" + " movl %%edx, 52(%3)\n" + "14: movl 56(%4), %%eax\n" + "91: movl 60(%4), %%edx\n" + " movl %%eax, 56(%3)\n" + " movl %%edx, 60(%3)\n" + " addl $-64, %0\n" + " addl $64, %4\n" + " addl $64, %3\n" + " cmpl $63, %0\n" + " ja 0b\n" + "5: movl %0, %%eax\n" + " shrl $2, %0\n" + " andl $3, %%eax\n" + " cld\n" + "6: rep; movsl\n" + " movl %%eax,%0\n" + "7: rep; movsb\n" + "8:\n" + ".section .fixup,\"ax\"\n" + "9: lea 0(%%eax,%0,4),%0\n" + "16: pushl %0\n" + " pushl %%eax\n" + " xorl %%eax,%%eax\n" + " rep; stosb\n" + " popl %%eax\n" + " popl %0\n" + " jmp 8b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 0b,16b\n" + " .long 1b,16b\n" + " .long 2b,16b\n" + " .long 21b,16b\n" + " .long 3b,16b\n" + " .long 31b,16b\n" + " .long 4b,16b\n" + " .long 41b,16b\n" + " .long 10b,16b\n" + " .long 51b,16b\n" + " .long 11b,16b\n" + " .long 61b,16b\n" + " .long 12b,16b\n" + " .long 71b,16b\n" + " .long 13b,16b\n" + " .long 81b,16b\n" + " .long 14b,16b\n" + " .long 91b,16b\n" + " .long 6b,9b\n" + " .long 7b,16b\n" + ".previous" + : "=&c"(size), "=&D" (d0), "=&S" (d1) + : "1"(to), "2"(from), "0"(size) + : "eax", "edx", "memory"); + return size; +} +#else +/* + * Leave these declared but undefined. 
There should not be any references to + * them + */ +unsigned long +__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size); +unsigned long +__copy_user_intel(void __user *to, const void *from, unsigned long size); +#endif /* CONFIG_X86_INTEL_USERCOPY */ + +/* Generic arbitrary sized copy. */ +#define __copy_user(to,from,size) \ +do { \ + int __d0, __d1, __d2; \ + __asm__ __volatile__( \ + " cmp $7,%0\n" \ + " jbe 1f\n" \ + " movl %1,%0\n" \ + " negl %0\n" \ + " andl $7,%0\n" \ + " subl %0,%3\n" \ + "4: rep; movsb\n" \ + " movl %3,%0\n" \ + " shrl $2,%0\n" \ + " andl $3,%3\n" \ + " .align 2,0x90\n" \ + "0: rep; movsl\n" \ + " movl %3,%0\n" \ + "1: rep; movsb\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "5: addl %3,%0\n" \ + " jmp 2b\n" \ + "3: lea 0(%3,%0,4),%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 4b,5b\n" \ + " .long 0b,3b\n" \ + " .long 1b,2b\n" \ + ".previous" \ + : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ + : "3"(size), "0"(size), "1"(to), "2"(from) \ + : "memory"); \ +} while (0) + +#define __copy_user_zeroing(to,from,size) \ +do { \ + int __d0, __d1, __d2; \ + __asm__ __volatile__( \ + " cmp $7,%0\n" \ + " jbe 1f\n" \ + " movl %1,%0\n" \ + " negl %0\n" \ + " andl $7,%0\n" \ + " subl %0,%3\n" \ + "4: rep; movsb\n" \ + " movl %3,%0\n" \ + " shrl $2,%0\n" \ + " andl $3,%3\n" \ + " .align 2,0x90\n" \ + "0: rep; movsl\n" \ + " movl %3,%0\n" \ + "1: rep; movsb\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "5: addl %3,%0\n" \ + " jmp 6f\n" \ + "3: lea 0(%3,%0,4),%0\n" \ + "6: pushl %0\n" \ + " pushl %%eax\n" \ + " xorl %%eax,%%eax\n" \ + " rep; stosb\n" \ + " popl %%eax\n" \ + " popl %0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 4b,5b\n" \ + " .long 0b,3b\n" \ + " .long 1b,6b\n" \ + ".previous" \ + : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ + : "3"(size), "0"(size), "1"(to), "2"(from) \ + : 
"memory"); \ +} while (0) + + +unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) +{ + BUG_ON((long) n < 0); +#ifndef CONFIG_X86_WP_WORKS_OK + if (unlikely(boot_cpu_data.wp_works_ok == 0) && + ((unsigned long )to) < TASK_SIZE) { + /* + * CPU does not honor the WP bit when writing + * from supervisory mode, and due to preemption or SMP, + * the page tables can change at any time. + * Do it manually. Manfred <manfred@colorfullife.com> + */ + while (n) { + unsigned long offset = ((unsigned long)to)%PAGE_SIZE; + unsigned long len = PAGE_SIZE - offset; + int retval; + struct page *pg; + void *maddr; + + if (len > n) + len = n; + +survive: + down_read(¤t->mm->mmap_sem); + retval = get_user_pages(current, current->mm, + (unsigned long )to, 1, 1, 0, &pg, NULL); + + if (retval == -ENOMEM && current->pid == 1) { + up_read(¤t->mm->mmap_sem); + blk_congestion_wait(WRITE, HZ/50); + goto survive; + } + + if (retval != 1) { + up_read(¤t->mm->mmap_sem); + break; + } + + maddr = kmap_atomic(pg, KM_USER0); + memcpy(maddr + offset, from, len); + kunmap_atomic(maddr, KM_USER0); + set_page_dirty_lock(pg); + put_page(pg); + up_read(¤t->mm->mmap_sem); + + from += len; + to += len; + n -= len; + } + return n; + } +#endif + if (movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else + n = __copy_user_intel(to, from, n); + return n; +} + +unsigned long +__copy_from_user_ll(void *to, const void __user *from, unsigned long n) +{ + BUG_ON((long)n < 0); + if (movsl_is_ok(to, from, n)) + __copy_user_zeroing(to, from, n); + else + n = __copy_user_zeroing_intel(to, from, n); + return n; +} + +/** + * copy_to_user: - Copy a block of data into user space. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep. + * + * Copy data from kernel space to user space. + * + * Returns number of bytes that could not be copied. 
+ * On success, this will be zero. + */ +unsigned long +copy_to_user(void __user *to, const void *from, unsigned long n) +{ + might_sleep(); + BUG_ON((long) n < 0); + if (access_ok(VERIFY_WRITE, to, n)) + n = __copy_to_user(to, from, n); + return n; +} +EXPORT_SYMBOL(copy_to_user); + +/** + * copy_from_user: - Copy a block of data from user space. + * @to: Destination address, in kernel space. + * @from: Source address, in user space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep. + * + * Copy data from user space to kernel space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + * + * If some data could not be copied, this function will pad the copied + * data to the requested size using zero bytes. + */ +unsigned long +copy_from_user(void *to, const void __user *from, unsigned long n) +{ + might_sleep(); + BUG_ON((long) n < 0); + if (access_ok(VERIFY_READ, from, n)) + n = __copy_from_user(to, from, n); + else + memset(to, 0, n); + return n; +} +EXPORT_SYMBOL(copy_from_user); diff --git a/arch/i386/mach-default/Makefile b/arch/i386/mach-default/Makefile new file mode 100644 index 000000000000..e95bb0237921 --- /dev/null +++ b/arch/i386/mach-default/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the linux kernel. 
+# + +obj-y := setup.o topology.o diff --git a/arch/i386/mach-default/setup.c b/arch/i386/mach-default/setup.c new file mode 100644 index 000000000000..0aa08eaa8932 --- /dev/null +++ b/arch/i386/mach-default/setup.c @@ -0,0 +1,106 @@ +/* + * Machine specific setup for generic + */ + +#include <linux/config.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <asm/acpi.h> +#include <asm/arch_hooks.h> + +/** + * pre_intr_init_hook - initialisation prior to setting up interrupt vectors + * + * Description: + * Perform any necessary interrupt initialisation prior to setting up + * the "ordinary" interrupt call gates. For legacy reasons, the ISA + * interrupts should be initialised here if the machine emulates a PC + * in any way. + **/ +void __init pre_intr_init_hook(void) +{ + init_ISA_irqs(); +} + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; + +/** + * intr_init_hook - post gate setup interrupt initialisation + * + * Description: + * Fill in any interrupts that may have been left out by the general + * init_IRQ() routine. interrupts having to do with the machine rather + * than the devices on the I/O bus (like APIC interrupts in intel MP + * systems) are started here. + **/ +void __init intr_init_hook(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + apic_intr_init(); +#endif + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} + +/** + * pre_setup_arch_hook - hook called prior to any setup_arch() execution + * + * Description: + * generally used to activate any machine specific identification + * routines that may be needed before setup_arch() runs. On VISWS + * this is used to get the board revision and type. + **/ +void __init pre_setup_arch_hook(void) +{ +} + +/** + * trap_init_hook - initialise system specific traps + * + * Description: + * Called as the final act of trap_init(). 
Used in VISWS to initialise + * the various board specific APIC traps. + **/ +void __init trap_init_hook(void) +{ +} + +static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; + +/** + * time_init_hook - do any specific initialisations for the system timer. + * + * Description: + * Must plug the system timer interrupt source at HZ into the IRQ listed + * in irq_vectors.h:TIMER_IRQ + **/ +void __init time_init_hook(void) +{ + setup_irq(0, &irq0); +} + +#ifdef CONFIG_MCA +/** + * mca_nmi_hook - hook into MCA specific NMI chain + * + * Description: + * The MCA (Microchannel Architecture) has an NMI chain for NMI sources + * along the MCA bus. Use this to hook into that chain if you will need + * it. + **/ +void __init mca_nmi_hook(void) +{ + /* If I recall correctly, there's a whole bunch of other things that + * we can do to check for NMI problems, but that's all I know about + * at the moment. + */ + + printk("NMI generated from unknown source!\n"); +} +#endif diff --git a/arch/i386/mach-default/topology.c b/arch/i386/mach-default/topology.c new file mode 100644 index 000000000000..5b3e8817dae8 --- /dev/null +++ b/arch/i386/mach-default/topology.c @@ -0,0 +1,98 @@ +/* + * arch/i386/mach-generic/topology.c - Populate driverfs with topology information + * + * Written by: Matthew Dobson, IBM Corporation + * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to <colpatch@us.ibm.com> + */ +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/nodemask.h> +#include <asm/cpu.h> + +static struct i386_cpu cpu_devices[NR_CPUS]; + +int arch_register_cpu(int num){ + struct node *parent = NULL; + +#ifdef CONFIG_NUMA + int node = cpu_to_node(num); + if (node_online(node)) + parent = &node_devices[node].node; +#endif /* CONFIG_NUMA */ + + return register_cpu(&cpu_devices[num].cpu, num, parent); +} + +#ifdef CONFIG_HOTPLUG_CPU + +void arch_unregister_cpu(int num) { + struct node *parent = NULL; + +#ifdef CONFIG_NUMA + int node = cpu_to_node(num); + if (node_online(node)) + parent = &node_devices[node].node; +#endif /* CONFIG_NUMA */ + + return unregister_cpu(&cpu_devices[num].cpu, parent); +} +EXPORT_SYMBOL(arch_register_cpu); +EXPORT_SYMBOL(arch_unregister_cpu); +#endif /*CONFIG_HOTPLUG_CPU*/ + + + +#ifdef CONFIG_NUMA +#include <linux/mmzone.h> +#include <asm/node.h> + +struct i386_node node_devices[MAX_NUMNODES]; + +static int __init topology_init(void) +{ + int i; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (node_online(i)) + arch_register_node(i); + } + for (i = 0; i < NR_CPUS; i++) + if (cpu_possible(i)) arch_register_cpu(i); + return 0; +} + +#else /* !CONFIG_NUMA */ + +static int __init topology_init(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_possible(i)) arch_register_cpu(i); + return 0; +} + +#endif /* CONFIG_NUMA */ + +subsys_initcall(topology_init); diff --git a/arch/i386/mach-es7000/Makefile b/arch/i386/mach-es7000/Makefile new file mode 100644 index 000000000000..69dd4da218dc --- /dev/null +++ b/arch/i386/mach-es7000/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the linux kernel. 
+# + +obj-$(CONFIG_X86_ES7000) := es7000plat.o +obj-$(CONFIG_X86_GENERICARCH) := es7000plat.o diff --git a/arch/i386/mach-es7000/es7000.h b/arch/i386/mach-es7000/es7000.h new file mode 100644 index 000000000000..70691f0c4ce2 --- /dev/null +++ b/arch/i386/mach-es7000/es7000.h @@ -0,0 +1,110 @@ +/* + * Written by: Garry Forsgren, Unisys Corporation + * Natalie Protasevich, Unisys Corporation + * This file contains the code to configure and interface + * with Unisys ES7000 series hardware system manager. + * + * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Unisys Corporation, Township Line & Union Meeting + * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: + * + * http://www.unisys.com + */ + +#define MIP_REG 1 +#define MIP_PSAI_REG 4 + +#define MIP_BUSY 1 +#define MIP_SPIN 0xf0000 +#define MIP_VALID 0x0100000000000000ULL +#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) + +#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) + +struct mip_reg_info { + unsigned long long mip_info; + unsigned long long delivery_info; + unsigned long long host_reg; + unsigned long long mip_reg; +}; + +struct part_info { + unsigned char type; + unsigned char length; + unsigned char part_id; + unsigned char apic_mode; + unsigned long snum; + char ptype[16]; + char sname[64]; + char pname[64]; +}; + +struct psai { + unsigned long long entry_type; + unsigned long long addr; + unsigned long long bep_addr; +}; + +struct es7000_mem_info { + unsigned char type; + unsigned char length; + unsigned char resv[6]; + unsigned long long start; + unsigned long long size; +}; + +struct es7000_oem_table { + unsigned long long hdr; + struct mip_reg_info mip; + struct part_info pif; + struct es7000_mem_info shm; + struct psai psai; +}; + +struct acpi_table_sdt { + unsigned long pa; + unsigned long count; + struct { + unsigned long pa; + enum acpi_table_id id; + unsigned long size; + } entry[50]; +}; + +struct oem_table { + struct acpi_table_header Header; + u32 OEMTableAddr; + u32 OEMTableSize; +}; + +struct mip_reg { + unsigned long long off_0; + unsigned long long off_8; + unsigned long long off_10; + unsigned long long off_18; + unsigned long long off_20; + unsigned long long off_28; + unsigned long long off_30; + unsigned long long off_38; +}; + +#define MIP_SW_APIC 0x1020b +#define MIP_FUNC(VALUE) (VALUE & 0xff) + +extern int parse_unisys_oem (char *oemptr, int oem_entries); +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length); +extern int es7000_start_cpu(int cpu, unsigned long eip); +extern 
void es7000_sw_apic(void); diff --git a/arch/i386/mach-es7000/es7000plat.c b/arch/i386/mach-es7000/es7000plat.c new file mode 100644 index 000000000000..d5936d500479 --- /dev/null +++ b/arch/i386/mach-es7000/es7000plat.c @@ -0,0 +1,316 @@ +/* + * Written by: Garry Forsgren, Unisys Corporation + * Natalie Protasevich, Unisys Corporation + * This file contains the code to configure and interface + * with Unisys ES7000 series hardware system manager. + * + * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+ * + * Contact information: Unisys Corporation, Township Line & Union Meeting + * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: + * + * http://www.unisys.com + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <asm/io.h> +#include <asm/nmi.h> +#include <asm/smp.h> +#include <asm/apicdef.h> +#include "es7000.h" + +/* + * ES7000 Globals + */ + +volatile unsigned long *psai = NULL; +struct mip_reg *mip_reg; +struct mip_reg *host_reg; +int mip_port; +unsigned long mip_addr, host_addr; + +#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_ACPI_BOOT)) + +/* + * GSI override for ES7000 platforms. + */ + +static unsigned int base; + +static int +es7000_rename_gsi(int ioapic, int gsi) +{ + if (!base) { + int i; + for (i = 0; i < nr_ioapics; i++) + base += nr_ioapic_registers[i]; + } + + if (!ioapic && (gsi < 16)) + gsi += base; + return gsi; +} + +#endif // (CONFIG_X86_IO_APIC) && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT) + +/* + * Parse the OEM Table + */ + +int __init +parse_unisys_oem (char *oemptr, int oem_entries) +{ + int i; + int success = 0; + unsigned char type, size; + unsigned long val; + char *tp = NULL; + struct psai *psaip = NULL; + struct mip_reg_info *mi; + struct mip_reg *host, *mip; + + tp = oemptr; + + tp += 8; + + for (i=0; i <= oem_entries; i++) { + type = *tp++; + size = *tp++; + tp -= 2; + switch (type) { + case MIP_REG: + mi = (struct mip_reg_info *)tp; + val = MIP_RD_LO(mi->host_reg); + host_addr = val; + host = (struct mip_reg *)val; + host_reg = __va(host); + val = MIP_RD_LO(mi->mip_reg); + mip_port = MIP_PORT(mi->mip_info); + mip_addr = val; + mip = (struct mip_reg *)val; + mip_reg = __va(mip); + Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", + 
(unsigned long)host_reg); + Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", + (unsigned long)mip_reg); + success++; + break; + case MIP_PSAI_REG: + psaip = (struct psai *)tp; + if (tp != NULL) { + if (psaip->addr) + psai = __va(psaip->addr); + else + psai = NULL; + success++; + } + break; + default: + break; + } + if (i == 6) break; + tp += size; + } + + if (success < 2) { + es7000_plat = 0; + } else { + printk("\nEnabling ES7000 specific features...\n"); + /* + * Determine the generation of the ES7000 currently running. + * + * es7000_plat = 0 if the machine is NOT a Unisys ES7000 box + * es7000_plat = 1 if the machine is a 5xx ES7000 box + * es7000_plat = 2 if the machine is a x86_64 ES7000 box + * + */ + if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) + es7000_plat = 2; + else + es7000_plat = 1; + + ioapic_renumber_irq = es7000_rename_gsi; + } + return es7000_plat; +} + +int __init +find_unisys_acpi_oem_table(unsigned long *oem_addr, int *length) +{ + struct acpi_table_rsdp *rsdp = NULL; + unsigned long rsdp_phys = 0; + struct acpi_table_header *header = NULL; + int i; + struct acpi_table_sdt sdt; + + rsdp_phys = acpi_find_rsdp(); + rsdp = __va(rsdp_phys); + if (rsdp->rsdt_address) { + struct acpi_table_rsdt *mapped_rsdt = NULL; + sdt.pa = rsdp->rsdt_address; + + header = (struct acpi_table_header *) + __acpi_map_table(sdt.pa, sizeof(struct acpi_table_header)); + if (!header) + return -ENODEV; + + sdt.count = (header->length - sizeof(struct acpi_table_header)) >> 3; + mapped_rsdt = (struct acpi_table_rsdt *) + __acpi_map_table(sdt.pa, header->length); + if (!mapped_rsdt) + return -ENODEV; + + header = &mapped_rsdt->header; + + for (i = 0; i < sdt.count; i++) + sdt.entry[i].pa = (unsigned long) mapped_rsdt->entry[i]; + }; + for (i = 0; i < sdt.count; i++) { + + header = (struct acpi_table_header *) + __acpi_map_table(sdt.entry[i].pa, + sizeof(struct acpi_table_header)); + if (!header) + continue; + if (!strncmp((char *) &header->signature, "OEM1", 
4)) { + if (!strncmp((char *) &header->oem_id, "UNISYS", 6)) { + void *addr; + struct oem_table *t; + acpi_table_print(header, sdt.entry[i].pa); + t = (struct oem_table *) __acpi_map_table(sdt.entry[i].pa, header->length); + addr = (void *) __acpi_map_table(t->OEMTableAddr, t->OEMTableSize); + *length = header->length; + *oem_addr = (unsigned long) addr; + return 0; + } + } + } + Dprintk("ES7000: did not find Unisys ACPI OEM table!\n"); + return -1; +} + +static void +es7000_spin(int n) +{ + int i = 0; + + while (i++ < n) + rep_nop(); +} + +static int __init +es7000_mip_write(struct mip_reg *mip_reg) +{ + int status = 0; + int spin; + + spin = MIP_SPIN; + while (((unsigned long long)host_reg->off_38 & + (unsigned long long)MIP_VALID) != 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); + outb(1, mip_port); + + spin = MIP_SPIN; + + while (((unsigned long long)mip_reg->off_38 & + (unsigned long long)MIP_VALID) == 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + status = ((unsigned long long)mip_reg->off_0 & + (unsigned long long)0xffff0000000000ULL) >> 48; + mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & + (unsigned long long)~MIP_VALID); + return status; +} + +int +es7000_start_cpu(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; + +} + +int +es7000_stop_cpu(int cpu) +{ + int startup; + + if (psai == NULL) + return -1; + + startup= (0x1000000 | cpu); + + while ((*psai & 0xff00ffff) != startup) + ; + + startup = (*psai & 0xff0000) >> 16; + *psai &= 0xffffff; + + return 0; + +} + +void __init +es7000_sw_apic() +{ + 
if (es7000_plat) { + int mip_status; + struct mip_reg es7000_mip_reg; + + printk("ES7000: Enabling APIC mode.\n"); + memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); + es7000_mip_reg.off_0 = MIP_SW_APIC; + es7000_mip_reg.off_38 = (MIP_VALID); + while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) + printk("es7000_sw_apic: command failed, status = %x\n", + mip_status); + return; + } +} diff --git a/arch/i386/mach-generic/Makefile b/arch/i386/mach-generic/Makefile new file mode 100644 index 000000000000..77fbc9f64fbc --- /dev/null +++ b/arch/i386/mach-generic/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the generic architecture +# + +EXTRA_CFLAGS += -I../kernel + +obj-y := probe.o summit.o bigsmp.o es7000.o default.o ../mach-es7000/ diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c new file mode 100644 index 000000000000..25883b44f625 --- /dev/null +++ b/arch/i386/mach-generic/bigsmp.c @@ -0,0 +1,54 @@ +/* + * APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs. + * Drives the local APIC in "clustered mode". 
+ */ +#define APIC_DEFINITION 1 +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <asm/mpspec.h> +#include <asm/genapic.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/dmi.h> +#include <asm/mach-bigsmp/mach_apic.h> +#include <asm/mach-bigsmp/mach_apicdef.h> +#include <asm/mach-bigsmp/mach_ipi.h> +#include <asm/mach-default/mach_mpparse.h> + +static int dmi_bigsmp; /* can be set by dmi scanners */ + +static __init int hp_ht_bigsmp(struct dmi_system_id *d) +{ +#ifdef CONFIG_X86_GENERICARCH + printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); + dmi_bigsmp = 1; +#endif + return 0; +} + + +static struct dmi_system_id __initdata bigsmp_dmi_table[] = { + { hp_ht_bigsmp, "HP ProLiant DL760 G2", { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P44-"), + }}, + + { hp_ht_bigsmp, "HP ProLiant DL740", { + DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P47-"), + }}, + { } +}; + + +static __init int probe_bigsmp(void) +{ + dmi_check_system(bigsmp_dmi_table); + return dmi_bigsmp; +} + +struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp); diff --git a/arch/i386/mach-generic/default.c b/arch/i386/mach-generic/default.c new file mode 100644 index 000000000000..7da14e9a79c3 --- /dev/null +++ b/arch/i386/mach-generic/default.c @@ -0,0 +1,27 @@ +/* + * Default generic APIC driver. This handles upto 8 CPUs. 
+ */ +#define APIC_DEFINITION 1 +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <asm/mpspec.h> +#include <asm/mach-default/mach_apicdef.h> +#include <asm/genapic.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <asm/mach-default/mach_apic.h> +#include <asm/mach-default/mach_ipi.h> +#include <asm/mach-default/mach_mpparse.h> + +/* should be called last. */ +static __init int probe_default(void) +{ + return 1; +} + +struct genapic apic_default = APIC_INIT("default", probe_default); diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c new file mode 100644 index 000000000000..48d3ec37241b --- /dev/null +++ b/arch/i386/mach-generic/es7000.c @@ -0,0 +1,28 @@ +/* + * APIC driver for the Unisys ES7000 chipset. + */ +#define APIC_DEFINITION 1 +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <asm/mpspec.h> +#include <asm/genapic.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <asm/mach-es7000/mach_apicdef.h> +#include <asm/mach-es7000/mach_apic.h> +#include <asm/mach-es7000/mach_ipi.h> +#include <asm/mach-es7000/mach_mpparse.h> +#include <asm/mach-es7000/mach_wakecpu.h> + +static __init int probe_es7000(void) +{ + /* probed later in mptable/ACPI hooks */ + return 0; +} + +struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c new file mode 100644 index 000000000000..5497c65a8790 --- /dev/null +++ b/arch/i386/mach-generic/probe.c @@ -0,0 +1,102 @@ +/* Copyright 2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, v.2 + * + * Generic x86 APIC driver probe layer. 
+ */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/genapic.h> + +extern struct genapic apic_summit; +extern struct genapic apic_bigsmp; +extern struct genapic apic_es7000; +extern struct genapic apic_default; + +struct genapic *genapic = &apic_default; + +struct genapic *apic_probe[] __initdata = { + &apic_summit, + &apic_bigsmp, + &apic_es7000, + &apic_default, /* must be last */ + NULL, +}; + +void __init generic_apic_probe(char *command_line) +{ + char *s; + int i; + int changed = 0; + + s = strstr(command_line, "apic="); + if (s && (s == command_line || isspace(s[-1]))) { + char *p = strchr(s, ' '), old; + if (!p) + p = strchr(s, '\0'); + old = *p; + *p = 0; + for (i = 0; !changed && apic_probe[i]; i++) { + if (!strcmp(apic_probe[i]->name, s+5)) { + changed = 1; + genapic = apic_probe[i]; + } + } + if (!changed) + printk(KERN_ERR "Unknown genapic `%s' specified.\n", s); + *p = old; + } + for (i = 0; !changed && apic_probe[i]; i++) { + if (apic_probe[i]->probe()) { + changed = 1; + genapic = apic_probe[i]; + } + } + /* Not visible without early console */ + if (!changed) + panic("Didn't find an APIC driver"); + + printk(KERN_INFO "Using APIC driver %s\n", genapic->name); +} + +/* These functions can switch the APIC even after the initial ->probe() */ + +int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) +{ + int i; + for (i = 0; apic_probe[i]; ++i) { + if (apic_probe[i]->mps_oem_check(mpc,oem,productid)) { + genapic = apic_probe[i]; + printk(KERN_INFO "Switched to APIC driver `%s'.\n", + genapic->name); + return 1; + } + } + return 0; +} + +int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + int i; + for (i = 0; apic_probe[i]; ++i) { + if (apic_probe[i]->acpi_madt_oem_check(oem_id, 
oem_table_id)) { + genapic = apic_probe[i]; + printk(KERN_INFO "Switched to APIC driver `%s'.\n", + genapic->name); + return 1; + } + } + return 0; +} + +int hard_smp_processor_id(void) +{ + return genapic->get_apic_id(*(unsigned long *)(APIC_BASE+APIC_ID)); +} diff --git a/arch/i386/mach-generic/summit.c b/arch/i386/mach-generic/summit.c new file mode 100644 index 000000000000..65ddf74d7f25 --- /dev/null +++ b/arch/i386/mach-generic/summit.c @@ -0,0 +1,27 @@ +/* + * APIC driver for the IBM "Summit" chipset. + */ +#define APIC_DEFINITION 1 +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <asm/mpspec.h> +#include <asm/genapic.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <asm/mach-summit/mach_apic.h> +#include <asm/mach-summit/mach_apicdef.h> +#include <asm/mach-summit/mach_ipi.h> +#include <asm/mach-summit/mach_mpparse.h> + +static __init int probe_summit(void) +{ + /* probed later in mptable/ACPI hooks */ + return 0; +} + +struct genapic apic_summit = APIC_INIT("summit", probe_summit); diff --git a/arch/i386/mach-visws/Makefile b/arch/i386/mach-visws/Makefile new file mode 100644 index 000000000000..835fd96ad768 --- /dev/null +++ b/arch/i386/mach-visws/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the linux kernel. 
+# + +obj-y := setup.o traps.o reboot.o + +obj-$(CONFIG_X86_VISWS_APIC) += visws_apic.o +obj-$(CONFIG_X86_LOCAL_APIC) += mpparse.o diff --git a/arch/i386/mach-visws/mpparse.c b/arch/i386/mach-visws/mpparse.c new file mode 100644 index 000000000000..5a22082147f4 --- /dev/null +++ b/arch/i386/mach-visws/mpparse.c @@ -0,0 +1,105 @@ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <asm/smp.h> +#include <asm/io.h> + +#include "cobalt.h" +#include "mach_apic.h" + +/* Have we found an MP table */ +int smp_found_config; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; + +int pic_mode; +unsigned long mp_lapic_addr; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +unsigned int boot_cpu_logical_apicid = -1U; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; + +unsigned int __initdata maxcpus = NR_CPUS; + +/* + * The Visual Workstation is Intel MP compliant in the hardware + * sense, but it doesn't have a BIOS(-configuration table). + * No problem for Linux. + */ + +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver, logical_apicid; + physid_mask_t apic_cpus; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + logical_apicid = m->mpc_apicid; + printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n", + m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver); + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + boot_cpu_physical_apicid = m->mpc_apicid; + boot_cpu_logical_apicid = logical_apicid; + } + + ver = m->mpc_apicver; + if ((ver >= 0x14 && m->mpc_apicid >= 0xff) || m->mpc_apicid >= 0xf) { + printk(KERN_ERR "Processor #%d INVALID. 
(Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + return; + } + + apic_cpus = apicid_to_cpu_present(m->mpc_apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; +} + +void __init find_smp_config(void) +{ + struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); + unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); + + if (ncpus > CO_CPU_MAX) { + printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", + ncpus, mp); + + ncpus = CO_CPU_MAX; + } + + if (ncpus > maxcpus) + ncpus = maxcpus; + + smp_found_config = 1; + while (ncpus--) + MP_processor_info(mp++); + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +} + +void __init get_smp_config (void) +{ +} diff --git a/arch/i386/mach-visws/reboot.c b/arch/i386/mach-visws/reboot.c new file mode 100644 index 000000000000..3a81e904a7b8 --- /dev/null +++ b/arch/i386/mach-visws/reboot.c @@ -0,0 +1,51 @@ +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/platform.h> + +#include <asm/io.h> +#include "piix4.h" + +void (*pm_power_off)(void); + +void machine_restart(char * __unused) +{ +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + + /* + * Visual Workstations restart after this + * register is poked on the PIIX4 + */ + outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); +} + +EXPORT_SYMBOL(machine_restart); + +void machine_power_off(void) +{ + unsigned short pm_status; + extern unsigned int pci_bus0; + + while ((pm_status = inw(PMSTS_PORT)) & 0x100) + outw(pm_status, PMSTS_PORT); + + outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); + + mdelay(10); + +#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ + (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) + + outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); + 
outl(PIIX_SPECIAL_STOP, 0xCFC); +} + +EXPORT_SYMBOL(machine_power_off); + +void machine_halt(void) +{ +} + +EXPORT_SYMBOL(machine_halt); diff --git a/arch/i386/mach-visws/setup.c b/arch/i386/mach-visws/setup.c new file mode 100644 index 000000000000..9f6d2d9b1be7 --- /dev/null +++ b/arch/i386/mach-visws/setup.c @@ -0,0 +1,134 @@ +/* + * Unmaintained SGI Visual Workstation support. + * Split out from setup.c by davej@suse.de + */ + +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/interrupt.h> + +#include <asm/fixmap.h> +#include <asm/arch_hooks.h> +#include <asm/io.h> +#include "cobalt.h" +#include "piix4.h" + +char visws_board_type = -1; +char visws_board_rev = -1; + +void __init visws_get_board_type_and_rev(void) +{ + int raw; + + visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) + >> PIIX_GPI_BD_SHIFT; + /* + * Get Board rev. + * First, we have to initialize the 307 part to allow us access + * to the GPIO registers. Let's map them at 0x0fc0 which is right + * after the PIIX4 PM section. + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable GPIO registers. */ + + /* + * Now, we have to map the power management section to write + * a bit which enables access to the GPIO registers. + * What lunatic came up with this shit? + */ + outb_p(SIO_DEV_SEL, SIO_INDEX); + outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ + + outb_p(SIO_DEV_MSB, SIO_INDEX); + outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ + + outb_p(SIO_DEV_LSB, SIO_INDEX); + outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ + + outb_p(SIO_DEV_ENB, SIO_INDEX); + outb_p(1, SIO_DATA); /* Enable PM registers. 
*/ + + /* + * Now, write the PM register which enables the GPIO registers. + */ + outb_p(SIO_PM_FER2, SIO_PM_INDEX); + outb_p(SIO_PM_GP_EN, SIO_PM_DATA); + + /* + * Now, initialize the GPIO registers. + * We want them all to be inputs which is the + * power on default, so let's leave them alone. + * So, let's just read the board rev! + */ + raw = inb_p(SIO_GP_DATA1); + raw &= 0x7f; /* 7 bits of valid board revision ID. */ + + if (visws_board_type == VISWS_320) { + if (raw < 0x6) { + visws_board_rev = 4; + } else if (raw < 0xc) { + visws_board_rev = 5; + } else { + visws_board_rev = 6; + } + } else if (visws_board_type == VISWS_540) { + visws_board_rev = 2; + } else { + visws_board_rev = raw; + } + + printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", + (visws_board_type == VISWS_320 ? "320" : + (visws_board_type == VISWS_540 ? "540" : + "unknown")), visws_board_rev); +} + +void __init pre_intr_init_hook(void) +{ + init_VISWS_APIC_irqs(); +} + +void __init intr_init_hook(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + apic_intr_init(); +#endif +} + +void __init pre_setup_arch_hook() +{ + visws_get_board_type_and_rev(); +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = SA_INTERRUPT, + .name = "timer", +}; + +void __init time_init_hook(void) +{ + printk(KERN_INFO "Starting Cobalt Timer system clock\n"); + + /* Set the countdown value */ + co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); + + /* Start the timer */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); + + /* Enable (unmask) the timer interrupt */ + co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); + + /* Wire cpu IDT entry to s/w handler (and Cobalt APIC to IDT) */ + setup_irq(0, &irq0); +} diff --git a/arch/i386/mach-visws/traps.c b/arch/i386/mach-visws/traps.c new file mode 100644 index 000000000000..964353992031 --- /dev/null +++ b/arch/i386/mach-visws/traps.c @@ -0,0 +1,69 @@ +/* VISWS traps */ + +#include 
<linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/pci_ids.h> + +#include <asm/io.h> +#include <asm/arch_hooks.h> +#include <asm/apic.h> +#include "cobalt.h" +#include "lithium.h" + + +#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) +#define BCD (LI_INTB | LI_INTC | LI_INTD) +#define ALLDEVS (A01234 | BCD) + +static __init void lithium_init(void) +{ + set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); + set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); + + if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcia_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); + panic("This machine is not SGI Visual Workstation 320/540"); + } + + if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || + (li_pcib_read16(PCI_DEVICE_ID) != PCI_VENDOR_ID_SGI_LITHIUM)) { + printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); + panic("This machine is not SGI Visual Workstation 320/540"); + } + + li_pcia_write16(LI_PCI_INTEN, ALLDEVS); + li_pcib_write16(LI_PCI_INTEN, ALLDEVS); +} + +static __init void cobalt_init(void) +{ + /* + * On normal SMP PC this is used only with SMP, but we have to + * use it and set it up here to start the Cobalt clock + */ + set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); + setup_local_APIC(); + printk(KERN_INFO "Local APIC Version %#lx, ID %#lx\n", + apic_read(APIC_LVR), apic_read(APIC_ID)); + + set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); + set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); + printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", + co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); + + /* Enable Cobalt APIC being careful to NOT change the ID! 
*/ + co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); + + printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", + co_apic_read(CO_APIC_ID)); +} + +void __init trap_init_hook(void) +{ + lithium_init(); + cobalt_init(); +} diff --git a/arch/i386/mach-visws/visws_apic.c b/arch/i386/mach-visws/visws_apic.c new file mode 100644 index 000000000000..04e6585849a2 --- /dev/null +++ b/arch/i386/mach-visws/visws_apic.c @@ -0,0 +1,303 @@ +/* + * linux/arch/i386/mach_visws/visws_apic.c + * + * Copyright (C) 1999 Bent Hagemark, Ingo Molnar + * + * SGI Visual Workstation interrupt controller + * + * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC + * which serves as the main interrupt controller in the system. Non-legacy + * hardware in the system uses this controller directly. Legacy devices + * are connected to the PIIX4 which in turn has its 8259(s) connected to + * a of the Cobalt APIC entry. + * + * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com + * + * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> + */ + +#include <linux/config.h> +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/smp_lock.h> +#include <linux/init.h> + +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/i8259.h> + +#include "cobalt.h" +#include "irq_vectors.h" + + +static DEFINE_SPINLOCK(cobalt_lock); + +/* + * Set the given Cobalt APIC Redirection Table entry to point + * to the given IDT vector/index. + */ +static inline void co_apic_set(int entry, int irq) +{ + co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); + co_apic_write(CO_APIC_HI(entry), 0); +} + +/* + * Cobalt (IO)-APIC functions to handle PCI devices. 
+ */ +static inline int co_apic_ide0_hack(void) +{ + extern char visws_board_type; + extern char visws_board_rev; + + if (visws_board_type == VISWS_320 && visws_board_rev == 5) + return 5; + return CO_APIC_IDE0; +} + +static int is_co_apic(unsigned int irq) +{ + if (IS_CO_APIC(irq)) + return CO_APIC(irq); + + switch (irq) { + case 0: return CO_APIC_CPU; + case CO_IRQ_IDE0: return co_apic_ide0_hack(); + case CO_IRQ_IDE1: return CO_APIC_IDE1; + default: return -1; + } +} + + +/* + * This is the SGI Cobalt (IO-)APIC: + */ + +static void enable_cobalt_irq(unsigned int irq) +{ + co_apic_set(is_co_apic(irq), irq); +} + +static void disable_cobalt_irq(unsigned int irq) +{ + int entry = is_co_apic(irq); + + co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); + co_apic_read(CO_APIC_LO(entry)); +} + +/* + * "irq" really just serves to identify the device. Here is where we + * map this to the Cobalt APIC entry where it's physically wired. + * This is called via request_irq -> setup_irq -> irq_desc->startup() + */ +static unsigned int startup_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); + return 0; +} + +static void ack_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + disable_cobalt_irq(irq); + apic_write(APIC_EOI, APIC_EIO_ACK); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static void end_cobalt_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct hw_interrupt_type cobalt_irq_type = { + .typename = "Cobalt-APIC", + .startup = startup_cobalt_irq, + 
.shutdown = disable_cobalt_irq, + .enable = enable_cobalt_irq, + .disable = disable_cobalt_irq, + .ack = ack_cobalt_irq, + .end = end_cobalt_irq, +}; + + +/* + * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt + * -- not the manner expected by the code in i8259.c. + * + * there is a 'master' physical interrupt source that gets sent to + * the CPU. But in the chipset there are various 'virtual' interrupts + * waiting to be handled. We represent this to Linux through a 'master' + * interrupt controller type, and through a special virtual interrupt- + * controller. Device drivers only see the virtual interrupt sources. + */ +static unsigned int startup_piix4_master_irq(unsigned int irq) +{ + init_8259A(0); + + return startup_cobalt_irq(irq); +} + +static void end_piix4_master_irq(unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&cobalt_lock, flags); + enable_cobalt_irq(irq); + spin_unlock_irqrestore(&cobalt_lock, flags); +} + +static struct hw_interrupt_type piix4_master_irq_type = { + .typename = "PIIX4-master", + .startup = startup_piix4_master_irq, + .ack = ack_cobalt_irq, + .end = end_piix4_master_irq, +}; + + +static struct hw_interrupt_type piix4_virtual_irq_type = { + .typename = "PIIX4-virtual", + .startup = startup_8259A_irq, + .shutdown = disable_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, +}; + + +/* + * PIIX4-8259 master/virtual functions to handle interrupt requests + * from legacy devices: floppy, parallel, serial, rtc. + * + * None of these get Cobalt APIC entries, neither do they have IDT + * entries. These interrupts are purely virtual and distributed from + * the 'master' interrupt source: CO_IRQ_8259. + * + * When the 8259 interrupts its handler figures out which of these + * devices is interrupting and dispatches to its handler. + * + * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ + * enable_irq gets the right irq. 
This 'master' irq is never directly + * manipulated by any driver. + */ +static irqreturn_t piix4_master_intr(int irq, void *dev_id, struct pt_regs * regs) +{ + int realirq; + irq_desc_t *desc; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + /* Find out what's interrupting in the PIIX4 master 8259 */ + outb(0x0c, 0x20); /* OCW3 Poll command */ + realirq = inb(0x20); + + /* + * Bit 7 == 0 means invalid/spurious + */ + if (unlikely(!(realirq & 0x80))) + goto out_unlock; + + realirq &= 7; + + if (unlikely(realirq == 2)) { + outb(0x0c, 0xa0); + realirq = inb(0xa0); + + if (unlikely(!(realirq & 0x80))) + goto out_unlock; + + realirq = (realirq & 7) + 8; + } + + /* mask and ack interrupt */ + cached_irq_mask |= 1 << realirq; + if (unlikely(realirq > 7)) { + inb(0xa1); + outb(cached_slave_mask, 0xa1); + outb(0x60 + (realirq & 7), 0xa0); + outb(0x60 + 2, 0x20); + } else { + inb(0x21); + outb(cached_master_mask, 0x21); + outb(0x60 + realirq, 0x20); + } + + spin_unlock_irqrestore(&i8259A_lock, flags); + + desc = irq_desc + realirq; + + /* + * handle this 'virtual interrupt' as a Cobalt one now. 
+ */ + kstat_cpu(smp_processor_id()).irqs[realirq]++; + + if (likely(desc->action != NULL)) + handle_IRQ_event(realirq, regs, desc->action); + + if (!(desc->status & IRQ_DISABLED)) + enable_8259A_irq(realirq); + + return IRQ_HANDLED; + +out_unlock: + spin_unlock_irqrestore(&i8259A_lock, flags); + return IRQ_NONE; +} + +static struct irqaction master_action = { + .handler = piix4_master_intr, + .name = "PIIX4-8259", +}; + +static struct irqaction cascade_action = { + .handler = no_action, + .name = "cascade", +}; + + +void init_VISWS_APIC_irqs(void) +{ + int i; + + for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = 0; + irq_desc[i].depth = 1; + + if (i == 0) { + irq_desc[i].handler = &cobalt_irq_type; + } + else if (i == CO_IRQ_IDE0) { + irq_desc[i].handler = &cobalt_irq_type; + } + else if (i == CO_IRQ_IDE1) { + irq_desc[i].handler = &cobalt_irq_type; + } + else if (i == CO_IRQ_8259) { + irq_desc[i].handler = &piix4_master_irq_type; + } + else if (i < CO_IRQ_APIC0) { + irq_desc[i].handler = &piix4_virtual_irq_type; + } + else if (IS_CO_APIC(i)) { + irq_desc[i].handler = &cobalt_irq_type; + } + } + + setup_irq(CO_IRQ_8259, &master_action); + setup_irq(2, &cascade_action); +} diff --git a/arch/i386/mach-voyager/Makefile b/arch/i386/mach-voyager/Makefile new file mode 100644 index 000000000000..f24d29651318 --- /dev/null +++ b/arch/i386/mach-voyager/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the linux kernel. 
+# + +EXTRA_CFLAGS += -I../kernel +obj-y := setup.o voyager_basic.o voyager_thread.o + +obj-$(CONFIG_SMP) += voyager_smp.o voyager_cat.o diff --git a/arch/i386/mach-voyager/setup.c b/arch/i386/mach-voyager/setup.c new file mode 100644 index 000000000000..df123fc487bb --- /dev/null +++ b/arch/i386/mach-voyager/setup.c @@ -0,0 +1,48 @@ +/* + * Machine specific setup for generic + */ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <asm/acpi.h> +#include <asm/arch_hooks.h> + +void __init pre_intr_init_hook(void) +{ + init_ISA_irqs(); +} + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; + +void __init intr_init_hook(void) +{ +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} + +void __init pre_setup_arch_hook(void) +{ + /* Voyagers run their CPUs from independent clocks, so disable + * the TSC code because we can't sync them */ + tsc_disable = 1; +} + +void __init trap_init_hook(void) +{ +} + +static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; + +void __init time_init_hook(void) +{ + setup_irq(0, &irq0); +} diff --git a/arch/i386/mach-voyager/voyager_basic.c b/arch/i386/mach-voyager/voyager_basic.c new file mode 100644 index 000000000000..602aea240e9b --- /dev/null +++ b/arch/i386/mach-voyager/voyager_basic.c @@ -0,0 +1,325 @@ +/* Copyright (C) 1999,2001 + * + * Author: J.E.J.Bottomley@HansenPartnership.com + * + * linux/arch/i386/kernel/voyager.c + * + * This file contains all the voyager specific routines for getting + * initialisation of the architecture to function. 
For additional + * features see: + * + * voyager_cat.c - Voyager CAT bus interface + * voyager_smp.c - Voyager SMP hal (emulates linux smp.c) + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> +#include <asm/io.h> +#include <asm/voyager.h> +#include <asm/vic.h> +#include <linux/pm.h> +#include <linux/irq.h> +#include <asm/tlbflush.h> +#include <asm/arch_hooks.h> + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); + +int voyager_level = 0; + +struct voyager_SUS *voyager_SUS = NULL; + +#ifdef CONFIG_SMP +static void +voyager_dump(int dummy1, struct pt_regs *dummy2, struct tty_struct *dummy3) +{ + /* get here via a sysrq */ + voyager_smp_dump(); +} + +static struct sysrq_key_op sysrq_voyager_dump_op = { + .handler = voyager_dump, + .help_msg = "Voyager", + .action_msg = "Dump Voyager Status", +}; +#endif + +void +voyager_detect(struct voyager_bios_info *bios) +{ + if(bios->len != 0xff) { + int class = (bios->class_1 << 8) + | (bios->class_2 & 0xff); + + printk("Voyager System detected.\n" + " Class %x, Revision %d.%d\n", + class, bios->major, bios->minor); + if(class == VOYAGER_LEVEL4) + voyager_level = 4; + else if(class < VOYAGER_LEVEL5_AND_ABOVE) + voyager_level = 3; + else + voyager_level = 5; + printk(" Architecture Level %d\n", voyager_level); + if(voyager_level < 4) + printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); + /* install the power off handler */ + pm_power_off = voyager_power_off; +#ifdef CONFIG_SMP + register_sysrq_key('v', &sysrq_voyager_dump_op); +#endif + } else { + printk("\n\n**WARNING**: No Voyager Subsystem Found\n"); + } +} + +void +voyager_system_interrupt(int cpl, void *dev_id, struct pt_regs *regs) +{ + printk("Voyager: detected 
system interrupt\n"); +} + +/* Routine to read information from the extended CMOS area */ +__u8 +voyager_extended_cmos_read(__u16 addr) +{ + outb(addr & 0xff, 0x74); + outb((addr >> 8) & 0xff, 0x75); + return inb(0x76); +} + +/* internal definitions for the SUS Click Map of memory */ + +#define CLICK_ENTRIES 16 +#define CLICK_SIZE 4096 /* click to byte conversion for Length */ + +typedef struct ClickMap { + struct Entry { + __u32 Address; + __u32 Length; + } Entry[CLICK_ENTRIES]; +} ClickMap_t; + + +/* This routine is pretty much an awful hack to read the bios clickmap by + * mapping it into page 0. There are usually three regions in the map: + * Base Memory + * Extended Memory + * zero length marker for end of map + * + * Returns are 0 for failure and 1 for success on extracting region. + */ +int __init +voyager_memory_detect(int region, __u32 *start, __u32 *length) +{ + int i; + int retval = 0; + __u8 cmos[4]; + ClickMap_t *map; + unsigned long map_addr; + unsigned long old; + + if(region >= CLICK_ENTRIES) { + printk("Voyager: Illegal ClickMap region %d\n", region); + return 0; + } + + for(i = 0; i < sizeof(cmos); i++) + cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); + + map_addr = *(unsigned long *)cmos; + + /* steal page 0 for this */ + old = pg0[0]; + pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); + local_flush_tlb(); + /* now clear everything out but page 0 */ + map = (ClickMap_t *)(map_addr & (~PAGE_MASK)); + + /* zero length is the end of the clickmap */ + if(map->Entry[region].Length != 0) { + *length = map->Entry[region].Length * CLICK_SIZE; + *start = map->Entry[region].Address; + retval = 1; + } + + /* replace the mapping */ + pg0[0] = old; + local_flush_tlb(); + return retval; +} + +/* voyager specific handling code for timer interrupts. Used to hand + * off the timer tick to the SMP code, since the VIC doesn't have an + * internal timer (The QIC does, but that's another story). 
*/ +void +voyager_timer_interrupt(struct pt_regs *regs) +{ + if((jiffies & 0x3ff) == 0) { + + /* There seems to be something flaky in either + * hardware or software that is resetting the timer 0 + * count to something much higher than it should be + * This seems to occur in the boot sequence, just + * before root is mounted. Therefore, every 10 + * seconds or so, we sanity check the timer zero count + * and kick it back to where it should be. + * + * FIXME: This is the most awful hack yet seen. I + * should work out exactly what is interfering with + * the timer count settings early in the boot sequence + * and swiftly introduce it to something sharp and + * pointy. */ + __u16 val; + extern spinlock_t i8253_lock; + + spin_lock(&i8253_lock); + + outb_p(0x00, 0x43); + val = inb_p(0x40); + val |= inb(0x40) << 8; + spin_unlock(&i8253_lock); + + if(val > LATCH) { + printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val); + spin_lock(&i8253_lock); + outb(0x34,0x43); + outb_p(LATCH & 0xff , 0x40); /* LSB */ + outb(LATCH >> 8 , 0x40); /* MSB */ + spin_unlock(&i8253_lock); + } + } +#ifdef CONFIG_SMP + smp_vic_timer_interrupt(regs); +#endif +} + +void +voyager_power_off(void) +{ + printk("VOYAGER Power Off\n"); + + if(voyager_level == 5) { + voyager_cat_power_off(); + } else if(voyager_level == 4) { + /* This doesn't apparently work on most L4 machines, + * but the specs say to do this to get automatic power + * off. 
Unfortunately, if it doesn't power off the + * machine, it ends up doing a cold restart, which + * isn't really intended, so comment out the code */ +#if 0 + int port; + + + /* enable the voyager Configuration Space */ + outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, + VOYAGER_MC_SETUP); + /* the port for the power off flag is an offset from the + floating base */ + port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21; + /* set the power off flag */ + outb(inb(port) | 0x1, port); +#endif + } + /* and wait for it to happen */ + for(;;) { + __asm("cli"); + __asm("hlt"); + } +} + +/* copied from process.c */ +static inline void +kb_wait(void) +{ + int i; + + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; +} + +void +machine_restart(char *cmd) +{ + printk("Voyager Warm Restart\n"); + kb_wait(); + + if(voyager_level == 5) { + /* write magic values to the RTC to inform system that + * shutdown is beginning */ + outb(0x8f, 0x70); + outb(0x5 , 0x71); + + udelay(50); + outb(0xfe,0x64); /* pull reset low */ + } else if(voyager_level == 4) { + __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; + __u8 basebd = inb(VOYAGER_MC_SETUP); + + outb(basebd | 0x08, VOYAGER_MC_SETUP); + outb(0x02, catbase + 0x21); + } + for(;;) { + asm("cli"); + asm("hlt"); + } +} + +EXPORT_SYMBOL(machine_restart); + +void +mca_nmi_hook(void) +{ + __u8 dumpval __attribute__((unused)) = inb(0xf823); + __u8 swnmi __attribute__((unused)) = inb(0xf813); + + /* FIXME: assume dump switch pressed */ + /* check to see if the dump switch was pressed */ + VDEBUG(("VOYAGER: dumpval = 0x%x, swnmi = 0x%x\n", dumpval, swnmi)); + /* clear swnmi */ + outb(0xff, 0xf813); + /* tell SUS to ignore dump */ + if(voyager_level == 5 && voyager_SUS != NULL) { + if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { + voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND; + voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS; + udelay(1000); + voyager_SUS->kernel_mbox = VOYAGER_IGNORE_DUMP; + 
voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS; + } + } + printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id()); + show_stack(NULL, NULL); + show_state(); +} + + + +void +machine_halt(void) +{ + /* treat a halt like a power off */ + machine_power_off(); +} + +EXPORT_SYMBOL(machine_halt); + +void machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); +} + +EXPORT_SYMBOL(machine_power_off); diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c new file mode 100644 index 000000000000..23967fe658d3 --- /dev/null +++ b/arch/i386/mach-voyager/voyager_cat.c @@ -0,0 +1,1178 @@ +/* -*- mode: c; c-basic-offset: 8 -*- */ + +/* Copyright (C) 1999,2001 + * + * Author: J.E.J.Bottomley@HansenPartnership.com + * + * linux/arch/i386/kernel/voyager_cat.c + * + * This file contains all the logic for manipulating the CAT bus + * in a level 5 machine. + * + * The CAT bus is a serial configuration and test bus. Its primary + * uses are to probe the initial configuration of the system and to + * diagnose error conditions when a system interrupt occurs. 
The low + * level interface is fairly primitive, so most of this file consists + * of bit shift manipulations to send and receive packets on the + * serial bus */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/completion.h> +#include <linux/sched.h> +#include <asm/voyager.h> +#include <asm/vic.h> +#include <linux/ioport.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <asm/io.h> + +#ifdef VOYAGER_CAT_DEBUG +#define CDEBUG(x) printk x +#else +#define CDEBUG(x) +#endif + +/* the CAT command port */ +#define CAT_CMD (sspb + 0xe) +/* the CAT data port */ +#define CAT_DATA (sspb + 0xd) + +/* the internal cat functions */ +static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data, + __u16 num_bits); +static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data, + __u16 num_bits); +static void cat_build_header(__u8 *header, const __u16 len, + const __u16 smallest_reg_bits, + const __u16 longest_reg_bits); +static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, + __u8 reg, __u8 op); +static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, + __u8 reg, __u8 *value); +static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, + __u8 pad_bits); +static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, + __u8 value); +static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, + __u8 *value); +static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, + __u16 offset, __u16 len, void *buf); +static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, + __u8 reg, __u8 value); +static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp); +static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp); + +static inline const char * +cat_module_name(int module_id) +{ + switch(module_id) { + case 0x10: + return "Processor Slot 0"; + case 0x11: + return "Processor Slot 1"; + case 0x12: + return 
"Processor Slot 2"; + case 0x13: + return "Processor Slot 4"; + case 0x14: + return "Memory Slot 0"; + case 0x15: + return "Memory Slot 1"; + case 0x18: + return "Primary Microchannel"; + case 0x19: + return "Secondary Microchannel"; + case 0x1a: + return "Power Supply Interface"; + case 0x1c: + return "Processor Slot 5"; + case 0x1d: + return "Processor Slot 6"; + case 0x1e: + return "Processor Slot 7"; + case 0x1f: + return "Processor Slot 8"; + default: + return "Unknown Module"; + } +} + +static int sspb = 0; /* stores the super port location */ +int voyager_8slot = 0; /* set to true if a 51xx monster */ + +voyager_module_t *voyager_cat_list; + +/* the I/O port assignments for the VIC and QIC */ +static struct resource vic_res = { + "Voyager Interrupt Controller", 0xFC00, 0xFC6F }; +static struct resource qic_res = { + "Quad Interrupt Controller", 0xFC70, 0xFCFF }; + +/* This function is used to pack a data bit stream inside a message. + * It writes num_bits of the data buffer in msg starting at start_bit. + * Note: This function assumes that any unused bit in the data stream + * is set to zero so that the ors will work correctly */ +#define BITS_PER_BYTE 8 +static void +cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) +{ + /* compute initial shift needed */ + const __u16 offset = start_bit % BITS_PER_BYTE; + __u16 len = num_bits / BITS_PER_BYTE; + __u16 byte = start_bit / BITS_PER_BYTE; + __u16 residue = (num_bits % BITS_PER_BYTE) + offset; + int i; + + /* adjust if we have more than a byte of residue */ + if(residue >= BITS_PER_BYTE) { + residue -= BITS_PER_BYTE; + len++; + } + + /* clear out the bits. We assume here that if len==0 then + * residue >= offset. 
This is always true for the catbus + * operations */ + msg[byte] &= 0xff << (BITS_PER_BYTE - offset); + msg[byte++] |= data[0] >> offset; + if(len == 0) + return; + for(i = 1; i < len; i++) + msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset)) + | (data[i] >> offset); + if(residue != 0) { + __u8 mask = 0xff >> residue; + __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset) + | (data[i] >> offset); + + last_byte &= ~mask; + msg[byte] &= mask; + msg[byte] |= last_byte; + } + return; +} +/* unpack the data again (same arguments as cat_pack()). data buffer + * must be zero populated. + * + * Function: given a message string move to start_bit and copy num_bits into + * data (starting at bit 0 in data). + */ +static void +cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) +{ + /* compute initial shift needed */ + const __u16 offset = start_bit % BITS_PER_BYTE; + __u16 len = num_bits / BITS_PER_BYTE; + const __u8 last_bits = num_bits % BITS_PER_BYTE; + __u16 byte = start_bit / BITS_PER_BYTE; + int i; + + if(last_bits != 0) + len++; + + /* special case: want < 8 bits from msg and we can get it from + * a single byte of the msg */ + if(len == 0 && BITS_PER_BYTE - offset >= num_bits) { + data[0] = msg[byte] << offset; + data[0] &= 0xff >> (BITS_PER_BYTE - num_bits); + return; + } + for(i = 0; i < len; i++) { + /* this annoying if has to be done just in case a read of + * msg one beyond the array causes a panic */ + if(offset != 0) { + data[i] = msg[byte++] << offset; + data[i] |= msg[byte] >> (BITS_PER_BYTE - offset); + } + else { + data[i] = msg[byte++]; + } + } + /* do we need to truncate the final byte */ + if(last_bits != 0) { + data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits); + } + return; +} + +static void +cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits, + const __u16 longest_reg_bits) +{ + int i; + __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE; + __u8 *last_byte = &header[len - 1]; + + 
if(start_bit == 0) + start_bit = 1; /* must have at least one bit in the hdr */ + + for(i=0; i < len; i++) + header[i] = 0; + + for(i = start_bit; i > 0; i--) + *last_byte = ((*last_byte) << 1) + 1; + +} + +static int +cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) +{ + __u8 parity, inst, inst_buf[4] = { 0 }; + __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE]; + __u16 ibytes, hbytes, padbits; + int i; + + /* + * Parity is the parity of the register number + 1 (READ_REGISTER + * and WRITE_REGISTER always add '1' to the number of bits == 1) + */ + parity = (__u8)(1 + (reg & 0x01) + + ((__u8)(reg & 0x02) >> 1) + + ((__u8)(reg & 0x04) >> 2) + + ((__u8)(reg & 0x08) >> 3)) % 2; + + inst = ((parity << 7) | (reg << 2) | op); + + outb(VOYAGER_CAT_IRCYC, CAT_CMD); + if(!modp->scan_path_connected) { + if(asicp->asic_id != VOYAGER_CAT_ID) { + printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); + return 1; + } + outb(VOYAGER_CAT_HEADER, CAT_DATA); + outb(inst, CAT_DATA); + if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n")); + return 1; + } + return 0; + } + ibytes = modp->inst_bits / BITS_PER_BYTE; + if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { + padbits = BITS_PER_BYTE - padbits; + ibytes++; + } + hbytes = modp->largest_reg / BITS_PER_BYTE; + if(modp->largest_reg % BITS_PER_BYTE) + hbytes++; + CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes)); + /* initialise the instruction sequence to 0xff */ + for(i=0; i < ibytes + hbytes; i++) + iseq[i] = 0xff; + cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg); + cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE); + inst_buf[0] = inst; + inst_buf[1] = 0xFF >> (modp->largest_reg % BITS_PER_BYTE); + cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length); +#ifdef VOYAGER_CAT_DEBUG + printk("ins = 0x%x, iseq: ", inst); + for(i=0; i< ibytes + 
hbytes; i++) + printk("0x%x ", iseq[i]); + printk("\n"); +#endif + if(cat_shiftout(iseq, ibytes, hbytes, padbits)) { + CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n")); + return 1; + } + CDEBUG(("CAT SHIFTOUT DONE\n")); + return 0; +} + +static int +cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, + __u8 *value) +{ + if(!modp->scan_path_connected) { + if(asicp->asic_id != VOYAGER_CAT_ID) { + CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n")); + return 1; + } + if(reg > VOYAGER_SUBADDRHI) + outb(VOYAGER_CAT_RUN, CAT_CMD); + outb(VOYAGER_CAT_DRCYC, CAT_CMD); + outb(VOYAGER_CAT_HEADER, CAT_DATA); + *value = inb(CAT_DATA); + outb(0xAA, CAT_DATA); + if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n")); + return 1; + } + return 0; + } + else { + __u16 sbits = modp->num_asics -1 + asicp->ireg_length; + __u16 sbytes = sbits / BITS_PER_BYTE; + __u16 tbytes; + __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE]; + __u8 padbits; + int i; + + outb(VOYAGER_CAT_DRCYC, CAT_CMD); + + if((padbits = sbits % BITS_PER_BYTE) != 0) { + padbits = BITS_PER_BYTE - padbits; + sbytes++; + } + tbytes = asicp->ireg_length / BITS_PER_BYTE; + if(asicp->ireg_length % BITS_PER_BYTE) + tbytes++; + CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n", + tbytes, sbytes, padbits)); + cat_build_header(trailer, tbytes, 1, asicp->ireg_length); + + + for(i = tbytes - 1; i >= 0; i--) { + outb(trailer[i], CAT_DATA); + string[sbytes + i] = inb(CAT_DATA); + } + + for(i = sbytes - 1; i >= 0; i--) { + outb(0xaa, CAT_DATA); + string[i] = inb(CAT_DATA); + } + *value = 0; + cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length); +#ifdef VOYAGER_CAT_DEBUG + printk("value=0x%x, string: ", *value); + for(i=0; i< tbytes+sbytes; i++) + printk("0x%x ", string[i]); + printk("\n"); +#endif + + /* sanity check the rest of the return */ 
+ for(i=0; i < tbytes; i++) { + __u8 input = 0; + + cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE); + if(trailer[i] != input) { + CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i])); + return 1; + } + } + CDEBUG(("cat_getdata DONE\n")); + return 0; + } +} + +static int +cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) +{ + int i; + + for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--) + outb(data[i], CAT_DATA); + + for(i = header_bytes - 1; i >= 0; i--) { + __u8 header = 0; + __u8 input; + + outb(data[i], CAT_DATA); + input = inb(CAT_DATA); + CDEBUG(("cat_shiftout: returned 0x%x\n", input)); + cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits, + &header, BITS_PER_BYTE); + if(input != header) { + CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header)); + return 1; + } + } + return 0; +} + +static int +cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, + __u8 reg, __u8 value) +{ + outb(VOYAGER_CAT_DRCYC, CAT_CMD); + if(!modp->scan_path_connected) { + if(asicp->asic_id != VOYAGER_CAT_ID) { + CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n")); + return 1; + } + outb(VOYAGER_CAT_HEADER, CAT_DATA); + outb(value, CAT_DATA); + if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + CDEBUG(("cat_senddata: failed to get correct header response to sent data\n")); + return 1; + } + if(reg > VOYAGER_SUBADDRHI) { + outb(VOYAGER_CAT_RUN, CAT_CMD); + outb(VOYAGER_CAT_END, CAT_CMD); + outb(VOYAGER_CAT_RUN, CAT_CMD); + } + + return 0; + } + else { + __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE; + __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE; + __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], + hseq[VOYAGER_MAX_REG_SIZE]; + int i; + + if((padbits = (modp->num_asics - 1 + + asicp->ireg_length) % BITS_PER_BYTE) != 0) { + padbits = BITS_PER_BYTE - padbits; + dbytes++; + } + 
if(asicp->ireg_length % BITS_PER_BYTE) + hbytes++; + + cat_build_header(hseq, hbytes, 1, asicp->ireg_length); + + for(i = 0; i < dbytes + hbytes; i++) + dseq[i] = 0xff; + CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n", + dbytes, hbytes, padbits)); + cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length, + hseq, hbytes * BITS_PER_BYTE); + cat_pack(dseq, asicp->asic_location, &value, + asicp->ireg_length); +#ifdef VOYAGER_CAT_DEBUG + printk("dseq "); + for(i=0; i<hbytes+dbytes; i++) { + printk("0x%x ", dseq[i]); + } + printk("\n"); +#endif + return cat_shiftout(dseq, dbytes, hbytes, padbits); + } +} + +static int +cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, + __u8 value) +{ + if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) + return 1; + return cat_senddata(modp, asicp, reg, value); +} + +static int +cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, + __u8 *value) +{ + if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) + return 1; + return cat_getdata(modp, asicp, reg, value); +} + +static int +cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, + __u16 len) +{ + __u8 val; + + if(len > 1) { + /* set auto increment */ + __u8 newval; + + if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { + CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n")); + return 1; + } + CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val)); + newval = val | VOYAGER_AUTO_INC; + if(newval != val) { + if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { + CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n")); + return 1; + } + } + } + if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset &0xff))) { + CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n")); + return 1; + } + if(asicp->subaddr > VOYAGER_SUBADDR_LO) { + if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) { + CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n")); + return 
1; + } + cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val); + CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val)); + } + cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val); + CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val)); + return 0; +} + +static int +cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, + __u16 len, void *buf) +{ + int i, retval; + + /* FIXME: need special actions for VOYAGER_CAT_ID here */ + if(asicp->asic_id == VOYAGER_CAT_ID) { + CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n")); + /* FIXME -- This is supposed to be handled better + * There is a problem writing to the cat asic in the + * PSI. The 30us delay seems to work, though */ + udelay(30); + } + + if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { + printk("cat_subwrite: cat_subaddrsetup FAILED\n"); + return retval; + } + + if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { + printk("cat_subwrite: cat_sendinst FAILED\n"); + return 1; + } + for(i = 0; i < len; i++) { + if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) { + printk("cat_subwrite: cat_sendata element at %d FAILED\n", i); + return 1; + } + } + return 0; +} +static int +cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, + __u16 len, void *buf) +{ + int i, retval; + + if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { + CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n")); + return retval; + } + + if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { + CDEBUG(("cat_subread: cat_sendinst failed\n")); + return 1; + } + for(i = 0; i < len; i++) { + if(cat_getdata(modp, asicp, 0xFF, + &((__u8 *)buf)[i])) { + CDEBUG(("cat_subread: cat_getdata element %d failed\n", i)); + return 1; + } + } + return 0; +} + + +/* buffer for storing EPROM data read in during initialisation */ +static __initdata __u8 eprom_buf[0xFFFF]; +static voyager_module_t *voyager_initial_module; + +/* Initialise the 
cat bus components. We assume this is called by the + * boot cpu *after* all memory initialisation has been done (so we can + * use kmalloc) but before smp initialisation, so we can probe the SMP + * configuration and pick up necessary information. */ +void +voyager_cat_init(void) +{ + voyager_module_t **modpp = &voyager_initial_module; + voyager_asic_t **asicpp; + voyager_asic_t *qabc_asic = NULL; + int i, j; + unsigned long qic_addr = 0; + __u8 qabc_data[0x20]; + __u8 num_submodules, val; + voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0]; + + __u8 cmos[4]; + unsigned long addr; + + /* initiallise the SUS mailbox */ + for(i=0; i<sizeof(cmos); i++) + cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i); + addr = *(unsigned long *)cmos; + if((addr & 0xff000000) != 0xff000000) { + printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx\n", addr); + } else { + static struct resource res; + + res.name = "voyager SUS"; + res.start = addr; + res.end = addr+0x3ff; + + request_resource(&iomem_resource, &res); + voyager_SUS = (struct voyager_SUS *) + ioremap(addr, 0x400); + printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n", + voyager_SUS->SUS_version); + voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION; + voyager_SUS->kernel_flags = VOYAGER_OS_HAS_SYSINT; + } + + /* clear the processor counts */ + voyager_extended_vic_processors = 0; + voyager_quad_processors = 0; + + + + printk("VOYAGER: beginning CAT bus probe\n"); + /* set up the SuperSet Port Block which tells us where the + * CAT communication port is */ + sspb = inb(VOYAGER_SSPB_RELOCATION_PORT) * 0x100; + VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb)); + + /* now find out if were 8 slot or normal */ + if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) + == EIGHT_SLOT_IDENTIFIER) { + voyager_8slot = 1; + printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n"); + } + + for(i = VOYAGER_MIN_MODULE; + i <= VOYAGER_MAX_MODULE; i++) { + __u8 input; + int 
asic; + __u16 eprom_size; + __u16 sp_offset; + + outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); + outb(i, VOYAGER_CAT_CONFIG_PORT); + + /* check the presence of the module */ + outb(VOYAGER_CAT_RUN, CAT_CMD); + outb(VOYAGER_CAT_IRCYC, CAT_CMD); + outb(VOYAGER_CAT_HEADER, CAT_DATA); + /* stream series of alternating 1's and 0's to stimulate + * response */ + outb(0xAA, CAT_DATA); + input = inb(CAT_DATA); + outb(VOYAGER_CAT_END, CAT_CMD); + if(input != VOYAGER_CAT_HEADER) { + continue; + } + CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i, + cat_module_name(i))); + *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/ + if(*modpp == NULL) { + printk("**WARNING** kmalloc failure in cat_init\n"); + continue; + } + memset(*modpp, 0, sizeof(voyager_module_t)); + /* need temporary asic for cat_subread. It will be + * filled in correctly later */ + (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/ + if((*modpp)->asic == NULL) { + printk("**WARNING** kmalloc failure in cat_init\n"); + continue; + } + memset((*modpp)->asic, 0, sizeof(voyager_asic_t)); + (*modpp)->asic->asic_id = VOYAGER_CAT_ID; + (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI; + (*modpp)->module_addr = i; + (*modpp)->scan_path_connected = 0; + if(i == VOYAGER_PSI) { + /* Exception leg for modules with no EEPROM */ + printk("Module \"%s\"\n", cat_module_name(i)); + continue; + } + + CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_disconnect(*modpp, (*modpp)->asic); + if(cat_subread(*modpp, (*modpp)->asic, + VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), + &eprom_size)) { + printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); + outb(VOYAGER_CAT_END, CAT_CMD); + continue; + } + if(eprom_size > sizeof(eprom_buf)) { + printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. 
Need %d\n", i, eprom_size); + outb(VOYAGER_CAT_END, CAT_CMD); + continue; + } + outb(VOYAGER_CAT_END, CAT_CMD); + outb(VOYAGER_CAT_RUN, CAT_CMD); + CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); + if(cat_subread(*modpp, (*modpp)->asic, 0, + eprom_size, eprom_buf)) { + outb(VOYAGER_CAT_END, CAT_CMD); + continue; + } + outb(VOYAGER_CAT_END, CAT_CMD); + printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n", + cat_module_name(i), eprom_hdr->version_id, + *((__u32 *)eprom_hdr->tracer), eprom_hdr->num_asics); + (*modpp)->ee_size = eprom_hdr->ee_size; + (*modpp)->num_asics = eprom_hdr->num_asics; + asicpp = &((*modpp)->asic); + sp_offset = eprom_hdr->scan_path_offset; + /* All we really care about are the Quad cards. We + * identify them because they are in a processor slot + * and have only four asics */ + if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) { + modpp = &((*modpp)->next); + continue; + } + /* Now we know it's in a processor slot, does it have + * a quad baseboard submodule */ + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODPRESENT, + &num_submodules); + /* lowest two bits, active low */ + num_submodules = ~(0xfc | num_submodules); + CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules)); + if(num_submodules == 0) { + /* fill in the dyadic extended processors */ + __u8 cpu = i & 0x07; + + printk("Module \"%s\": Dyadic Processor Card\n", + cat_module_name(i)); + voyager_extended_vic_processors |= (1<<cpu); + cpu += 4; + voyager_extended_vic_processors |= (1<<cpu); + outb(VOYAGER_CAT_END, CAT_CMD); + continue; + } + + /* now we want to read the asics on the first submodule, + * which should be the quad base board */ + + cat_read(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, &val); + CDEBUG(("cat_init: SUBMODSELECT value = 0x%x\n", val)); + val = (val & 0x7c) | VOYAGER_QUAD_BASEBOARD; + cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val); + + outb(VOYAGER_CAT_END, CAT_CMD); + + + 
CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_disconnect(*modpp, (*modpp)->asic); + if(cat_subread(*modpp, (*modpp)->asic, + VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), + &eprom_size)) { + printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); + outb(VOYAGER_CAT_END, CAT_CMD); + continue; + } + if(eprom_size > sizeof(eprom_buf)) { + printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); + outb(VOYAGER_CAT_END, CAT_CMD); + continue; + } + outb(VOYAGER_CAT_END, CAT_CMD); + outb(VOYAGER_CAT_RUN, CAT_CMD); + CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); + if(cat_subread(*modpp, (*modpp)->asic, 0, + eprom_size, eprom_buf)) { + outb(VOYAGER_CAT_END, CAT_CMD); + continue; + } + outb(VOYAGER_CAT_END, CAT_CMD); + /* Now do everything for the QBB submodule 1 */ + (*modpp)->ee_size = eprom_hdr->ee_size; + (*modpp)->num_asics = eprom_hdr->num_asics; + asicpp = &((*modpp)->asic); + sp_offset = eprom_hdr->scan_path_offset; + /* get rid of the dummy CAT asic and read the real one */ + kfree((*modpp)->asic); + for(asic=0; asic < (*modpp)->num_asics; asic++) { + int j; + voyager_asic_t *asicp = *asicpp + = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ + voyager_sp_table_t *sp_table; + voyager_at_t *asic_table; + voyager_jtt_t *jtag_table; + + if(asicp == NULL) { + printk("**WARNING** kmalloc failure in cat_init\n"); + continue; + } + memset(asicp, 0, sizeof(voyager_asic_t)); + asicpp = &(asicp->next); + asicp->asic_location = asic; + sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); + asicp->asic_id = sp_table->asic_id; + asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset); + for(j=0; j<4; j++) + asicp->jtag_id[j] = asic_table->jtag_id[j]; + jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset); + asicp->ireg_length 
= jtag_table->ireg_len; + asicp->bit_location = (*modpp)->inst_bits; + (*modpp)->inst_bits += asicp->ireg_length; + if(asicp->ireg_length > (*modpp)->largest_reg) + (*modpp)->largest_reg = asicp->ireg_length; + if (asicp->ireg_length < (*modpp)->smallest_reg || + (*modpp)->smallest_reg == 0) + (*modpp)->smallest_reg = asicp->ireg_length; + CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n", + asicp->asic_id, asicp->ireg_length, + asicp->bit_location)); + if(asicp->asic_id == VOYAGER_QUAD_QABC) { + CDEBUG(("VOYAGER CAT: QABC ASIC found\n")); + qabc_asic = asicp; + } + sp_offset += sizeof(voyager_sp_table_t); + } + CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", + (*modpp)->inst_bits, (*modpp)->largest_reg, + (*modpp)->smallest_reg)); + /* OK, now we have the QUAD ASICs set up, use them. + * we need to: + * + * 1. Find the Memory area for the Quad CPIs. + * 2. Find the Extended VIC processor + * 3. Configure a second extended VIC processor (This + * cannot be done for the 51xx. 
+ * */ + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_connect(*modpp, (*modpp)->asic); + CDEBUG(("CAT CONNECTED!!\n")); + cat_subread(*modpp, qabc_asic, 0, sizeof(qabc_data), qabc_data); + qic_addr = qabc_data[5] << 8; + qic_addr = (qic_addr | qabc_data[6]) << 8; + qic_addr = (qic_addr | qabc_data[7]) << 8; + printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", + cat_module_name(i), qic_addr, qabc_data[8]); +#if 0 /* plumbing fails---FIXME */ + if((qabc_data[8] & 0xf0) == 0) { + /* FIXME: 32 way 8 CPU slot monster cannot be + * plumbed this way---need to check for it */ + + printk("Plumbing second Extended Quad Processor\n"); + /* second VIC line hardwired to Quad CPU 1 */ + qabc_data[8] |= 0x20; + cat_subwrite(*modpp, qabc_asic, 8, 1, &qabc_data[8]); +#ifdef VOYAGER_CAT_DEBUG + /* verify plumbing */ + cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]); + if((qabc_data[8] & 0xf0) == 0) { + CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8])); + } +#endif + } +#endif + + { + struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL); + memset(res, 0, sizeof(struct resource)); + res->name = kmalloc(128, GFP_KERNEL); + sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); + res->start = qic_addr; + res->end = qic_addr + 0x3ff; + request_resource(&iomem_resource, res); + } + + qic_addr = (unsigned long)ioremap(qic_addr, 0x400); + + for(j = 0; j < 4; j++) { + __u8 cpu; + + if(voyager_8slot) { + /* 8 slot has a different mapping, + * each slot has only one vic line, so + * 1 cpu in each slot must be < 8 */ + cpu = (i & 0x07) + j*8; + } else { + cpu = (i & 0x03) + j*4; + } + if( (qabc_data[8] & (1<<j))) { + voyager_extended_vic_processors |= (1<<cpu); + } + if(qabc_data[8] & (1<<(j+4)) ) { + /* Second SET register plumbed: Quad + * card has two VIC connected CPUs. 
+ * Secondary cannot be booted as a VIC + * CPU */ + voyager_extended_vic_processors |= (1<<cpu); + voyager_allowed_boot_processors &= (~(1<<cpu)); + } + + voyager_quad_processors |= (1<<cpu); + voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *) + (qic_addr+(j<<8)); + CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu, + (unsigned long)voyager_quad_cpi_addr[cpu])); + } + outb(VOYAGER_CAT_END, CAT_CMD); + + + + *asicpp = NULL; + modpp = &((*modpp)->next); + } + *modpp = NULL; + printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors); + request_resource(&ioport_resource, &vic_res); + if(voyager_quad_processors) + request_resource(&ioport_resource, &qic_res); + /* set up the front power switch */ +} + +int +voyager_cat_readb(__u8 module, __u8 asic, int reg) +{ + return 0; +} + +static int +cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp) +{ + __u8 val; + int err = 0; + + if(!modp->scan_path_connected) + return 0; + if(asicp->asic_id != VOYAGER_CAT_ID) { + CDEBUG(("cat_disconnect: ASIC is not CAT\n")); + return 1; + } + err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); + if(err) { + CDEBUG(("cat_disconnect: failed to read SCANPATH\n")); + return err; + } + val &= VOYAGER_DISCONNECT_ASIC; + err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); + if(err) { + CDEBUG(("cat_disconnect: failed to write SCANPATH\n")); + return err; + } + outb(VOYAGER_CAT_END, CAT_CMD); + outb(VOYAGER_CAT_RUN, CAT_CMD); + modp->scan_path_connected = 0; + + return 0; +} + +static int +cat_connect(voyager_module_t *modp, voyager_asic_t *asicp) +{ + __u8 val; + int err = 0; + + if(modp->scan_path_connected) + return 0; + if(asicp->asic_id != VOYAGER_CAT_ID) { + CDEBUG(("cat_connect: ASIC is not CAT\n")); + return 1; + } + + err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); + if(err) { + CDEBUG(("cat_connect: failed to read SCANPATH\n")); + 
return err; + } + val |= VOYAGER_CONNECT_ASIC; + err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); + if(err) { + CDEBUG(("cat_connect: failed to write SCANPATH\n")); + return err; + } + outb(VOYAGER_CAT_END, CAT_CMD); + outb(VOYAGER_CAT_RUN, CAT_CMD); + modp->scan_path_connected = 1; + + return 0; +} + +void +voyager_cat_power_off(void) +{ + /* Power the machine off by writing to the PSI over the CAT + * bus */ + __u8 data; + voyager_module_t psi = { 0 }; + voyager_asic_t psi_asic = { 0 }; + + psi.asic = &psi_asic; + psi.asic->asic_id = VOYAGER_CAT_ID; + psi.asic->subaddr = VOYAGER_SUBADDR_HI; + psi.module_addr = VOYAGER_PSI; + psi.scan_path_connected = 0; + + outb(VOYAGER_CAT_END, CAT_CMD); + /* Connect the PSI to the CAT Bus */ + outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); + outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_disconnect(&psi, &psi_asic); + /* Read the status */ + cat_subread(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); + CDEBUG(("PSI STATUS 0x%x\n", data)); + /* These two writes are power off prep and perform */ + data = PSI_CLEAR; + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); + data = PSI_POWER_DOWN; + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); +} + +struct voyager_status voyager_status = { 0 }; + +void +voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data) +{ + voyager_module_t psi = { 0 }; + voyager_asic_t psi_asic = { 0 }; + + psi.asic = &psi_asic; + psi.asic->asic_id = VOYAGER_CAT_ID; + psi.asic->subaddr = VOYAGER_SUBADDR_HI; + psi.module_addr = VOYAGER_PSI; + psi.scan_path_connected = 0; + + outb(VOYAGER_CAT_END, CAT_CMD); + /* Connect the PSI to the CAT Bus */ + outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); + outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); + outb(VOYAGER_CAT_RUN, 
CAT_CMD); + cat_disconnect(&psi, &psi_asic); + switch(cmd) { + case VOYAGER_PSI_READ: + cat_read(&psi, &psi_asic, reg, data); + break; + case VOYAGER_PSI_WRITE: + cat_write(&psi, &psi_asic, reg, *data); + break; + case VOYAGER_PSI_SUBREAD: + cat_subread(&psi, &psi_asic, reg, 1, data); + break; + case VOYAGER_PSI_SUBWRITE: + cat_subwrite(&psi, &psi_asic, reg, 1, data); + break; + default: + printk(KERN_ERR "Voyager PSI, unrecognised command %d\n", cmd); + break; + } + outb(VOYAGER_CAT_END, CAT_CMD); +} + +void +voyager_cat_do_common_interrupt(void) +{ + /* This is caused either by a memory parity error or something + * in the PSI */ + __u8 data; + voyager_module_t psi = { 0 }; + voyager_asic_t psi_asic = { 0 }; + struct voyager_psi psi_reg; + int i; + re_read: + psi.asic = &psi_asic; + psi.asic->asic_id = VOYAGER_CAT_ID; + psi.asic->subaddr = VOYAGER_SUBADDR_HI; + psi.module_addr = VOYAGER_PSI; + psi.scan_path_connected = 0; + + outb(VOYAGER_CAT_END, CAT_CMD); + /* Connect the PSI to the CAT Bus */ + outb(VOYAGER_CAT_DESELECT, VOYAGER_CAT_CONFIG_PORT); + outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_disconnect(&psi, &psi_asic); + /* Read the status. 
NOTE: Need to read *all* the PSI regs here + * otherwise the cmn int will be reasserted */ + for(i = 0; i < sizeof(psi_reg.regs); i++) { + cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]); + } + outb(VOYAGER_CAT_END, CAT_CMD); + if((psi_reg.regs.checkbit & 0x02) == 0) { + psi_reg.regs.checkbit |= 0x02; + cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit); + printk("VOYAGER RE-READ PSI\n"); + goto re_read; + } + outb(VOYAGER_CAT_RUN, CAT_CMD); + for(i = 0; i < sizeof(psi_reg.subregs); i++) { + /* This looks strange, but the PSI doesn't do auto increment + * correctly */ + cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, + 1, &((__u8 *)&psi_reg.subregs)[i]); + } + outb(VOYAGER_CAT_END, CAT_CMD); +#ifdef VOYAGER_CAT_DEBUG + printk("VOYAGER PSI: "); + for(i=0; i<sizeof(psi_reg.regs); i++) + printk("%02x ", ((__u8 *)&psi_reg.regs)[i]); + printk("\n "); + for(i=0; i<sizeof(psi_reg.subregs); i++) + printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]); + printk("\n"); +#endif + if(psi_reg.regs.intstatus & PSI_MON) { + /* switch off or power fail */ + + if(psi_reg.subregs.supply & PSI_SWITCH_OFF) { + if(voyager_status.switch_off) { + printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n"); + voyager_cat_power_off(); + /* not reached */ + } else { + printk(KERN_ERR "Voyager front panel switch turned off\n"); + voyager_status.switch_off = 1; + voyager_status.request_from_kernel = 1; + up(&kvoyagerd_sem); + } + /* Tell the hardware we're taking care of the + * shutdown, otherwise it will power the box off + * within 3 seconds of the switch being pressed and, + * which is much more important to us, continue to + * assert the common interrupt */ + data = PSI_CLR_SWITCH_OFF; + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG, + 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); + } else { + + VDEBUG(("Voyager ac fail reg 0x%x\n", + psi_reg.subregs.ACfail)); + if((psi_reg.subregs.ACfail & 
AC_FAIL_STAT_CHANGE) == 0) { + /* No further update */ + return; + } +#if 0 + /* Don't bother trying to find out who failed. + * FIXME: This probably makes the code incorrect on + * anything other than a 345x */ + for(i=0; i< 5; i++) { + if( psi_reg.subregs.ACfail &(1<<i)) { + break; + } + } + printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i); +#endif + /* DON'T do this: it shuts down the AC PSI + outb(VOYAGER_CAT_RUN, CAT_CMD); + data = PSI_MASK_MASK | i; + cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, + 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); + */ + printk(KERN_ERR "Voyager AC power failure\n"); + outb(VOYAGER_CAT_RUN, CAT_CMD); + data = PSI_COLD_START; + cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_GENERAL_REG, + 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); + voyager_status.power_fail = 1; + voyager_status.request_from_kernel = 1; + up(&kvoyagerd_sem); + } + + + } else if(psi_reg.regs.intstatus & PSI_FAULT) { + /* Major fault! */ + printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n"); + voyager_cat_power_off(); + /* not reached */ + } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM + | PSI_CURRENT | PSI_DVM + | PSI_PSCFAULT | PSI_STAT_CHG)) { + /* other psi fault */ + + printk(KERN_WARNING "Voyager PSI status 0x%x\n", data); + /* clear the PSI fault */ + outb(VOYAGER_CAT_RUN, CAT_CMD); + cat_write(&psi, &psi_asic, VOYAGER_PSI_STATUS_REG, 0); + outb(VOYAGER_CAT_END, CAT_CMD); + } +} diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c new file mode 100644 index 000000000000..903d739ca74a --- /dev/null +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -0,0 +1,1931 @@ +/* -*- mode: c; c-basic-offset: 8 -*- */ + +/* Copyright (C) 1999,2001 + * + * Author: J.E.J.Bottomley@HansenPartnership.com + * + * linux/arch/i386/kernel/voyager_smp.c + * + * This file provides all the same external entries as smp.c but uses + * the voyager hal to provide the functionality + */ +#include <linux/config.h> +#include 
<linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/delay.h> +#include <linux/mc146818rtc.h> +#include <linux/cache.h> +#include <linux/interrupt.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/bootmem.h> +#include <linux/completion.h> +#include <asm/desc.h> +#include <asm/voyager.h> +#include <asm/vic.h> +#include <asm/mtrr.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/arch_hooks.h> + +#include <linux/irq.h> + +/* TLB state -- visible externally, indexed physically */ +DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 }; + +/* CPU IRQ affinity -- set to all ones initially */ +static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = ~0UL }; + +/* per CPU data structure (for /proc/cpuinfo et al), visible externally + * indexed physically */ +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; + +/* physical ID of the CPU used to boot the system */ +unsigned char boot_cpu_id; + +/* The memory line addresses for the Quad CPIs */ +struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS] __cacheline_aligned; + +/* The masks for the Extended VIC processors, filled in by cat_init */ +__u32 voyager_extended_vic_processors = 0; + +/* Masks for the extended Quad processors which cannot be VIC booted */ +__u32 voyager_allowed_boot_processors = 0; + +/* The mask for the Quad Processors (both extended and non-extended) */ +__u32 voyager_quad_processors = 0; + +/* Total count of live CPUs, used in process.c to display + * the CPU information and in irq.c for the per CPU irq + * activity count. 
Finally exported by i386_ksyms.c */ +static int voyager_extended_cpus = 1; + +/* Have we found an SMP box - used by time.c to do the profiling + interrupt for timeslicing; do not set to 1 until the per CPU timer + interrupt is active */ +int smp_found_config = 0; + +/* Used for the invalidate map that's also checked in the spinlock */ +static volatile unsigned long smp_invalidate_needed; + +/* Bitmask of currently online CPUs - used by setup.c for + /proc/cpuinfo, visible externally but still physical */ +cpumask_t cpu_online_map = CPU_MASK_NONE; + +/* Bitmask of CPUs present in the system - exported by i386_syms.c, used + * by scheduler but indexed physically */ +cpumask_t phys_cpu_present_map = CPU_MASK_NONE; + + +/* The internal functions */ +static void send_CPI(__u32 cpuset, __u8 cpi); +static void ack_CPI(__u8 cpi); +static int ack_QIC_CPI(__u8 cpi); +static void ack_special_QIC_CPI(__u8 cpi); +static void ack_VIC_CPI(__u8 cpi); +static void send_CPI_allbutself(__u8 cpi); +static void enable_vic_irq(unsigned int irq); +static void disable_vic_irq(unsigned int irq); +static unsigned int startup_vic_irq(unsigned int irq); +static void enable_local_vic_irq(unsigned int irq); +static void disable_local_vic_irq(unsigned int irq); +static void before_handle_vic_irq(unsigned int irq); +static void after_handle_vic_irq(unsigned int irq); +static void set_vic_irq_affinity(unsigned int irq, cpumask_t mask); +static void ack_vic_irq(unsigned int irq); +static void vic_enable_cpi(void); +static void do_boot_cpu(__u8 cpuid); +static void do_quad_bootstrap(void); +static inline void wrapper_smp_local_timer_interrupt(struct pt_regs *); + +int hard_smp_processor_id(void); + +/* Inline functions */ +static inline void +send_one_QIC_CPI(__u8 cpu, __u8 cpi) +{ + voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi = + (smp_processor_id() << 16) + cpi; +} + +static inline void +send_QIC_CPI(__u32 cpuset, __u8 cpi) +{ + int cpu; + + for_each_online_cpu(cpu) { + if(cpuset & (1<<cpu)) { 
+#ifdef VOYAGER_DEBUG + if(!cpu_isset(cpu, cpu_online_map)) + VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu)); +#endif + send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); + } + } +} + +static inline void +send_one_CPI(__u8 cpu, __u8 cpi) +{ + if(voyager_quad_processors & (1<<cpu)) + send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); + else + send_CPI(1<<cpu, cpi); +} + +static inline void +send_CPI_allbutself(__u8 cpi) +{ + __u8 cpu = smp_processor_id(); + __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu); + send_CPI(mask, cpi); +} + +static inline int +is_cpu_quad(void) +{ + __u8 cpumask = inb(VIC_PROC_WHO_AM_I); + return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER); +} + +static inline int +is_cpu_extended(void) +{ + __u8 cpu = hard_smp_processor_id(); + + return(voyager_extended_vic_processors & (1<<cpu)); +} + +static inline int +is_cpu_vic_boot(void) +{ + __u8 cpu = hard_smp_processor_id(); + + return(voyager_extended_vic_processors + & voyager_allowed_boot_processors & (1<<cpu)); +} + + +static inline void +ack_CPI(__u8 cpi) +{ + switch(cpi) { + case VIC_CPU_BOOT_CPI: + if(is_cpu_quad() && !is_cpu_vic_boot()) + ack_QIC_CPI(cpi); + else + ack_VIC_CPI(cpi); + break; + case VIC_SYS_INT: + case VIC_CMN_INT: + /* These are slightly strange. 
Even on the Quad card, + * They are vectored as VIC CPIs */ + if(is_cpu_quad()) + ack_special_QIC_CPI(cpi); + else + ack_VIC_CPI(cpi); + break; + default: + printk("VOYAGER ERROR: CPI%d is in common CPI code\n", cpi); + break; + } +} + +/* local variables */ + +/* The VIC IRQ descriptors -- these look almost identical to the + * 8259 IRQs except that masks and things must be kept per processor + */ +static struct hw_interrupt_type vic_irq_type = { + .typename = "VIC-level", + .startup = startup_vic_irq, + .shutdown = disable_vic_irq, + .enable = enable_vic_irq, + .disable = disable_vic_irq, + .ack = before_handle_vic_irq, + .end = after_handle_vic_irq, + .set_affinity = set_vic_irq_affinity, +}; + +/* used to count up as CPUs are brought on line (starts at 0) */ +static int cpucount = 0; + +/* steal a page from the bottom of memory for the trampoline and + * squirrel its address away here. This will be in kernel virtual + * space */ +static __u32 trampoline_base; + +/* The per cpu profile stuff - used in smp_local_timer_interrupt */ +static DEFINE_PER_CPU(int, prof_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_counter) = 1; + +/* the map used to check if a CPU has booted */ +static __u32 cpu_booted_map; + +/* the synchronize flag used to hold all secondary CPUs spinning in + * a tight loop until the boot sequence is ready for them */ +static cpumask_t smp_commenced_mask = CPU_MASK_NONE; + +/* This is for the new dynamic CPU boot code */ +cpumask_t cpu_callin_map = CPU_MASK_NONE; +cpumask_t cpu_callout_map = CPU_MASK_NONE; + +/* The per processor IRQ masks (these are usually kept in sync) */ +static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; + +/* the list of IRQs to be enabled by the VIC_ENABLE_IRQ_CPI */ +static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 }; + +/* Lock for enable/disable of VIC interrupts */ +static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); + +/* The boot 
processor is correctly set up in PC mode when it + * comes up, but the secondaries need their master/slave 8259 + * pairs initializing correctly */ + +/* Interrupt counters (per cpu) and total - used to try to + * even up the interrupt handling routines */ +static long vic_intr_total = 0; +static long vic_intr_count[NR_CPUS] __cacheline_aligned = { 0 }; +static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 }; + +/* Since we can only use CPI0, we fake all the other CPIs */ +static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned; + +/* debugging routine to read the isr of the cpu's pic */ +static inline __u16 +vic_read_isr(void) +{ + __u16 isr; + + outb(0x0b, 0xa0); + isr = inb(0xa0) << 8; + outb(0x0b, 0x20); + isr |= inb(0x20); + + return isr; +} + +static __init void +qic_setup(void) +{ + if(!is_cpu_quad()) { + /* not a quad, no setup */ + return; + } + outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); + outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); + + if(is_cpu_extended()) { + /* the QIC duplicate of the VIC base register */ + outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER); + outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER); + + /* FIXME: should set up the QIC timer and memory parity + * error vectors here */ + } +} + +static __init void +vic_setup_pic(void) +{ + outb(1, VIC_REDIRECT_REGISTER_1); + /* clear the claim registers for dynamic routing */ + outb(0, VIC_CLAIM_REGISTER_0); + outb(0, VIC_CLAIM_REGISTER_1); + + outb(0, VIC_PRIORITY_REGISTER); + /* Set the Primary and Secondary Microchannel vector + * bases to be the same as the ordinary interrupts + * + * FIXME: This would be more efficient using separate + * vectors. 
*/ + outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE); + outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE); + /* Now initiallise the master PIC belonging to this CPU by + * sending the four ICWs */ + + /* ICW1: level triggered, ICW4 needed */ + outb(0x19, 0x20); + + /* ICW2: vector base */ + outb(FIRST_EXTERNAL_VECTOR, 0x21); + + /* ICW3: slave at line 2 */ + outb(0x04, 0x21); + + /* ICW4: 8086 mode */ + outb(0x01, 0x21); + + /* now the same for the slave PIC */ + + /* ICW1: level trigger, ICW4 needed */ + outb(0x19, 0xA0); + + /* ICW2: slave vector base */ + outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1); + + /* ICW3: slave ID */ + outb(0x02, 0xA1); + + /* ICW4: 8086 mode */ + outb(0x01, 0xA1); +} + +static void +do_quad_bootstrap(void) +{ + if(is_cpu_quad() && is_cpu_vic_boot()) { + int i; + unsigned long flags; + __u8 cpuid = hard_smp_processor_id(); + + local_irq_save(flags); + + for(i = 0; i<4; i++) { + /* FIXME: this would be >>3 &0x7 on the 32 way */ + if(((cpuid >> 2) & 0x03) == i) + /* don't lower our own mask! */ + continue; + + /* masquerade as local Quad CPU */ + outb(QIC_CPUID_ENABLE | i, QIC_PROCESSOR_ID); + /* enable the startup CPI */ + outb(QIC_BOOT_CPI_MASK, QIC_MASK_REGISTER1); + /* restore cpu id */ + outb(0, QIC_PROCESSOR_ID); + } + local_irq_restore(flags); + } +} + + +/* Set up all the basic stuff: read the SMP config and make all the + * SMP information reflect only the boot cpu. All others will be + * brought on-line later. 
*/ +void __init +find_smp_config(void) +{ + int i; + + boot_cpu_id = hard_smp_processor_id(); + + printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); + + /* initialize the CPU structures (moved from smp_boot_cpus) */ + for(i=0; i<NR_CPUS; i++) { + cpu_irq_affinity[i] = ~0; + } + cpu_online_map = cpumask_of_cpu(boot_cpu_id); + + /* The boot CPU must be extended */ + voyager_extended_vic_processors = 1<<boot_cpu_id; + /* initially, all of the first 8 cpu's can boot */ + voyager_allowed_boot_processors = 0xff; + /* set up everything for just this CPU, we can alter + * this as we start the other CPUs later */ + /* now get the CPU disposition from the extended CMOS */ + cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK); + cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8; + cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16; + cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24; + printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]); + /* Here we set up the VIC to enable SMP */ + /* enable the CPIs by writing the base vector to their register */ + outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER); + outb(1, VIC_REDIRECT_REGISTER_1); + /* set the claim registers for static routing --- Boot CPU gets + * all interrupts untill all other CPUs started */ + outb(0xff, VIC_CLAIM_REGISTER_0); + outb(0xff, VIC_CLAIM_REGISTER_1); + /* Set the Primary and Secondary Microchannel vector + * bases to be the same as the ordinary interrupts + * + * FIXME: This would be more efficient using separate + * vectors. 
*/ + outb(FIRST_EXTERNAL_VECTOR, VIC_PRIMARY_MC_BASE); + outb(FIRST_EXTERNAL_VECTOR, VIC_SECONDARY_MC_BASE); + + /* Finally tell the firmware that we're driving */ + outb(inb(VOYAGER_SUS_IN_CONTROL_PORT) | VOYAGER_IN_CONTROL_FLAG, + VOYAGER_SUS_IN_CONTROL_PORT); + + current_thread_info()->cpu = boot_cpu_id; +} + +/* + * The bootstrap kernel entry code has set these up. Save them + * for a given CPU, id is physical */ +void __init +smp_store_cpu_info(int id) +{ + struct cpuinfo_x86 *c=&cpu_data[id]; + + *c = boot_cpu_data; + + identify_cpu(c); +} + +/* set up the trampoline and return the physical address of the code */ +static __u32 __init +setup_trampoline(void) +{ + /* these two are global symbols in trampoline.S */ + extern __u8 trampoline_end[]; + extern __u8 trampoline_data[]; + + memcpy((__u8 *)trampoline_base, trampoline_data, + trampoline_end - trampoline_data); + return virt_to_phys((__u8 *)trampoline_base); +} + +/* Routine initially called when a non-boot CPU is brought online */ +static void __init +start_secondary(void *unused) +{ + __u8 cpuid = hard_smp_processor_id(); + /* external functions not defined in the headers */ + extern void calibrate_delay(void); + + cpu_init(); + + /* OK, we're in the routine */ + ack_CPI(VIC_CPU_BOOT_CPI); + + /* setup the 8259 master slave pair belonging to this CPU --- + * we won't actually receive any until the boot CPU + * relinquishes it's static routing mask */ + vic_setup_pic(); + + qic_setup(); + + if(is_cpu_quad() && !is_cpu_vic_boot()) { + /* clear the boot CPI */ + __u8 dummy; + + dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi; + printk("read dummy %d\n", dummy); + } + + /* lower the mask to receive CPIs */ + vic_enable_cpi(); + + VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); + + /* enable interrupts */ + local_irq_enable(); + + /* get our bogomips */ + calibrate_delay(); + + /* save our processor parameters */ + smp_store_cpu_info(cpuid); + + /* if we're a quad, we 
may need to bootstrap other CPUs */ + do_quad_bootstrap(); + + /* FIXME: this is rather a poor hack to prevent the CPU + * activating softirqs while it's supposed to be waiting for + * permission to proceed. Without this, the new per CPU stuff + * in the softirqs will fail */ + local_irq_disable(); + cpu_set(cpuid, cpu_callin_map); + + /* signal that we're done */ + cpu_booted_map = 1; + + while (!cpu_isset(cpuid, smp_commenced_mask)) + rep_nop(); + local_irq_enable(); + + local_flush_tlb(); + + cpu_set(cpuid, cpu_online_map); + wmb(); + cpu_idle(); +} + + +/* Routine to kick start the given CPU and wait for it to report ready + * (or timeout in startup). When this routine returns, the requested + * CPU is either fully running and configured or known to be dead. + * + * We call this routine sequentially 1 CPU at a time, so no need for + * locking */ + +static void __init +do_boot_cpu(__u8 cpu) +{ + struct task_struct *idle; + int timeout; + unsigned long flags; + int quad_boot = (1<<cpu) & voyager_quad_processors + & ~( voyager_extended_vic_processors + & voyager_allowed_boot_processors); + + /* For the 486, we can't use the 4Mb page table trick, so + * must map a region of memory */ +#ifdef CONFIG_M486 + int i; + unsigned long *page_table_copies = (unsigned long *) + __get_free_page(GFP_KERNEL); +#endif + pgd_t orig_swapper_pg_dir0; + + /* This is an area in head.S which was used to set up the + * initial kernel stack. 
We need to alter this to give the + * booting CPU a new stack (taken from its idle process) */ + extern struct { + __u8 *esp; + unsigned short ss; + } stack_start; + /* This is the format of the CPI IDT gate (in real mode) which + * we're hijacking to boot the CPU */ + union IDTFormat { + struct seg { + __u16 Offset; + __u16 Segment; + } idt; + __u32 val; + } hijack_source; + + __u32 *hijack_vector; + __u32 start_phys_address = setup_trampoline(); + + /* There's a clever trick to this: The linux trampoline is + * compiled to begin at absolute location zero, so make the + * address zero but have the data segment selector compensate + * for the actual address */ + hijack_source.idt.Offset = start_phys_address & 0x000F; + hijack_source.idt.Segment = (start_phys_address >> 4) & 0xFFFF; + + cpucount++; + idle = fork_idle(cpu); + if(IS_ERR(idle)) + panic("failed fork for CPU%d", cpu); + idle->thread.eip = (unsigned long) start_secondary; + /* init_tasks (in sched.c) is indexed logically */ + stack_start.esp = (void *) idle->thread.esp; + + irq_ctx_init(cpu); + + /* Note: Don't modify initial ss override */ + VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, + (unsigned long)hijack_source.val, hijack_source.idt.Segment, + hijack_source.idt.Offset, stack_start.esp)); + /* set the original swapper_pg_dir[0] to map 0 to 4Mb transparently + * (so that the booting CPU can find start_32 */ + orig_swapper_pg_dir0 = swapper_pg_dir[0]; +#ifdef CONFIG_M486 + if(page_table_copies == NULL) + panic("No free memory for 486 page tables\n"); + for(i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++) + page_table_copies[i] = (i * PAGE_SIZE) + | _PAGE_RW | _PAGE_USER | _PAGE_PRESENT; + + ((unsigned long *)swapper_pg_dir)[0] = + ((virt_to_phys(page_table_copies)) & PAGE_MASK) + | _PAGE_RW | _PAGE_USER | _PAGE_PRESENT; +#else + ((unsigned long *)swapper_pg_dir)[0] = + (virt_to_phys(pg0) & PAGE_MASK) + | _PAGE_RW | _PAGE_USER | _PAGE_PRESENT; +#endif + + if(quad_boot) { + 
printk("CPU %d: non extended Quad boot\n", cpu); + hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4); + *hijack_vector = hijack_source.val; + } else { + printk("CPU%d: extended VIC boot\n", cpu); + hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4); + *hijack_vector = hijack_source.val; + /* VIC errata, may also receive interrupt at this address */ + hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4); + *hijack_vector = hijack_source.val; + } + /* All non-boot CPUs start with interrupts fully masked. Need + * to lower the mask of the CPI we're about to send. We do + * this in the VIC by masquerading as the processor we're + * about to boot and lowering its interrupt mask */ + local_irq_save(flags); + if(quad_boot) { + send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI); + } else { + outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); + /* here we're altering registers belonging to `cpu' */ + + outb(VIC_BOOT_INTERRUPT_MASK, 0x21); + /* now go back to our original identity */ + outb(boot_cpu_id, VIC_PROCESSOR_ID); + + /* and boot the CPU */ + + send_CPI((1<<cpu), VIC_CPU_BOOT_CPI); + } + cpu_booted_map = 0; + local_irq_restore(flags); + + /* now wait for it to become ready (or timeout) */ + for(timeout = 0; timeout < 50000; timeout++) { + if(cpu_booted_map) + break; + udelay(100); + } + /* reset the page table */ + swapper_pg_dir[0] = orig_swapper_pg_dir0; + local_flush_tlb(); +#ifdef CONFIG_M486 + free_page((unsigned long)page_table_copies); +#endif + + if (cpu_booted_map) { + VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n", + cpu, smp_processor_id())); + + printk("CPU%d: ", cpu); + print_cpu_info(&cpu_data[cpu]); + wmb(); + cpu_set(cpu, cpu_callout_map); + } + else { + printk("CPU%d FAILED TO BOOT: ", cpu); + if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5) + printk("Stuck.\n"); + else + printk("Not responding.\n"); + + cpucount--; + } +} 
+/* Boot-time SMP bring-up: probe the CAT bus on level 5 machines and + * sanity check the cpu map, record and print the boot cpu's info, + * enable its CPIs, then call do_boot_cpu() for every other cpu in + * phys_cpu_present_map and finally put the VIC into symmetric mode */ +void __init +smp_boot_cpus(void) +{ + int i; + + /* CAT BUS initialisation must be done after the memory */ + /* FIXME: The L4 has a catbus too, it just needs to be + * accessed in a totally different way */ + if(voyager_level == 5) { + voyager_cat_init(); + + /* now that the cat has probed the Voyager System Bus, sanity + * check the cpu map: every present cpu must be either a quad + * or an extended VIC processor */ + if( ((voyager_quad_processors | voyager_extended_vic_processors) + & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) { + /* should panic */ + printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n"); + } + } else if(voyager_level == 4) + voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0]; + + /* count only the boot cpu for now; the real number of extended + * cpus is filled in once the other cpus have been booted */ + voyager_extended_cpus = 1; + /* Remove the global_irq_holder setting, it triggers a BUG() on + * schedule at the moment */ + //global_irq_holder = boot_cpu_id; + + /* FIXME: Need to do something about this but currently only works + * on CPUs with a tsc which none of mine have. + smp_tune_scheduling(); + */ + smp_store_cpu_info(boot_cpu_id); + printk("CPU%d: ", boot_cpu_id); + print_cpu_info(&cpu_data[boot_cpu_id]); + + if(is_cpu_quad()) { + /* booting on a Quad CPU */ + printk("VOYAGER SMP: Boot CPU is Quad\n"); + qic_setup(); + do_quad_bootstrap(); + } + + /* enable our own CPIs */ + vic_enable_cpi(); + + cpu_set(boot_cpu_id, cpu_online_map); + cpu_set(boot_cpu_id, cpu_callout_map); + + /* loop over all the extended VIC CPUs and boot them. 
The + * Quad CPUs must be bootstrapped by their extended VIC cpu */ + for(i = 0; i < NR_CPUS; i++) { + if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) + continue; + do_boot_cpu(i); + /* This udelay seems to be needed for the Quad boots; + * don't remove unless you know what you're doing */ + udelay(1000); + } + /* add up loops_per_jiffy over every online cpu and report the + * total BogoMIPS. Code added from smpboot.c */ + { + unsigned long bogosum = 0; + for (i = 0; i < NR_CPUS; i++) + if (cpu_isset(i, cpu_online_map)) + bogosum += cpu_data[i].loops_per_jiffy; + printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), + (bogosum/(5000/HZ))%100); + } + voyager_extended_cpus = hweight32(voyager_extended_vic_processors); + printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus); + /* that's it, switch to symmetric mode */ + outb(0, VIC_PRIORITY_REGISTER); + outb(0, VIC_CLAIM_REGISTER_0); + outb(0, VIC_CLAIM_REGISTER_1); + + VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus())); +} + +/* Reload the secondary CPU's task state: load the stack pointer saved + * in current->thread.esp and jump to current->thread.eip (this + * function does not return) */ +void __init +initialize_secondary(void) +{ +#if 0 + // AC kernels only + set_current(hard_get_current()); +#endif + + /* + * We don't actually need to load the full TSS, + * basically just the stack pointer and the eip. + */ + + asm volatile( + "movl %0,%%esp\n\t" + "jmp *%1" + : + :"r" (current->thread.esp),"r" (current->thread.eip)); +} + +/* handle a Voyager SYS_INT -- If we don't, the base board will + * panic the system. + * + * System interrupts occur because some problem was detected on the + * various busses. To find out what you have to probe all the + * hardware via the CAT bus. FIXME: At the moment we do nothing. 
*/ +fastcall void +smp_vic_sys_interrupt(struct pt_regs *regs) +{ + ack_CPI(VIC_SYS_INT); + printk("Voyager SYSTEM INTERRUPT\n"); +} + +/* Handle a voyager CMN_INT; These interrupts occur either because of + * a system status change or because a single bit memory error + * occurred. On level 5 machines the work is handed to + * voyager_cat_do_common_interrupt(); on anything else the interrupt + * is only acknowledged. (The FIXME that used to stand here said that + * all of this is ignored for the moment.) */ +fastcall void +smp_vic_cmn_interrupt(struct pt_regs *regs) +{ + static __u8 in_cmn_int = 0; + static DEFINE_SPINLOCK(cmn_int_lock); + + /* common ints are broadcast, so make sure we only do this once: + * a cpu that finds in_cmn_int already set just acks and leaves */ + _raw_spin_lock(&cmn_int_lock); + if(in_cmn_int) + goto unlock_end; + + in_cmn_int++; + _raw_spin_unlock(&cmn_int_lock); + + VDEBUG(("Voyager COMMON INTERRUPT\n")); + + if(voyager_level == 5) + voyager_cat_do_common_interrupt(); + + _raw_spin_lock(&cmn_int_lock); + in_cmn_int = 0; + unlock_end: + _raw_spin_unlock(&cmn_int_lock); + ack_CPI(VIC_CMN_INT); +} + +/* + * Reschedule call back. Nothing to do, all the work is done + * automatically when we return from the interrupt. */ +static void +smp_reschedule_interrupt(void) +{ + /* do nothing */ +} +/* Arguments of the TLB flush currently being broadcast: written by + * flush_tlb_others() while it holds tlbstate_lock and read by + * smp_invalidate_interrupt() on the receiving cpus */ +static struct mm_struct * flush_mm; +static unsigned long flush_va; +static DEFINE_SPINLOCK(tlbstate_lock); +#define FLUSH_ALL 0xffffffff /* flush_va value asking for a full TLB flush rather than one page */ + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + * + * We need to reload %cr3 since the page tables may be going + * away from under us.. + * + * Calling this while the cpu's TLB state is TLBSTATE_OK is a BUG(). + */ +static inline void +leave_mm (unsigned long cpu) +{ + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} + + +/* + * Invalidate call-back: if this cpu's bit is set in + * smp_invalidate_needed, carry out the flush described by flush_mm + * and flush_va and then clear the bit + */ +static void +smp_invalidate_interrupt(void) +{ + __u8 cpu = smp_processor_id(); + + if (!test_bit(cpu, &smp_invalidate_needed)) + return; + /* This will flood messages. 
Don't uncomment unless you see + * Problems with cross cpu invalidation + VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", + smp_processor_id())); + */ + + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_va == FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(flush_va); + } else + leave_mm(cpu); + } + smp_mb__before_clear_bit(); + clear_bit(cpu, &smp_invalidate_needed); /* the initiator in flush_tlb_others() spins until every bit is clear */ + smp_mb__after_clear_bit(); +} + +/* All the new flush operations for 2.4 */ + + +/* Ask every cpu in cpumask to flush mm from its TLB (the single page + * at va, or everything when va is FLUSH_ALL) and spin until they have + * all cleared their bit in smp_invalidate_needed or the stuck counter + * runs out. + * + * This routine is called with a physical cpu mask. It is a BUG() for + * the mask to be empty, to name a cpu that is not online or to include + * the calling cpu, and for mm to be NULL */ +static void +flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, + unsigned long va) +{ + int stuck = 50000; + + if (!cpumask) + BUG(); + if ((cpumask & cpus_addr(cpu_online_map)[0]) != cpumask) + BUG(); + if (cpumask & (1 << smp_processor_id())) + BUG(); + if (!mm) + BUG(); + + spin_lock(&tlbstate_lock); + + flush_mm = mm; + flush_va = va; + atomic_set_mask(cpumask, &smp_invalidate_needed); + /* + * We have to send the CPI only to + * CPUs affected. 
+ */ + send_CPI(cpumask, VIC_INVALIDATE_CPI); + + while (smp_invalidate_needed) { + mb(); + if(--stuck == 0) { + printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id()); + break; + } + } + + /* Uncomment only to debug invalidation problems (NOTE: there is + * no variable called cpu in this function, so the argument would + * have to become smp_processor_id()) + VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); + */ + + flush_mm = NULL; + flush_va = 0; + spin_unlock(&tlbstate_lock); +} +/* flush the whole TLB for current->mm: locally, and on every other + * cpu set in its cpu_vm_mask */ +void +flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + unsigned long cpu_mask; + + preempt_disable(); + + cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); + local_flush_tlb(); + if (cpu_mask) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + + preempt_enable(); +} + +/* flush the whole TLB for mm on every other cpu in its cpu_vm_mask. + * If mm is also our active_mm, flush locally when current->mm is set + * and call leave_mm() when it is not */ +void +flush_tlb_mm (struct mm_struct * mm) +{ + unsigned long cpu_mask; + + preempt_disable(); + + cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (cpu_mask) + flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + + preempt_enable(); +} +/* as flush_tlb_mm(), for the mm of vma, but only the single page at + * va is flushed */ +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long cpu_mask; + + preempt_disable(); + + cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); + if (current->active_mm == mm) { + if(current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (cpu_mask) + flush_tlb_others(cpu_mask, mm, va); + + preempt_enable(); +} + +/* enable irq CPI call-back: enable locally every irq (0-15) that has + * been requested for this cpu in vic_irq_enable_mask[], then clear + * the request mask */ +static void +smp_enable_irq_interrupt(void) +{ + __u8 irq; + __u8 cpu = get_cpu(); + + VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu, + vic_irq_enable_mask[cpu])); + + spin_lock(&vic_irq_lock); + for(irq = 0; irq < 16; irq++) { + if(vic_irq_enable_mask[cpu] & (1<<irq)) + enable_local_vic_irq(irq); + } + vic_irq_enable_mask[cpu] = 0; + spin_unlock(&vic_irq_lock); + + put_cpu_no_resched(); +} + +/* + * CPU halt 
call-back + */ +static void +smp_stop_cpu_function(void *dummy) +{ + VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id())); + cpu_clear(smp_processor_id(), cpu_online_map); + local_irq_disable(); + for(;;) + __asm__("hlt"); +} + +static DEFINE_SPINLOCK(call_lock); /* held by smp_call_function() while call_data is in use */ + +struct call_data_struct { + void (*func) (void *info); /* function to run on each receiving cpu */ + void *info; /* argument handed to func */ + volatile unsigned long started; /* one bit per cpu, cleared by that cpu once it has copied func/info */ + volatile unsigned long finished; /* one bit per cpu, cleared after func returns; only used when wait is set */ + int wait; /* non-zero if the sender waits for finished to become 0 */ +}; + +static struct call_data_struct * call_data; /* the call being broadcast; set by smp_call_function() to a struct on its own stack */ + +/* Call function CPI call-back: run the function published in + * call_data on this CPU. The function to be called must be + * previously set up. This is used to schedule a function for + * execution on all CPUs - set up the function then broadcast a + * function_interrupt CPI to come here on each CPU */ +static void +smp_call_function_interrupt(void) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + /* must take copy of wait because call_data may be replaced + * unless the function is waiting for us to finish */ + int wait = call_data->wait; + __u8 cpu = smp_processor_id(); + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + if(!test_and_clear_bit(cpu, &call_data->started)) { + /* If the bit wasn't set, this could be a replay */ + printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion with no call pending\n", cpu); + return; + } + /* + * At this point the info structure may be out of scope unless wait==1 + */ + irq_enter(); + (*func)(info); + irq_exit(); + if (wait) { + mb(); + clear_bit(cpu, &call_data->finished); + } +} + +/* Call this function on all CPUs using the function_interrupt above + <func> The function to run. This must be fast and non-blocking. + <info> An arbitrary pointer to pass to the function. + <retry> If true, keep retrying until ready. + <wait> If true, wait until function has completed on other CPUs. + [RETURNS] 0 on success, else a negative status code. 
Does not return until + remote CPUs are nearly ready to execute <<func>> or are or have executed. +*/ +int +smp_call_function (void (*func) (void *info), void *info, int retry, /* retry is not used by this implementation */ + int wait) +{ + struct call_data_struct data; /* on our stack; published to the other CPUs through call_data */ + __u32 mask = cpus_addr(cpu_online_map)[0]; + + mask &= ~(1<<smp_processor_id()); + + if (!mask) + return 0; /* no other CPU is online, nothing to call */ + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + data.started = mask; + data.wait = wait; + if (wait) + data.finished = mask; + + spin_lock(&call_lock); + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_CPI_allbutself(VIC_CALL_FUNCTION_CPI); + + /* Wait for response: every CPU clears its bit in started once it + * has taken its copy of func and info */ + while (data.started) + barrier(); + + if (wait) + while (data.finished) + barrier(); + + spin_unlock(&call_lock); + + return 0; /* this implementation never reports an error */ +} + +/* Sorry about the name. In an APIC based system, the APICs + * themselves are programmed to send a timer interrupt. This is used + * by linux to reschedule the processor. Voyager doesn't have this, + * so we use the system clock to interrupt one processor, which in + * turn, broadcasts a timer CPI to all the others --- we receive that + * CPI here. We don't use this actually for counting so losing + * ticks doesn't matter + * + * FIXME: For those CPU's which actually have a local APIC, we could + * try to use it to trigger this interrupt instead of having to + * broadcast the timer tick. 
Unfortunately, all my pentium DYADs have + * no local APIC, so I can't do this + * + * This function is currently a placeholder and is unused in the code */ +fastcall void +smp_apic_timer_interrupt(struct pt_regs *regs) +{ + wrapper_smp_local_timer_interrupt(regs); +} + +/* All of the QUAD interrupt GATES: each one acknowledges its own CPI + * in the QIC and then runs the handler shared with the VIC path */ +fastcall void +smp_qic_timer_interrupt(struct pt_regs *regs) +{ + ack_QIC_CPI(QIC_TIMER_CPI); + wrapper_smp_local_timer_interrupt(regs); +} + +fastcall void +smp_qic_invalidate_interrupt(struct pt_regs *regs) +{ + ack_QIC_CPI(QIC_INVALIDATE_CPI); + smp_invalidate_interrupt(); +} + +fastcall void +smp_qic_reschedule_interrupt(struct pt_regs *regs) +{ + ack_QIC_CPI(QIC_RESCHEDULE_CPI); + smp_reschedule_interrupt(); +} + +fastcall void +smp_qic_enable_irq_interrupt(struct pt_regs *regs) +{ + ack_QIC_CPI(QIC_ENABLE_IRQ_CPI); + smp_enable_irq_interrupt(); +} + +fastcall void +smp_qic_call_function_interrupt(struct pt_regs *regs) +{ + ack_QIC_CPI(QIC_CALL_FUNCTION_CPI); + smp_call_function_interrupt(); +} +/* Gate for the level 0 VIC CPI: acknowledge it and then run the + * handler of every CPI whose bit is set in this cpu's + * vic_cpi_mailbox[] entry, clearing each bit as it is taken */ +fastcall void +smp_vic_cpi_interrupt(struct pt_regs *regs) +{ + __u8 cpu = smp_processor_id(); + + if(is_cpu_quad()) + ack_QIC_CPI(VIC_CPI_LEVEL0); + else + ack_VIC_CPI(VIC_CPI_LEVEL0); + + if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) + wrapper_smp_local_timer_interrupt(regs); + if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) + smp_invalidate_interrupt(); + if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) + smp_reschedule_interrupt(); + if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) + smp_enable_irq_interrupt(); + if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) + smp_call_function_interrupt(); +} +/* per-cpu worker for flush_tlb_all(): flush the whole TLB and, if this + * cpu's TLB state is TLBSTATE_LAZY, call leave_mm() as well */ +static void +do_flush_tlb_all(void* info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + leave_mm(cpu); +} + + +/* flush the TLB of every active CPU in the system */ +void +flush_tlb_all(void) +{ + 
on_each_cpu(do_flush_tlb_all, 0, 1, 1); +} + +/* used to set up the trampoline for other CPUs when the memory manager + * is sorted out: allocates one low boot-memory page, which must lie + * below physical address 0x93000 (BUG() otherwise) */ +void __init +smp_alloc_memory(void) +{ + trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE); + if(__pa(trampoline_base) >= 0x93000) + BUG(); +} + +/* send a reschedule CPI to one CPU by physical CPU number */ +void +smp_send_reschedule(int cpu) +{ + send_one_CPI(cpu, VIC_RESCHEDULE_CPI); +} + +/* Return the id the VIC reports for the calling cpu: a value with + * all the QUAD_IDENTIFIER bits set carries the id in its low 5 bits, + * any other value is treated as a bit mask whose lowest set bit + * (0-7) is the id. Returns 0 after a warning if no bit is set. + * NOTE(review): that warning printk has no trailing newline */ +int +hard_smp_processor_id(void) +{ + __u8 i; + __u8 cpumask = inb(VIC_PROC_WHO_AM_I); + if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) + return cpumask & 0x1F; + + for(i = 0; i < 8; i++) { + if(cpumask & (1<<i)) + return i; + } + printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask); + return 0; +} + +/* broadcast a halt to all other CPUs */ +void +smp_send_stop(void) +{ + smp_call_function(smp_stop_cpu_function, NULL, 1, 1); +} + +/* this function is triggered in time.c when a clock tick fires; + * we need to re-broadcast the tick to all CPUs */ +void +smp_vic_timer_interrupt(struct pt_regs *regs) +{ + send_CPI_allbutself(VIC_TIMER_CPI); + smp_local_timer_interrupt(regs); +} +/* run the local timer handler between irq_enter() and irq_exit(); + * this is the form the timer CPI gates call */ +static inline void +wrapper_smp_local_timer_interrupt(struct pt_regs *regs) +{ + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); +} + +/* local (per CPU) timer interrupt. It does both profiling and + * process statistics/rescheduling. + * + * We do profiling in every local tick, statistics/rescheduling + * happen only every 'profiling multiplier' ticks. The default + * multiplier is 1 and it can be changed by writing the new multiplier + * value into /proc/profile. + */ +void +smp_local_timer_interrupt(struct pt_regs * regs) +{ + int cpu = smp_processor_id(); + long weight; + + profile_tick(CPU_PROFILING, regs); + if (--per_cpu(prof_counter, cpu) <= 0) { + /* + * The multiplier may have changed since the last time we got + * to this point as a result of the user writing to + * /proc/profile. 
In this case we need to adjust the APIC + * timer accordingly. + * + * Interrupts are already masked off at this point. + */ + per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu); + if (per_cpu(prof_counter, cpu) != + per_cpu(prof_old_multiplier, cpu)) { + /* FIXME: need to update the vic timer tick here */ + per_cpu(prof_old_multiplier, cpu) = + per_cpu(prof_counter, cpu); + } + + update_process_times(user_mode(regs)); + } + + if( ((1<<cpu) & voyager_extended_vic_processors) == 0) + /* only extended VIC processors participate in + * interrupt distribution */ + return; + + /* + * We take the 'long' return path, and there every subsystem + * grabs the appropriate locks (kernel lock/ irq lock). + * + * we might want to decouple profiling from the 'long path', + * and do the profiling totally in assembly. + * + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ + + if((++vic_tick[cpu] & 0x7) != 0) + return; + /* get here every 8 ticks: the mask above is 0x7. NOTE(review): + * this comment used to say "every 16 ticks (about every 1/6 of a + * second)", which would need a mask of 0xF -- confirm which of + * the two was intended */ + + /* Change our priority to give someone else a chance at getting + * the IRQ. The algorithm goes like this: + * + * In the VIC, the dynamically routed interrupt is always + * handled by the lowest priority eligible (i.e. receiving + * interrupts) CPU. If >1 eligible CPUs are equal lowest, the + * lowest processor number gets it. + * + * The priority of a CPU is controlled by a special per-CPU + * VIC priority register which is 3 bits wide 0 being lowest + * and 7 highest priority.. + * + * Therefore we subtract the average number of interrupts from + * the number we've fielded. If this number is negative, we + * lower the activity count and if it is positive, we raise + * it. + * + * I'm afraid this still leads to odd looking interrupt counts: + * the totals are all roughly equal, but the individual ones + * look rather skewed. 
+ * + * FIXME: This algorithm is total crap when mixed with SMP + * affinity code since we now try to even up the interrupt + * counts when an affinity binding is keeping them on a + * particular CPU*/ + weight = (vic_intr_count[cpu]*voyager_extended_cpus + - vic_intr_total) >> 4; /* (our count - average count) * number of extended cpus, shifted down by 4 */ + weight += 4; /* offset so that an exactly average cpu lands on priority 4 */ + if(weight > 7) + weight = 7; + if(weight < 0) + weight = 0; + + outb((__u8)weight, VIC_PRIORITY_REGISTER); + +#ifdef VOYAGER_DEBUG + if((vic_tick[cpu] & 0xFFF) == 0) { + /* print this message once every 4096 ticks (originally + * described as roughly every 25 secs) */ + printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n", + cpu, vic_tick[cpu], weight); + } +#endif +} + +/* setup the profiling timer: store the new multiplier for every CPU. + * Returns -EINVAL if multiplier is 0, otherwise 0 */ +int +setup_profiling_timer(unsigned int multiplier) +{ + int i; + + if ( (!multiplier)) + return -EINVAL; + + /* + * Set the new multiplier for each CPU. CPUs don't start using the + * new values until the next timer interrupt in which they do process + * accounting. + */ + for (i = 0; i < NR_CPUS; ++i) + per_cpu(prof_multiplier, i) = multiplier; + + return 0; +} + + +/* The CPIs are handled in the per cpu 8259s, so they must be + * enabled to be received: FIX: enabling the CPIs in the early + * boot sequence interferes with bug checking; enable them later + * on in smp_init + * + * VIC_SET_GATE and QIC_SET_GATE install an interrupt gate for a CPI at + * its offset from the VIC or QIC default CPI base respectively */ +#define VIC_SET_GATE(cpi, vector) \ + set_intr_gate((cpi) + VIC_DEFAULT_CPI_BASE, (vector)) +#define QIC_SET_GATE(cpi, vector) \ + set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector)) +/* mark every irq as masked in the per cpu masks, install the gates for + * the VIC level 0 CPI, SYS_INT, CMN_INT and the QIC CPIs, and point + * the first 48 irq descriptors at vic_irq_type */ +void __init +smp_intr_init(void) +{ + int i; + + /* initialize the per cpu irq mask to all disabled */ + for(i = 0; i < NR_CPUS; i++) + vic_irq_mask[i] = 0xFFFF; + + VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); + + VIC_SET_GATE(VIC_SYS_INT, vic_sys_interrupt); + VIC_SET_GATE(VIC_CMN_INT, vic_cmn_interrupt); + + QIC_SET_GATE(QIC_TIMER_CPI, qic_timer_interrupt); + QIC_SET_GATE(QIC_INVALIDATE_CPI, qic_invalidate_interrupt); + QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt); + QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt); + 
QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt); + + + /* now put the VIC descriptor into the first 48 IRQs + * + * This is for later: first 16 correspond to PC IRQs; next 16 + * are Primary MC IRQs and final 16 are Secondary MC IRQs */ + for(i = 0; i < 48; i++) + irq_desc[i].handler = &vic_irq_type; +} + +/* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per + * processor to receive CPI). + * + * A cpi below VIC_START_FAKE_CPI is written straight to its own VIC + * CPI register. Any other cpi goes through the QIC for the quad + * processors in cpuset; for the remaining ones (first 8 cpus only) it + * is recorded in vic_cpi_mailbox[] and signalled with a level 0 VIC + * CPI */ +static void +send_CPI(__u32 cpuset, __u8 cpi) +{ + int cpu; + __u32 quad_cpuset = (cpuset & voyager_quad_processors); + + if(cpi < VIC_START_FAKE_CPI) { + /* fake CPI are only used for booting, so send to the + * extended quads as well---Quads must be VIC booted */ + outb((__u8)(cpuset), VIC_CPI_Registers[cpi]); + return; + } + if(quad_cpuset) + send_QIC_CPI(quad_cpuset, cpi); + cpuset &= ~quad_cpuset; + cpuset &= 0xff; /* only first 8 CPUs valid for VIC CPI */ + if(cpuset == 0) + return; + for_each_online_cpu(cpu) { + if(cpuset & (1<<cpu)) + set_bit(cpi, &vic_cpi_mailbox[cpu]); + } + if(cpuset) + outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); +} + +/* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and + * set the cache line to shared by reading it. 
+ * + * DON'T make this inline otherwise the cache line read will be + * optimised away + * */ +static int +ack_QIC_CPI(__u8 cpi) { + __u8 cpu = hard_smp_processor_id(); + + cpi &= 7; + + outb(1<<cpi, QIC_INTERRUPT_CLEAR1); + return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi; +} +/* Acknowledge a CMN_INT or SYS_INT: clear the matching bit in the QIC + * (any other cpi value skips that step) and then EOI at the VIC too */ +static void +ack_special_QIC_CPI(__u8 cpi) +{ + switch(cpi) { + case VIC_CMN_INT: + outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0); + break; + case VIC_SYS_INT: + outb(QIC_SYS_INT, QIC_INTERRUPT_CLEAR0); + break; + } + /* also clear at the VIC, just in case (nop for non-extended proc) */ + ack_VIC_CPI(cpi); +} + +/* Acknowledge receipt of CPI in the VIC (essentially an EOI). With + * VOYAGER_DEBUG defined the ISR is also read before and after the EOI + * and a message is printed if the CPI was not in service beforehand + * or is still in service afterwards */ +static void +ack_VIC_CPI(__u8 cpi) +{ +#ifdef VOYAGER_DEBUG + unsigned long flags; + __u16 isr; + __u8 cpu = smp_processor_id(); + + local_irq_save(flags); + isr = vic_read_isr(); + if((isr & (1<<(cpi &7))) == 0) { + printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi); + } +#endif + /* send specific EOI; the two system interrupts have + * bit 4 set for a separate vector but behave as the + * corresponding 3 bit intr */ + outb_p(0x60|(cpi & 7),0x20); + +#ifdef VOYAGER_DEBUG + if((vic_read_isr() & (1<<(cpi &7))) != 0) { + printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi); + } + local_irq_restore(flags); +#endif +} + +/* cribbed with thanks from irq.c: cached_21() and cached_A1() are + * bytes 0 and 1 of a cpu's vic_irq_mask[] entry, i.e. the values that + * get written to ports 0x21 and 0xA1 */ +#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define cached_21(cpu) (__byte(0,vic_irq_mask[cpu])) +#define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu])) +/* startup hook: simply enables the irq; always returns 0 */ +static unsigned int +startup_vic_irq(unsigned int irq) +{ + enable_vic_irq(irq); + + return 0; +} + +/* The enable and disable routines. This is where we run into + * conflicting architectural philosophy. Fundamentally, the voyager + * architecture does not expect to have to disable interrupts globally + * (the IRQ controllers belong to each CPU). 
The processor masquerade + * which is used to start the system shouldn't be used in a running OS + * since it will cause great confusion if two separate CPUs drive to + * the same IRQ controller (I know, I've tried it). + * + * The solution is a variant on the NCR lazy SPL design: + * + * 1) To disable an interrupt, do nothing (other than set the + * IRQ_DISABLED flag). This dares the interrupt actually to arrive. + * + * 2) If the interrupt dares to come in, raise the local mask against + * it (this will result in all the CPU masks being raised + * eventually). + * + * 3) To enable the interrupt, lower the mask on the local CPU and + * broadcast an Interrupt enable CPI which causes all other CPUs to + * adjust their masks accordingly. */ + +static void +enable_vic_irq(unsigned int irq) +{ + /* enable on every online extended CPU whose affinity mask + * includes this irq: directly on the local CPU, and through an + * enable irq CPI on each other CPU that currently has it masked */ + int cpu = smp_processor_id(), real_cpu; + __u16 mask = (1<<irq); /* NOTE(review): only 16 bits wide, so this is 0 for irq >= 16 -- confirm that is intended */ + __u32 processorList = 0; + unsigned long flags; + + VDEBUG(("VOYAGER: enable_vic_irq(%d) CPU%d affinity 0x%lx\n", + irq, cpu, cpu_irq_affinity[cpu])); + spin_lock_irqsave(&vic_irq_lock, flags); + for_each_online_cpu(real_cpu) { + if(!(voyager_extended_vic_processors & (1<<real_cpu))) + continue; + if(!(cpu_irq_affinity[real_cpu] & mask)) { + /* irq has no affinity for this CPU, ignore */ + continue; + } + if(real_cpu == cpu) { + enable_local_vic_irq(irq); + } + else if(vic_irq_mask[real_cpu] & mask) { + vic_irq_enable_mask[real_cpu] |= mask; + processorList |= (1<<real_cpu); + } + } + spin_unlock_irqrestore(&vic_irq_lock, flags); + if(processorList) + send_CPI(processorList, VIC_ENABLE_IRQ_CPI); +} + +static void +disable_vic_irq(unsigned int irq) +{ + /* lazy disable, do nothing */ +} +/* Clear irq's bit in this cpu's cached mask and, if that changed the + * mask, write the affected byte out to port 0xA1 or 0x21 */ +static void +enable_local_vic_irq(unsigned int irq) +{ + __u8 cpu = smp_processor_id(); + __u16 mask = ~(1 << irq); + __u16 old_mask = vic_irq_mask[cpu]; + + vic_irq_mask[cpu] &= mask; + if(vic_irq_mask[cpu] == old_mask) + return; + + 
VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n", + irq, cpu)); + + if (irq & 8) { + outb_p(cached_A1(cpu),0xA1); + (void)inb_p(0xA1); + } + else { + outb_p(cached_21(cpu),0x21); + (void)inb_p(0x21); + } +} +/* Set irq's bit in this cpu's cached mask and, if that changed the + * mask, write the affected byte out to port 0xA1 or 0x21. A request + * to mask irq 7 is ignored */ +static void +disable_local_vic_irq(unsigned int irq) +{ + __u8 cpu = smp_processor_id(); + __u16 mask = (1 << irq); + __u16 old_mask = vic_irq_mask[cpu]; + + if(irq == 7) + return; /* irq 7 is the one vic_enable_cpi() enables for sys int and cmn int */ + + vic_irq_mask[cpu] |= mask; + if(old_mask == vic_irq_mask[cpu]) + return; + + VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n", + irq, cpu)); + + if (irq & 8) { + outb_p(cached_A1(cpu),0xA1); + (void)inb_p(0xA1); + } + else { + outb_p(cached_21(cpu),0x21); + (void)inb_p(0x21); + } +} + +/* The VIC is level triggered, so the ack can only be issued after the + * interrupt completes. However, we do Voyager lazy interrupt + * handling here: It is an extremely expensive operation to mask an + * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If + * this interrupt actually comes in, then we mask and ack here to push + * the interrupt off to another CPU. + * + * This routine also keeps the interrupt counts (vic_intr_total and + * vic_intr_count[]) that the timer tick uses to set the VIC priority */ +static void +before_handle_vic_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + __u8 cpu = smp_processor_id(); + + _raw_spin_lock(&vic_irq_lock); + vic_intr_total++; + vic_intr_count[cpu]++; + + if(!(cpu_irq_affinity[cpu] & (1<<irq))) { + /* The irq is not in our affinity mask, push it off + * onto another CPU */ + VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n", + irq, cpu)); + disable_local_vic_irq(irq); + /* set IRQ_INPROGRESS to prevent the handler in irq.c from + * actually calling the interrupt routine */ + desc->status |= IRQ_REPLAY | IRQ_INPROGRESS; + } else if(desc->status & IRQ_DISABLED) { + /* Damn, the interrupt actually arrived, do the lazy + * disable thing. 
The interrupt routine in irq.c will + * not handle a IRQ_DISABLED interrupt, so nothing more + * need be done here */ + VDEBUG(("VOYAGER DEBUG: lazy disable of irq %d on CPU %d\n", + irq, cpu)); + disable_local_vic_irq(irq); + desc->status |= IRQ_REPLAY; + } else { + desc->status &= ~IRQ_REPLAY; + } + + _raw_spin_unlock(&vic_irq_lock); +} + +/* Finish the VIC interrupt: clear IRQ_INPROGRESS, mask the irq locally + * if it is marked IRQ_DISABLED, then ack it */ +static void +after_handle_vic_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + + _raw_spin_lock(&vic_irq_lock); + { + unsigned int status = desc->status & ~IRQ_INPROGRESS; +#ifdef VOYAGER_DEBUG + __u16 isr; +#endif + + desc->status = status; + if ((status & IRQ_DISABLED)) + disable_local_vic_irq(irq); +#ifdef VOYAGER_DEBUG + /* DEBUG: before we ack, check what's in progress. + * NOTE(review): && binds tighter than &, so the test below + * is isr & ((1<<irq) && !(status & IRQ_REPLAY)), i.e. it + * looks at bit 0 of isr only. It reads as if + * ((isr & (1<<irq)) == 0 && !(status & IRQ_REPLAY)) was + * meant -- confirm. The local i is also never used */ + isr = vic_read_isr(); + if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) { + int i; + __u8 cpu = smp_processor_id(); + __u8 real_cpu; + int mask; /* Um... initialize me??? --RR */ + + printk("VOYAGER SMP: CPU%d lost interrupt %d\n", + cpu, irq); + for_each_cpu(real_cpu, mask) { + + outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu, + VIC_PROCESSOR_ID); + isr = vic_read_isr(); + if(isr & (1<<irq)) { + printk("VOYAGER SMP: CPU%d ack irq %d\n", + real_cpu, irq); + ack_vic_irq(irq); + } + outb(cpu, VIC_PROCESSOR_ID); + } + } +#endif /* VOYAGER_DEBUG */ + /* as soon as we ack, the interrupt is eligible for + * receipt by another CPU so everything must be in + * order here */ + ack_vic_irq(irq); + if(status & IRQ_REPLAY) { + /* replay is set if we disable the interrupt + * in the before_handle_vic_irq() routine, so + * clear the in progress bit here to allow the + * next CPU to handle this correctly */ + desc->status &= ~(IRQ_REPLAY | IRQ_INPROGRESS); + } +#ifdef VOYAGER_DEBUG + isr = vic_read_isr(); + if((isr & (1<<irq)) != 0) + printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n", + irq, isr); +#endif /* VOYAGER_DEBUG */ + } + _raw_spin_unlock(&vic_irq_lock); + + /* All code 
after this point is out of the main path - the IRQ + * may be intercepted by another CPU if reasserted */ +} + + +/* Linux processor - interrupt affinity manipulations. + * + * For each processor, we maintain a 32 bit irq affinity mask. + * Initially it is set to all 1's so every processor accepts every + * interrupt. In this call, we change the processor's affinity mask: + * + * Change from enable to disable: + * + * If the interrupt ever comes in to the processor, we will disable it + * and ack it to push it off to another CPU, so just accept the mask here. + * + * Change from disable to enable: + * + * change the mask and then do an interrupt enable CPI to re-enable on + * the selected processors + * + * irq: the interrupt to retarget; requests for irq 0 and for + * irq >= 32 are ignored + * mask: the cpus that should accept irq. Only the extended VIC + * processors in it count; if none is left the request is + * ignored and the affinity masks stay as they are */ + +void +set_vic_irq_affinity(unsigned int irq, cpumask_t mask) +{ + /* Only extended processors handle interrupts */ + unsigned long real_mask; + unsigned long irq_mask; + int cpu; + + real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors; + + if(real_mask == 0) + /* can't have no cpu's to accept the interrupt -- extremely + * bad things will happen. The test is made on the mask + * after restricting it to the extended processors: a mask + * naming only non-extended cpus would otherwise remove + * the irq from every cpu's affinity in the loop below */ + return; + + if(irq == 0) + /* can't change the affinity of the timer IRQ. This + * is due to the constraint in the voyager + * architecture that the CPI also comes in on an IRQ + * line and we have chosen IRQ0 for this. If you + * raise the mask on this interrupt, the processor + * will no-longer be able to accept VIC CPIs */ + return; + + if(irq >= 32) + /* You can only have 32 interrupts in a voyager system + * (and 32 only if you have a secondary microchannel + * bus) */ + return; + + /* build the bit only now that irq is known to be below 32 */ + irq_mask = 1UL << irq; + + for_each_online_cpu(cpu) { + unsigned long cpu_mask = 1UL << cpu; + + if(cpu_mask & real_mask) { + /* enable the interrupt for this cpu */ + cpu_irq_affinity[cpu] |= irq_mask; + } else { + /* disable the interrupt for this cpu */ + cpu_irq_affinity[cpu] &= ~irq_mask; + } + } + /* this is magic, we now have the correct affinity maps, so + * enable the interrupt. 
This will send an enable CPI to + * those cpu's who need to enable it in their local masks, + * causing them to correct for the new affinity . If the + * interrupt is currently globally disabled, it will simply be + * disabled again as it comes in (voyager lazy disable). If + * the affinity map is tightened to disable the interrupt on a + * cpu, it will be pushed off when it comes in */ + enable_vic_irq(irq); +} +/* Send a specific EOI for irq: an irq with bit 3 set gets the EOI for + * the cascade on port 0x20 followed by its own EOI on port 0xA0, any + * other irq gets its EOI on port 0x20 */ +static void +ack_vic_irq(unsigned int irq) +{ + if (irq & 8) { + outb(0x62,0x20); /* Specific EOI to cascade */ + outb(0x60|(irq & 7),0xA0); + } else { + outb(0x60 | (irq & 7),0x20); + } +} + +/* enable the CPIs. In the VIC, the CPIs are delivered by the 8259 + * but are not vectored by it. This means that the 8259 mask must be + * lowered to receive them. On a quad cpu the QIC mask registers are + * programmed as well */ +static __init void +vic_enable_cpi(void) +{ + __u8 cpu = smp_processor_id(); + + /* just take a copy of the current mask (nop for boot cpu) */ + vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id]; + + enable_local_vic_irq(VIC_CPI_LEVEL0); + enable_local_vic_irq(VIC_CPI_LEVEL1); + /* for sys int and cmn int */ + enable_local_vic_irq(7); + + if(is_cpu_quad()) { + outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); + outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); + VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n", + cpu, QIC_CPI_ENABLE)); + } + + VDEBUG(("VOYAGER SMP: ENABLE CPI: CPU%d: MASK 0x%x\n", + cpu, vic_irq_mask[cpu])); +} +/* Debug aid: for every online cpu, masquerade as that cpu at the VIC + * and print its cached irq mask next to the IMR, IRR and ISR values + * read back through ports 0x21/0xa1 and 0x20/0xa0 */ +void +voyager_smp_dump() +{ + int old_cpu = smp_processor_id(), cpu; + + /* dump the interrupt masks of each processor */ + for_each_online_cpu(cpu) { + __u16 imr, isr, irr; + unsigned long flags; + + local_irq_save(flags); + outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); + imr = (inb(0xa1) << 8) | inb(0x21); + outb(0x0a, 0xa0); + irr = inb(0xa0) << 8; + outb(0x0a, 0x20); + irr |= inb(0x20); + outb(0x0b, 0xa0); + isr = inb(0xa0) << 8; + outb(0x0b, 0x20); + isr |= inb(0x20); + outb(old_cpu, VIC_PROCESSOR_ID); + local_irq_restore(flags); + printk("\tCPU%d: mask=0x%x, 
IMR=0x%x, IRR=0x%x, ISR=0x%x\n", + cpu, vic_irq_mask[cpu], imr, irr, isr); +#if 0 + /* Disabled debugging aid: ack every IRQ still set in the ISR to try to unstick an un-acked irq */ + if(isr != 0) { + int irq; + for(irq=0; irq<16; irq++) { + if(isr & (1<<irq)) { + printk("\tCPU%d: ack irq %d\n", + cpu, irq); + local_irq_save(flags); + outb(VIC_CPU_MASQUERADE_ENABLE | cpu, + VIC_PROCESSOR_ID); + ack_vic_irq(irq); + outb(old_cpu, VIC_PROCESSOR_ID); + local_irq_restore(flags); + } + } + } +#endif + } +} + +void +smp_voyager_power_off(void *dummy) +{ + if(smp_processor_id() == boot_cpu_id) + voyager_power_off(); + else + smp_stop_cpu_function(NULL); +} + +void __init +smp_prepare_cpus(unsigned int max_cpus) +{ + /* FIXME: ignore max_cpus for now */ + smp_boot_cpus(); +} + +void __devinit smp_prepare_boot_cpu(void) +{ + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), cpu_callout_map); +} + +int __devinit +__cpu_up(unsigned int cpu) +{ + /* This only works at boot for x86. See "rewrite" above. */ + if (cpu_isset(cpu, smp_commenced_mask)) + return -ENOSYS; + + /* In case one didn't come up: fail with -EIO if it is not in cpu_callin_map */ + if (!cpu_isset(cpu, cpu_callin_map)) + return -EIO; + /* Unleash the CPU, then spin until it shows up in cpu_online_map */ + cpu_set(cpu, smp_commenced_mask); + while (!cpu_isset(cpu, cpu_online_map)) + mb(); + return 0; +} + +void __init +smp_cpus_done(unsigned int max_cpus) +{ + zap_low_mappings(); +} diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c new file mode 100644 index 000000000000..9980eef31fda --- /dev/null +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -0,0 +1,167 @@ +/* -*- mode: c; c-basic-offset: 8 -*- */ + +/* Copyright (C) 2001 + * + * Author: J.E.J.Bottomley@HansenPartnership.com + * + * linux/arch/i386/kernel/voyager_thread.c + * + * This module provides the machine status monitor thread for the + * voyager architecture. This allows us to monitor the machine + * environment (temp, voltage, fan function) and the front panel and + * internal UPS. 
If a fault is detected, this thread takes corrective + * action (usually just informing init) + * */ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/kernel_stat.h> +#include <linux/delay.h> +#include <linux/mc146818rtc.h> +#include <linux/smp_lock.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/kmod.h> +#include <linux/completion.h> +#include <linux/sched.h> +#include <asm/desc.h> +#include <asm/voyager.h> +#include <asm/vic.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + +#include <linux/irq.h> + +#define THREAD_NAME "kvoyagerd" + +/* external variables */ +int kvoyagerd_running = 0; +DECLARE_MUTEX_LOCKED(kvoyagerd_sem); + +static int thread(void *); + +static __u8 set_timeout = 0; + +/* Start the machine monitor thread. Return 0 if OK, 1 if the thread could not be created */ +static int __init +voyager_thread_start(void) +{ + if(kernel_thread(thread, NULL, CLONE_KERNEL) < 0) { + /* This is serious, but not fatal */ + printk(KERN_ERR "Voyager: Failed to create system monitor thread!!!\n"); + return 1; + } + return 0; +} + +static int +execute(const char *string) +{ + int ret; + + char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL, + }; + char *argv[] = { + "/bin/bash", + "-c", + (char *)string, + NULL, + }; + + if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", + string, ret); + } + return ret; +} + +static void +check_from_kernel(void) +{ + if(voyager_status.switch_off) { + + /* FIXME: This should be configurable via proc */ + execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1"); + } else if(voyager_status.power_fail) { + VDEBUG(("Voyager daemon detected AC power failure\n")); + + /* FIXME: This should be configurable via proc */ + execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1"); + set_timeout = 1; + } +} + +static void +check_continuing_condition(void) +{ + 
if(voyager_status.power_fail) { + __u8 data; + voyager_cat_psi(VOYAGER_PSI_SUBREAD, + VOYAGER_PSI_AC_FAIL_REG, &data); + if((data & 0x1f) == 0) { + /* all power restored */ + printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n"); + /* FIXME: should be user configurable */ + execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); + set_timeout = 0; + } + } +} + +static void +wakeup(unsigned long unused) +{ + up(&kvoyagerd_sem); +} + +static int +thread(void *unused) +{ + struct timer_list wakeup_timer; + + kvoyagerd_running = 1; + + reparent_to_init(); + daemonize(THREAD_NAME); + + set_timeout = 0; + + init_timer(&wakeup_timer); + + sigfillset(&current->blocked); + current->signal->tty = NULL; + + printk(KERN_NOTICE "Voyager starting monitor thread\n"); + + for(;;) { + down_interruptible(&kvoyagerd_sem); + VDEBUG(("Voyager Daemon awoken\n")); + if(voyager_status.request_from_kernel == 0) { + /* probably awoken from timeout */ + check_continuing_condition(); + } else { + check_from_kernel(); + voyager_status.request_from_kernel = 0; + } + if(set_timeout) { + del_timer(&wakeup_timer); + wakeup_timer.expires = HZ + jiffies; + wakeup_timer.function = wakeup; + add_timer(&wakeup_timer); + } + } +} + +static void __exit +voyager_thread_stop(void) +{ + /* FIXME: do nothing at the moment */ +} + +module_init(voyager_thread_start); +//module_exit(voyager_thread_stop); diff --git a/arch/i386/math-emu/Makefile b/arch/i386/math-emu/Makefile new file mode 100644 index 000000000000..9c943fa6ce6b --- /dev/null +++ b/arch/i386/math-emu/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for wm-FPU-emu +# + +#DEBUG = -DDEBUGGING +DEBUG = +PARANOID = -DPARANOID +CFLAGS := $(CFLAGS) $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) + +EXTRA_AFLAGS := $(PARANOID) + +# From 'C' language sources: +C_OBJS =fpu_entry.o errors.o \ + fpu_arith.o fpu_aux.o fpu_etc.o fpu_tags.o fpu_trig.o \ + load_store.o get_address.o \ + poly_atan.o poly_l2.o poly_2xm1.o poly_sin.o poly_tan.o \ + 
reg_add_sub.o reg_compare.o reg_constant.o reg_convert.o \ + reg_ld_str.o reg_divide.o reg_mul.o + +# From 80x86 assembler sources: +A_OBJS =reg_u_add.o reg_u_div.o reg_u_mul.o reg_u_sub.o \ + div_small.o reg_norm.o reg_round.o \ + wm_shrx.o wm_sqrt.o \ + div_Xsig.o polynom_Xsig.o round_Xsig.o \ + shr_Xsig.o mul_Xsig.o + +obj-y =$(C_OBJS) $(A_OBJS) + +proto: + cproto -e -DMAKING_PROTO *.c >fpu_proto.h diff --git a/arch/i386/math-emu/README b/arch/i386/math-emu/README new file mode 100644 index 000000000000..e6235491d6eb --- /dev/null +++ b/arch/i386/math-emu/README @@ -0,0 +1,427 @@ + +---------------------------------------------------------------------------+ + | wm-FPU-emu an FPU emulator for 80386 and 80486SX microprocessors. | + | | + | Copyright (C) 1992,1993,1994,1995,1996,1997,1999 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@melbpc.org.au | + | | + | This program is free software; you can redistribute it and/or modify | + | it under the terms of the GNU General Public License version 2 as | + | published by the Free Software Foundation. | + | | + | This program is distributed in the hope that it will be useful, | + | but WITHOUT ANY WARRANTY; without even the implied warranty of | + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | + | GNU General Public License for more details. | + | | + | You should have received a copy of the GNU General Public License | + | along with this program; if not, write to the Free Software | + | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | + | | + +---------------------------------------------------------------------------+ + + + +wm-FPU-emu is an FPU emulator for Linux. It is derived from wm-emu387 +which was my 80387 emulator for early versions of djgpp (gcc under +msdos); wm-emu387 was in turn based upon emu387 which was written by +DJ Delorie for djgpp. The interface to the Linux kernel is based upon +the original Linux math emulator by Linus Torvalds. 
+ +My target FPU for wm-FPU-emu is that described in the Intel486 +Programmer's Reference Manual (1992 edition). Unfortunately, numerous +facets of the functioning of the FPU are not well covered in the +Reference Manual. The information in the manual has been supplemented +with measurements on real 80486's. Unfortunately, it is simply not +possible to be sure that all of the peculiarities of the 80486 have +been discovered, so there are always likely to be obscure differences +in the detailed behaviour of the emulator and a real 80486. + +wm-FPU-emu does not implement all of the behaviour of the 80486 FPU, +but is very close. See "Limitations" later in this file for a list of +some differences. + +Please report bugs, etc to me at: + billm@melbpc.org.au +or b.metzenthen@medoto.unimelb.edu.au + +For more information on the emulator and on floating point topics, see +my web pages, currently at http://www.suburbia.net/~billm/ + + +--Bill Metzenthen + December 1999 + + +----------------------- Internals of wm-FPU-emu ----------------------- + +Numeric algorithms: +(1) Add, subtract, and multiply. Nothing remarkable in these. +(2) Divide has been tuned to get reasonable performance. The algorithm + is not the obvious one which most people seem to use, but is designed + to take advantage of the characteristics of the 80386. I expect that + it has been invented many times before I discovered it, but I have not + seen it. It is based upon one of those ideas which one carries around + for years without ever bothering to check it out. +(3) The sqrt function has been tuned to get good performance. It is based + upon Newton's classic method. Performance was improved by capitalizing + upon the properties of Newton's method, and the code is once again + structured taking account of the 80386 characteristics. +(4) The trig, log, and exp functions are based in each case upon quasi- + "optimal" polynomial approximations. 
My definition of "optimal" was + based upon getting good accuracy with reasonable speed. +(5) The argument reducing code for the trig function effectively uses + a value of pi which is accurate to more than 128 bits. As a consequence, + the reduced argument is accurate to more than 64 bits for arguments up + to a few pi, and accurate to more than 64 bits for most arguments, + even for arguments approaching 2^63. This is far superior to an + 80486, which uses a value of pi which is accurate to 66 bits. + +The code of the emulator is complicated slightly by the need to +account for a limited form of re-entrancy. Normally, the emulator will +emulate each FPU instruction to completion without interruption. +However, it may happen that when the emulator is accessing the user +memory space, swapping may be needed. In this case the emulator may be +temporarily suspended while disk i/o takes place. During this time +another process may use the emulator, thereby perhaps changing static +variables. The code which accesses user memory is confined to five +files: + fpu_entry.c + reg_ld_str.c + load_store.c + get_address.c + errors.c +As from version 1.12 of the emulator, no static variables are used +(apart from those in the kernel's per-process tables). The emulator is +therefore now fully re-entrant, rather than having just the restricted +form of re-entrancy which is required by the Linux kernel. + +----------------------- Limitations of wm-FPU-emu ----------------------- + +There are a number of differences between the current wm-FPU-emu +(version 2.01) and the 80486 FPU (apart from bugs). The differences +are fewer than those which applied to the 1.xx series of the emulator. +Some of the more important differences are listed below: + +The Roundup flag does not have much meaning for the transcendental +functions and its 80486 value with these functions is likely to differ +from its emulator value. 
+ +In a few rare cases the Underflow flag obtained with the emulator will +be different from that obtained with an 80486. This occurs when the +following conditions apply simultaneously: +(a) the operands have a higher precision than the current setting of the + precision control (PC) flags. +(b) the underflow exception is masked. +(c) the magnitude of the exact result (before rounding) is less than 2^-16382. +(d) the magnitude of the final result (after rounding) is exactly 2^-16382. +(e) the magnitude of the exact result would be exactly 2^-16382 if the + operands were rounded to the current precision before the arithmetic + operation was performed. +If all of these apply, the emulator will set the Underflow flag but a real +80486 will not. + +NOTE: Certain formats of Extended Real are UNSUPPORTED. They are +unsupported by the 80486. They are the Pseudo-NaNs, Pseudoinfinities, +and Unnormals. None of these will be generated by an 80486 or by the +emulator. Do not use them. The emulator treats them differently in +detail from the way an 80486 does. + +Self modifying code can cause the emulator to fail. An example of such +code is: + movl %esp,[%ebx] + fld1 +The FPU instruction may be (usually will be) loaded into the pre-fetch +queue of the CPU before the mov instruction is executed. If the +destination of the 'movl' overlaps the FPU instruction then the bytes +in the prefetch queue and memory will be inconsistent when the FPU +instruction is executed. The emulator will be invoked but will not be +able to find the instruction which caused the device-not-present +exception. For this case, the emulator cannot emulate the behaviour of +an 80486DX. + +Handling of the address size override prefix byte (0x67) has not been +extensively tested yet. A major problem exists because using it in +vm86 mode can cause a general protection fault. Address offsets +greater than 0xffff appear to be illegal in vm86 mode but are quite +acceptable (and work) in real mode. 
A small test program developed to +check the addressing, and which runs successfully in real mode, +crashes dosemu under Linux and also brings Windows down with a general +protection fault message when run under the MS-DOS prompt of Windows +3.1. (The program simply reads data from a valid address). + +The emulator supports 16-bit protected mode, with one difference from +an 80486DX. A 80486DX will allow some floating point instructions to +write a few bytes below the lowest address of the stack. The emulator +will not allow this in 16-bit protected mode: no instructions are +allowed to write outside the bounds set by the protection. + +----------------------- Performance of wm-FPU-emu ----------------------- + +Speed. +----- + +The speed of floating point computation with the emulator will depend +upon instruction mix. Relative performance is best for the instructions +which require most computation. The simple instructions are adversely +affected by the FPU instruction trap overhead. + + +Timing: Some simple timing tests have been made on the emulator functions. +The times include load/store instructions. All times are in microseconds +measured on a 33MHz 386 with 64k cache. The Turbo C tests were under +ms-dos, the next two columns are for emulators running with the djgpp +ms-dos extender. The final column is for wm-FPU-emu in Linux 0.97, +using libm4.0 (hard). + +function Turbo C djgpp 1.06 WM-emu387 wm-FPU-emu + + + 60.5 154.8 76.5 139.4 + - 61.1-65.5 157.3-160.8 76.2-79.5 142.9-144.7 + * 71.0 190.8 79.6 146.6 + / 61.2-75.0 261.4-266.9 75.3-91.6 142.2-158.1 + + sin() 310.8 4692.0 319.0 398.5 + cos() 284.4 4855.2 308.0 388.7 + tan() 495.0 8807.1 394.9 504.7 + atan() 328.9 4866.4 601.1 419.5-491.9 + + sqrt() 128.7 crashed 145.2 227.0 + log() 413.1-419.1 5103.4-5354.21 254.7-282.2 409.4-437.1 + exp() 479.1 6619.2 469.1 850.8 + + +The performance under Linux is improved by the use of look-ahead code. 
+The following results show the improvement which is obtained under +Linux due to the look-ahead code. Also given are the times for the +original Linux emulator with the 4.1 'soft' lib. + + [ Linus' note: I changed look-ahead to be the default under linux, as + there was no reason not to use it after I had edited it to be + disabled during tracing ] + + wm-FPU-emu w original w + look-ahead 'soft' lib + + 106.4 190.2 + - 108.6-111.6 192.4-216.2 + * 113.4 193.1 + / 108.8-124.4 700.1-706.2 + + sin() 390.5 2642.0 + cos() 381.5 2767.4 + tan() 496.5 3153.3 + atan() 367.2-435.5 2439.4-3396.8 + + sqrt() 195.1 4732.5 + log() 358.0-387.5 3359.2-3390.3 + exp() 619.3 4046.4 + + +These figures are now somewhat out-of-date. The emulator has become +progressively slower for most functions as more of the 80486 features +have been implemented. + + +----------------------- Accuracy of wm-FPU-emu ----------------------- + + +The accuracy of the emulator is in almost all cases equal to or better +than that of an Intel 80486 FPU. + +The results of the basic arithmetic functions (+,-,*,/), and fsqrt +match those of an 80486 FPU. They are the best possible; the error for +these never exceeds 1/2 an lsb. The fprem and fprem1 instructions +return exact results; they have no error. + + +The following table compares the emulator accuracy for the sqrt(), +trig and log functions against the Turbo C "emulator". For this table, +each function was tested at about 400 points. Ideal worst-case results +would be 64 bits. The reduced Turbo C accuracy of cos() and tan() for +arguments greater than pi/4 can be thought of as being related to the +precision of the argument x; e.g. an argument of pi/2-(1e-10) which is +accurate to 64 bits can result in a relative accuracy in cos() of +about 64 + log2(cos(x)) = 31 bits. + + +Function Tested x range Worst result Turbo C + (relative bits) + +sqrt(x) 1 .. 2 64.1 63.2 +atan(x) 1e-10 .. 200 64.2 62.8 +cos(x) 0 .. 
pi/2-(1e-10) 64.4 (x <= pi/4) 62.4 + 64.1 (x = pi/2-(1e-10)) 31.9 +sin(x) 1e-10 .. pi/2 64.0 62.8 +tan(x) 1e-10 .. pi/2-(1e-10) 64.0 (x <= pi/4) 62.1 + 64.1 (x = pi/2-(1e-10)) 31.9 +exp(x) 0 .. 1 63.1 ** 62.9 +log(x) 1+1e-6 .. 2 63.8 ** 62.1 + +** The accuracy for exp() and log() is low because the FPU (emulator) +does not compute them directly; two operations are required. + + +The emulator passes the "paranoia" tests (compiled with gcc 2.3.3 or +later) for 'float' variables (24 bit precision numbers) when precision +control is set to 24, 53 or 64 bits, and for 'double' variables (53 +bit precision numbers) when precision control is set to 53 bits (a +properly performing FPU cannot pass the 'paranoia' tests for 'double' +variables when precision control is set to 64 bits). + +The code for reducing the argument for the trig functions (fsin, fcos, +fptan and fsincos) has been improved and now effectively uses a value +for pi which is accurate to more than 128 bits precision. As a +consequence, the accuracy of these functions for large arguments has +been dramatically improved (and is now very much better than an 80486 +FPU). There is also now no degradation of accuracy for fcos and fptan +for operands close to pi/2. Measured results are (note that the +definition of accuracy has changed slightly from that used for the +above table): + +Function Tested x range Worst result + (absolute bits) + +cos(x) 0 .. 9.22e+18 62.0 +sin(x) 1e-16 .. 9.22e+18 62.1 +tan(x) 1e-16 .. 9.22e+18 61.8 + +It is possible with some effort to find very large arguments which +give much degraded precision. For example, the integer number + 8227740058411162616.0 +is within about 10e-7 of a multiple of pi. To find the tan (for +example) of this number to 64 bits precision it would be necessary to +have a value of pi which had about 150 bits precision. The FPU +emulator computes the result to about 42.6 bits precision (the correct +result is about -9.739715e-8). 
On the other hand, an 80486 FPU returns +0.01059, which in relative terms is hopelessly inaccurate. + +For arguments close to critical angles (which occur at multiples of +pi/2) the emulator is more accurate than an 80486 FPU. For very large +arguments, the emulator is far more accurate. + + +Prior to version 1.20 of the emulator, the accuracy of the results for +the transcendental functions (in their principal range) was not as +good as the results from an 80486 FPU. From version 1.20, the accuracy +has been considerably improved and these functions now give measured +worst-case results which are better than the worst-case results given +by an 80486 FPU. + +The following table gives the measured results for the emulator. The +number of randomly selected arguments in each case is about half a +million. The group of three columns gives the frequency of the given +accuracy in number of times per million, thus the second of these +columns shows that an accuracy of between 63.80 and 63.89 bits was +found at a rate of 133 times per one million measurements for fsin. +The results show that the fsin, fcos and fptan instructions return +results which are in error (i.e. less accurate than the best possible +result (which is 64 bits)) for about one per cent of all arguments +between -pi/2 and +pi/2. The other instructions have a lower +frequency of results which are in error. The last two columns give +the worst accuracy which was found (in bits) and the approximate value +of the argument which produced it. 
+ + frequency (per M) + ------------------- --------------- +instr arg range # tests 63.7 63.8 63.9 worst at arg + bits bits bits bits +----- ------------ ------- ---- ---- ----- ----- -------- +fsin (0,pi/2) 547756 0 133 10673 63.89 0.451317 +fcos (0,pi/2) 547563 0 126 10532 63.85 0.700801 +fptan (0,pi/2) 536274 11 267 10059 63.74 0.784876 +fpatan 4 quadrants 517087 0 8 1855 63.88 0.435121 (4q) +fyl2x (0,20) 541861 0 0 1323 63.94 1.40923 (x) +fyl2xp1 (-.293,.414) 520256 0 0 5678 63.93 0.408542 (x) +f2xm1 (-1,1) 538847 4 481 6488 63.79 0.167709 + + +Tests performed on an 80486 FPU showed results of lower accuracy. The +following table gives the results which were obtained with an AMD +486DX2/66 (other tests indicate that an Intel 486DX produces +identical results). The tests were basically the same as those used +to measure the emulator (the values, being random, were in general not +the same). The total number of tests for each instruction is given +at the end of the table; in each case about 100k tests were performed. +Another line of figures at the end of the table shows that most of the +instructions return results which are in error for more than 10 +percent of the arguments tested. + +The numbers in the body of the table give the approx number of times a +result of the given accuracy in bits (given in the left-most column) +was obtained per one million arguments. For three of the instructions, +two columns of results are given: * The second column for f2xm1 gives +the number of cases where the results of the first column were for a +positive argument; this shows that this instruction gives better +results for positive arguments than it does for negative. * In the +cases of fcos and fptan, the first column gives the results when all +cases where arguments were greater than 1.5 were removed from the results +given in the second column. 
Unlike the emulator, an 80486 FPU returns +results of relatively poor accuracy for these instructions when the +argument approaches pi/2. The table does not show those cases when the +accuracy of the results were less than 62 bits, which occurs quite +often for fsin and fptan when the argument approaches pi/2. This poor +accuracy is discussed above in relation to the Turbo C "emulator", and +the accuracy of the value of pi. + + +bits f2xm1 f2xm1 fpatan fcos fcos fyl2x fyl2xp1 fsin fptan fptan +62.0 0 0 0 0 437 0 0 0 0 925 +62.1 0 0 10 0 894 0 0 0 0 1023 +62.2 14 0 0 0 1033 0 0 0 0 945 +62.3 57 0 0 0 1202 0 0 0 0 1023 +62.4 385 0 0 10 1292 0 23 0 0 1178 +62.5 1140 0 0 119 1649 0 39 0 0 1149 +62.6 2037 0 0 189 1620 0 16 0 0 1169 +62.7 5086 14 0 646 2315 10 101 35 39 1402 +62.8 8818 86 0 984 3050 59 287 131 224 2036 +62.9 11340 1355 0 2126 4153 79 605 357 321 1948 +63.0 15557 4750 0 3319 5376 246 1281 862 808 2688 +63.1 20016 8288 0 4620 6628 511 2569 1723 1510 3302 +63.2 24945 11127 10 6588 8098 1120 4470 2968 2990 4724 +63.3 25686 12382 69 8774 10682 1906 6775 4482 5474 7236 +63.4 29219 14722 79 11109 12311 3094 9414 7259 8912 10587 +63.5 30458 14936 393 13802 15014 5874 12666 9609 13762 15262 +63.6 32439 16448 1277 17945 19028 10226 15537 14657 19158 20346 +63.7 35031 16805 4067 23003 23947 18910 20116 21333 25001 26209 +63.8 33251 15820 7673 24781 25675 24617 25354 24440 29433 30329 +63.9 33293 16833 18529 28318 29233 31267 31470 27748 29676 30601 + +Per cent with error: + 30.9 3.2 18.5 9.8 13.1 11.6 17.4 +Total arguments tested: + 70194 70099 101784 100641 100641 101799 128853 114893 102675 102675 + + +------------------------- Contributors ------------------------------- + +A number of people have contributed to the development of the +emulator, often by just reporting bugs, sometimes with suggested +fixes, and a few kind people have provided me with access in one way +or another to an 80486 machine. 
Contributors include (to those people +who I may have forgotten, please forgive me): + +Linus Torvalds +Tommy.Thorn@daimi.aau.dk +Andrew.Tridgell@anu.edu.au +Nick Holloway, alfie@dcs.warwick.ac.uk +Hermano Moura, moura@dcs.gla.ac.uk +Jon Jagger, J.Jagger@scp.ac.uk +Lennart Benschop +Brian Gallew, geek+@CMU.EDU +Thomas Staniszewski, ts3v+@andrew.cmu.edu +Martin Howell, mph@plasma.apana.org.au +M Saggaf, alsaggaf@athena.mit.edu +Peter Barker, PETER@socpsy.sci.fau.edu +tom@vlsivie.tuwien.ac.at +Dan Russel, russed@rpi.edu +Daniel Carosone, danielce@ee.mu.oz.au +cae@jpmorgan.com +Hamish Coleman, t933093@minyos.xx.rmit.oz.au +Bruce Evans, bde@kralizec.zeta.org.au +Timo Korvola, Timo.Korvola@hut.fi +Rick Lyons, rick@razorback.brisnet.org.au +Rick, jrs@world.std.com + +...and numerous others who responded to my request for help with +a real 80486. + diff --git a/arch/i386/math-emu/control_w.h b/arch/i386/math-emu/control_w.h new file mode 100644 index 000000000000..ae2274dbd305 --- /dev/null +++ b/arch/i386/math-emu/control_w.h @@ -0,0 +1,45 @@ +/*---------------------------------------------------------------------------+ + | control_w.h | + | | + | Copyright (C) 1992,1993 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. 
E-mail billm@vaxc.cc.monash.edu.au | + | | + +---------------------------------------------------------------------------*/ + +#ifndef _CONTROLW_H_ +#define _CONTROLW_H_ + +#ifdef __ASSEMBLY__ +#define _Const_(x) $##x +#else +#define _Const_(x) x +#endif + +#define CW_RC _Const_(0x0C00) /* rounding control */ +#define CW_PC _Const_(0x0300) /* precision control */ + +#define CW_Precision _Const_(0x0020) /* loss of precision mask */ +#define CW_Underflow _Const_(0x0010) /* underflow mask */ +#define CW_Overflow _Const_(0x0008) /* overflow mask */ +#define CW_ZeroDiv _Const_(0x0004) /* divide by zero mask */ +#define CW_Denormal _Const_(0x0002) /* denormalized operand mask */ +#define CW_Invalid _Const_(0x0001) /* invalid operation mask */ + +#define CW_Exceptions _Const_(0x003f) /* all masks */ + +#define RC_RND _Const_(0x0000) +#define RC_DOWN _Const_(0x0400) +#define RC_UP _Const_(0x0800) +#define RC_CHOP _Const_(0x0C00) + +/* p 15-5: Precision control bits affect only the following: + ADD, SUB(R), MUL, DIV(R), and SQRT */ +#define PR_24_BITS _Const_(0x000) +#define PR_53_BITS _Const_(0x200) +#define PR_64_BITS _Const_(0x300) +#define PR_RESERVED_BITS _Const_(0x100) +/* FULL_PRECISION simulates all exceptions masked */ +#define FULL_PRECISION (PR_64_BITS | RC_RND | 0x3f) + +#endif /* _CONTROLW_H_ */ diff --git a/arch/i386/math-emu/div_Xsig.S b/arch/i386/math-emu/div_Xsig.S new file mode 100644 index 000000000000..f77ba3058b31 --- /dev/null +++ b/arch/i386/math-emu/div_Xsig.S @@ -0,0 +1,365 @@ + .file "div_Xsig.S" +/*---------------------------------------------------------------------------+ + | div_Xsig.S | + | | + | Division subroutine for 96 bit quantities | + | | + | Copyright (C) 1994,1995 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. 
E-mail billm@jacobi.maths.monash.edu.au | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Divide the 96 bit quantity pointed to by a, by that pointed to by b, and | + | put the 96 bit result at the location d. | + | | + | The result may not be accurate to 96 bits. It is intended for use where | + | a result better than 64 bits is required. The result should usually be | + | good to at least 94 bits. | + | The returned result is actually divided by one half. This is done to | + | prevent overflow. | + | | + | .aaaaaaaaaaaaaa / .bbbbbbbbbbbbb -> .dddddddddddd | + | | + | void div_Xsig(Xsig *a, Xsig *b, Xsig *dest) | + | | + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "fpu_emu.h" + + +#define XsigLL(x) (x) +#define XsigL(x) 4(x) +#define XsigH(x) 8(x) + + +#ifndef NON_REENTRANT_FPU +/* + Local storage on the stack: + Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0 + */ +#define FPU_accum_3 -4(%ebp) +#define FPU_accum_2 -8(%ebp) +#define FPU_accum_1 -12(%ebp) +#define FPU_accum_0 -16(%ebp) +#define FPU_result_3 -20(%ebp) +#define FPU_result_2 -24(%ebp) +#define FPU_result_1 -28(%ebp) + +#else +.data +/* + Local storage in a static area: + Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0 + */ + .align 4,0 +FPU_accum_3: + .long 0 +FPU_accum_2: + .long 0 +FPU_accum_1: + .long 0 +FPU_accum_0: + .long 0 +FPU_result_3: + .long 0 +FPU_result_2: + .long 0 +FPU_result_1: + .long 0 +#endif /* NON_REENTRANT_FPU */ + + +.text +ENTRY(div_Xsig) + pushl %ebp + movl %esp,%ebp +#ifndef NON_REENTRANT_FPU + subl $28,%esp +#endif /* NON_REENTRANT_FPU */ + + pushl %esi + pushl %edi + pushl %ebx + + movl PARAM1,%esi /* pointer to num */ + movl PARAM2,%ebx /* pointer to denom */ + +#ifdef PARANOID + testl $0x80000000, XsigH(%ebx) /* Divisor */ + je L_bugged +#endif /* 
PARANOID */ + + +/*---------------------------------------------------------------------------+ + | Divide: Return arg1/arg2 to arg3. | + | | + | The maximum returned value is (ignoring exponents) | + | .ffffffff ffffffff | + | ------------------ = 1.ffffffff fffffffe | + | .80000000 00000000 | + | and the minimum is | + | .80000000 00000000 | + | ------------------ = .80000000 00000001 (rounded) | + | .ffffffff ffffffff | + | | + +---------------------------------------------------------------------------*/ + + /* Save extended dividend in local register */ + + /* Divide by 2 to prevent overflow */ + clc + movl XsigH(%esi),%eax + rcrl %eax + movl %eax,FPU_accum_3 + movl XsigL(%esi),%eax + rcrl %eax + movl %eax,FPU_accum_2 + movl XsigLL(%esi),%eax + rcrl %eax + movl %eax,FPU_accum_1 + movl $0,%eax + rcrl %eax + movl %eax,FPU_accum_0 + + movl FPU_accum_2,%eax /* Get the current num */ + movl FPU_accum_3,%edx + +/*----------------------------------------------------------------------*/ +/* Initialization done. + Do the first 32 bits. */ + + /* We will divide by a number which is too large */ + movl XsigH(%ebx),%ecx + addl $1,%ecx + jnc LFirst_div_not_1 + + /* here we need to divide by 100000000h, + i.e., no division at all.. 
*/ + mov %edx,%eax + jmp LFirst_div_done + +LFirst_div_not_1: + divl %ecx /* Divide the numerator by the augmented + denom ms dw */ + +LFirst_div_done: + movl %eax,FPU_result_3 /* Put the result in the answer */ + + mull XsigH(%ebx) /* mul by the ms dw of the denom */ + + subl %eax,FPU_accum_2 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_3 + + movl FPU_result_3,%eax /* Get the result back */ + mull XsigL(%ebx) /* now mul the ls dw of the denom */ + + subl %eax,FPU_accum_1 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_2 + sbbl $0,FPU_accum_3 + je LDo_2nd_32_bits /* Must check for non-zero result here */ + +#ifdef PARANOID + jb L_bugged_1 +#endif /* PARANOID */ + + /* need to subtract another once of the denom */ + incl FPU_result_3 /* Correct the answer */ + + movl XsigL(%ebx),%eax + movl XsigH(%ebx),%edx + subl %eax,FPU_accum_1 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_2 + +#ifdef PARANOID + sbbl $0,FPU_accum_3 + jne L_bugged_1 /* Must check for non-zero result here */ +#endif /* PARANOID */ + +/*----------------------------------------------------------------------*/ +/* Half of the main problem is done, there is just a reduced numerator + to handle now. 
+ Work with the second 32 bits, FPU_accum_0 not used from now on */ +LDo_2nd_32_bits: + movl FPU_accum_2,%edx /* get the reduced num */ + movl FPU_accum_1,%eax + + /* need to check for possible subsequent overflow */ + cmpl XsigH(%ebx),%edx + jb LDo_2nd_div + ja LPrevent_2nd_overflow + + cmpl XsigL(%ebx),%eax + jb LDo_2nd_div + +LPrevent_2nd_overflow: +/* The numerator is greater or equal, would cause overflow */ + /* prevent overflow */ + subl XsigL(%ebx),%eax + sbbl XsigH(%ebx),%edx + movl %edx,FPU_accum_2 + movl %eax,FPU_accum_1 + + incl FPU_result_3 /* Reflect the subtraction in the answer */ + +#ifdef PARANOID + je L_bugged_2 /* Can't bump the result to 1.0 */ +#endif /* PARANOID */ + +LDo_2nd_div: + cmpl $0,%ecx /* augmented denom msw */ + jnz LSecond_div_not_1 + + /* %ecx == 0, we are dividing by 1.0 */ + mov %edx,%eax + jmp LSecond_div_done + +LSecond_div_not_1: + divl %ecx /* Divide the numerator by the denom ms dw */ + +LSecond_div_done: + movl %eax,FPU_result_2 /* Put the result in the answer */ + + mull XsigH(%ebx) /* mul by the ms dw of the denom */ + + subl %eax,FPU_accum_1 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_2 + +#ifdef PARANOID + jc L_bugged_2 +#endif /* PARANOID */ + + movl FPU_result_2,%eax /* Get the result back */ + mull XsigL(%ebx) /* now mul the ls dw of the denom */ + + subl %eax,FPU_accum_0 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */ + sbbl $0,FPU_accum_2 + +#ifdef PARANOID + jc L_bugged_2 +#endif /* PARANOID */ + + jz LDo_3rd_32_bits + +#ifdef PARANOID + cmpl $1,FPU_accum_2 + jne L_bugged_2 +#endif /* PARANOID */ + + /* need to subtract another once of the denom */ + movl XsigL(%ebx),%eax + movl XsigH(%ebx),%edx + subl %eax,FPU_accum_0 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_1 + sbbl $0,FPU_accum_2 + +#ifdef PARANOID + jc L_bugged_2 + jne L_bugged_2 +#endif /* PARANOID */ + + addl $1,FPU_result_2 /* Correct the answer */ + adcl 
$0,FPU_result_3 + +#ifdef PARANOID + jc L_bugged_2 /* Must check for non-zero result here */ +#endif /* PARANOID */ + +/*----------------------------------------------------------------------*/ +/* The division is essentially finished here, we just need to perform + tidying operations. + Deal with the 3rd 32 bits */ +LDo_3rd_32_bits: + /* We use an approximation for the third 32 bits. + To take account of the 3rd 32 bits of the divisor + (call them del), we subtract del * (a/b) */ + + movl FPU_result_3,%eax /* a/b */ + mull XsigLL(%ebx) /* del */ + + subl %edx,FPU_accum_1 + + /* A borrow indicates that the result is negative */ + jnb LTest_over + + movl XsigH(%ebx),%edx + addl %edx,FPU_accum_1 + + subl $1,FPU_result_2 /* Adjust the answer */ + sbbl $0,FPU_result_3 + + /* The above addition might not have been enough, check again. */ + movl FPU_accum_1,%edx /* get the reduced num */ + cmpl XsigH(%ebx),%edx /* denom */ + jb LDo_3rd_div + + movl XsigH(%ebx),%edx + addl %edx,FPU_accum_1 + + subl $1,FPU_result_2 /* Adjust the answer */ + sbbl $0,FPU_result_3 + jmp LDo_3rd_div + +LTest_over: + movl FPU_accum_1,%edx /* get the reduced num */ + + /* need to check for possible subsequent overflow */ + cmpl XsigH(%ebx),%edx /* denom */ + jb LDo_3rd_div + + /* prevent overflow */ + subl XsigH(%ebx),%edx + movl %edx,FPU_accum_1 + + addl $1,FPU_result_2 /* Reflect the subtraction in the answer */ + adcl $0,FPU_result_3 + +LDo_3rd_div: + movl FPU_accum_0,%eax + movl FPU_accum_1,%edx + divl XsigH(%ebx) + + movl %eax,FPU_result_1 /* Rough estimate of third word */ + + movl PARAM3,%esi /* pointer to answer */ + + movl FPU_result_1,%eax + movl %eax,XsigLL(%esi) + movl FPU_result_2,%eax + movl %eax,XsigL(%esi) + movl FPU_result_3,%eax + movl %eax,XsigH(%esi) + +L_exit: + popl %ebx + popl %edi + popl %esi + + leave + ret + + +#ifdef PARANOID +/* The logic is wrong if we got here */ +L_bugged: + pushl EX_INTERNAL|0x240 + call EXCEPTION + pop %ebx + jmp L_exit + +L_bugged_1: + pushl 
EX_INTERNAL|0x241 + call EXCEPTION + pop %ebx + jmp L_exit + +L_bugged_2: + pushl EX_INTERNAL|0x242 + call EXCEPTION + pop %ebx + jmp L_exit +#endif /* PARANOID */ diff --git a/arch/i386/math-emu/div_small.S b/arch/i386/math-emu/div_small.S new file mode 100644 index 000000000000..47099628fa4c --- /dev/null +++ b/arch/i386/math-emu/div_small.S @@ -0,0 +1,47 @@ + .file "div_small.S" +/*---------------------------------------------------------------------------+ + | div_small.S | + | | + | Divide a 64 bit integer by a 32 bit integer & return remainder. | + | | + | Copyright (C) 1992,1995 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@jacobi.maths.monash.edu.au | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | unsigned long FPU_div_small(unsigned long long *x, unsigned long y) | + +---------------------------------------------------------------------------*/ + +#include "fpu_emu.h" + +.text +ENTRY(FPU_div_small) + pushl %ebp + movl %esp,%ebp + + pushl %esi + + movl PARAM1,%esi /* pointer to num */ + movl PARAM2,%ecx /* The denominator */ + + movl 4(%esi),%eax /* Get the current num msw */ + xorl %edx,%edx + divl %ecx + + movl %eax,4(%esi) + + movl (%esi),%eax /* Get the num lsw */ + divl %ecx + + movl %eax,(%esi) + + movl %edx,%eax /* Return the remainder in eax */ + + popl %esi + + leave + ret + diff --git a/arch/i386/math-emu/errors.c b/arch/i386/math-emu/errors.c new file mode 100644 index 000000000000..a1b0d22f6978 --- /dev/null +++ b/arch/i386/math-emu/errors.c @@ -0,0 +1,739 @@ +/*---------------------------------------------------------------------------+ + | errors.c | + | | + | The error handling functions for wm-FPU-emu | + | | + | Copyright (C) 1992,1993,1994,1996 | + | W. 
Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Note: |
+ | The file contains code which accesses user memory. |
+ | Emulator static data may change when user memory is accessed, due to |
+ | other processes using the emulator while swapping is in progress. |
+ +---------------------------------------------------------------------------*/
+
+#include <linux/signal.h>
+
+#include <asm/uaccess.h>
+
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "exception.h"
+#include "status_w.h"
+#include "control_w.h"
+#include "reg_constant.h"
+#include "version.h"
+
+/* PRINT_MESSAGES is forced off; the #ifdef PRINT_MESSAGES printk()s in FPU_exception() are compiled out. */
+#undef PRINT_MESSAGES
+/* */
+
+/* Un_impl() is compiled out (#if 0): it logs the opcode bytes at FPU_ORIG_EIP and raises EX_Invalid. */
+#if 0
+void Un_impl(void)
+{
+  u_char byte1, FPU_modrm;
+  unsigned long address = FPU_ORIG_EIP;
+
+  RE_ENTRANT_CHECK_OFF;
+  /* No need to check access_ok(), we have previously fetched these bytes. */
+  printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address);
+  if ( FPU_CS == __USER_CS )
+    {
+      while ( 1 )
+        {
+          FPU_get_user(byte1, (u_char __user *) address);
+          if ( (byte1 & 0xf8) == 0xd8 ) break; /* stop at the first byte in 0xd8..0xdf */
+          printk("[%02x]", byte1); /* bytes before it are shown in brackets */
+          address++;
+        }
+      printk("%02x ", byte1);
+      FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); /* byte following the opcode */
+
+      if (FPU_modrm >= 0300) /* octal 0300 == 0xc0: both top bits set */
+        printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
+      else
+        printk("/%d\n", (FPU_modrm >> 3) & 7); /* bits 5..3 only */
+    }
+  else
+    {
+      printk("cs selector = %04x\n", FPU_CS);
+    }
+
+  RE_ENTRANT_CHECK_ON;
+
+  EXCEPTION(EX_Invalid);
+
+}
+#endif /* 0 */
+
+
+/*
+ Called for opcodes which are illegal and which are known to result in a
+ SIGILL with a real 80486.
*/
+void FPU_illegal(void)
+{
+  math_abort(FPU_info,SIGILL);
+}
+
+
+/* Dump emulator state via printk: opcode bytes at FPU_ORIG_EIP, status word, control word, every non-empty st(i). */
+void FPU_printall(void)
+{
+  int i;
+  static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty", /* indexed by TAG_* values 0..3 ... */
+                                    "DeNorm", "Inf", "NaN" }; /* ... and TW_* values 4..6 */
+  u_char byte1, FPU_modrm;
+  unsigned long address = FPU_ORIG_EIP;
+
+  RE_ENTRANT_CHECK_OFF;
+  /* No need to check access_ok(), we have previously fetched these bytes. */
+  printk("At %p:", (void *) address);
+  if ( FPU_CS == __USER_CS )
+    {
+#define MAX_PRINTED_BYTES 20 /* upper bound on bytes scanned for a 0xd8..0xdf opcode byte */
+      for ( i = 0; i < MAX_PRINTED_BYTES; i++ )
+        {
+          FPU_get_user(byte1, (u_char __user *) address);
+          if ( (byte1 & 0xf8) == 0xd8 )
+            {
+              printk(" %02x", byte1);
+              break;
+            }
+          printk(" [%02x]", byte1);
+          address++;
+        }
+      if ( i == MAX_PRINTED_BYTES ) /* loop ran to completion: no opcode byte found */
+        printk(" [more..]\n");
+      else
+        {
+          FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
+
+          if (FPU_modrm >= 0300) /* octal 0300 == 0xc0: both top bits set */
+            printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7);
+          else
+            printk(" /%d, mod=%d rm=%d\n",
+                   (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7);
+        }
+    }
+  else
+    {
+      printk("%04x\n", FPU_CS);
+    }
+
+  partial_status = status_word(); /* every SW field printed below is read from partial_status */
+
+#ifdef DEBUGGING
+if ( partial_status & SW_Backward ) printk("SW: backward compatibility\n");
+if ( partial_status & SW_C3 ) printk("SW: condition bit 3\n");
+if ( partial_status & SW_C2 ) printk("SW: condition bit 2\n");
+if ( partial_status & SW_C1 ) printk("SW: condition bit 1\n");
+if ( partial_status & SW_C0 ) printk("SW: condition bit 0\n");
+if ( partial_status & SW_Summary ) printk("SW: exception summary\n");
+if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n");
+if ( partial_status & SW_Precision ) printk("SW: loss of precision\n");
+if ( partial_status & SW_Underflow ) printk("SW: underflow\n");
+if ( partial_status & SW_Overflow ) printk("SW: overflow\n");
+if ( partial_status & SW_Zero_Div ) printk("SW: divide by zero\n");
+if ( partial_status & SW_Denorm_Op ) printk("SW: denormalized operand\n");
+if ( partial_status & SW_Invalid ) printk("SW: invalid operation\n");
+#endif /* DEBUGGING */
+
+  printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n",
+         partial_status & 0x8000 ? 1 : 0, /* busy */
+         (partial_status & 0x3800) >> 11, /* stack top pointer */
+         partial_status & 0x80 ? 1 : 0, /* Error summary status */
+         partial_status & 0x40 ? 1 : 0, /* Stack flag */
+         partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */
+         partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */
+         partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0,
+         partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0,
+         partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0);
+
+printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d ef=%d%d%d%d%d%d\n",
+         control_word & 0x1000 ? 1 : 0,
+         (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
+         (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
+         control_word & 0x80 ? 1 : 0,
+         control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0, /* the SW_* bit masks are applied to the control word here */
+         control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0,
+         control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0);
+
+  for ( i = 0; i < 8; i++ )
+    {
+      FPU_REG *r = &st(i);
+      u_char tagi = FPU_gettagi(i);
+      switch (tagi)
+        {
+        case TAG_Empty:
+          continue; /* empty registers are not listed */
+          break; /* unreachable after continue */
+        case TAG_Zero:
+        case TAG_Special:
+          tagi = FPU_Special(r); /* replace the tag with FPU_Special()'s classification; no break: falls through to the print */
+        case TAG_Valid:
+          printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
+                 getsign(r) ? '-' : '+',
+                 (long)(r->sigh >> 16),
+                 (long)(r->sigh & 0xFFFF),
+                 (long)(r->sigl >> 16),
+                 (long)(r->sigl & 0xFFFF),
+                 exponent(r) - EXP_BIAS + 1);
+          break;
+        default:
+          printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi);
+          continue;
+          break; /* unreachable after continue */
+        }
+      printk("%s\n", tag_desc[(int) (unsigned) tagi]); /* NOTE(review): relies on tagi <= 6 here - confirm FPU_Special() never returns TW_Unsupported (7) */
+    }
+
+  RE_ENTRANT_CHECK_ON;
+
+}
+/* Exception bit pattern -> printable name; FPU_exception() scans this until the { 0, NULL } terminator. */
+static struct {
+  int type;
+  const char *name;
+} exception_names[] = {
+  { EX_StackOver, "stack overflow" },
+  { EX_StackUnder, "stack underflow" },
+  { EX_Precision, "loss of precision" },
+  { EX_Underflow, "underflow" },
+  { EX_Overflow, "overflow" },
+  { EX_ZeroDiv, "divide by zero" },
+  { EX_Denormal, "denormalized operand" },
+  { EX_Invalid, "invalid operation" },
+  { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION },
+  { 0, NULL }
+};
+
+/*
+ EX_INTERNAL is always given with a code which indicates where the
+ error was detected.
+
+ Internal error types:
+ 0x14 in fpu_etc.c
+ 0x1nn in a *.c file:
+ 0x101 in reg_add_sub.c
+ 0x102 in reg_mul.c
+ 0x104 in poly_atan.c
+ 0x105 in reg_mul.c
+ 0x107 in fpu_trig.c
+ 0x108 in reg_compare.c
+ 0x109 in reg_compare.c
+ 0x110 in reg_add_sub.c
+ 0x111 in fpe_entry.c
+ 0x112 in fpu_trig.c
+ 0x113 in errors.c
+ 0x115 in fpu_trig.c
+ 0x116 in fpu_trig.c
+ 0x117 in fpu_trig.c
+ 0x118 in fpu_trig.c
+ 0x119 in fpu_trig.c
+ 0x120 in poly_atan.c
+ 0x121 in reg_compare.c
+ 0x122 in reg_compare.c
+ 0x123 in reg_compare.c
+ 0x125 in fpu_trig.c
+ 0x126 in fpu_entry.c
+ 0x127 in poly_2xm1.c
+ 0x128 in fpu_entry.c
+ 0x129 in fpu_entry.c
+ 0x130 in get_address.c
+ 0x131 in get_address.c
+ 0x132 in get_address.c
+ 0x133 in get_address.c
+ 0x140 in load_store.c
+ 0x141 in load_store.c
+ 0x150 in poly_sin.c
+ 0x151 in poly_sin.c
+ 0x160 in reg_ld_str.c
+ 0x161 in reg_ld_str.c
+ 0x162 in reg_ld_str.c
+ 0x163 in reg_ld_str.c
+ 0x164 in reg_ld_str.c
+ 0x170 in fpu_tags.c
+ 0x171 in fpu_tags.c
+ 0x172 in fpu_tags.c
+ 0x180 in reg_convert.c
+ 0x2nn in an *.S file:
+ 0x201 in reg_u_add.S
+ 0x202 in reg_u_div.S
+ 0x203 in reg_u_div.S
+ 0x204 in reg_u_div.S
+ 0x205 in reg_u_mul.S
+ 0x206 in reg_u_sub.S
+ 0x207 in wm_sqrt.S
+ 0x208 in reg_div.S
+ 0x209 in reg_u_sub.S
+ 0x210 in reg_u_sub.S
+ 0x211
in reg_u_sub.S
+ 0x212 in reg_u_sub.S
+ 0x213 in wm_sqrt.S
+ 0x214 in wm_sqrt.S
+ 0x215 in wm_sqrt.S
+ 0x220 in reg_norm.S
+ 0x221 in reg_norm.S
+ 0x230 in reg_round.S
+ 0x231 in reg_round.S
+ 0x232 in reg_round.S
+ 0x233 in reg_round.S
+ 0x234 in reg_round.S
+ 0x235 in reg_round.S
+ 0x236 in reg_round.S
+ 0x240 in div_Xsig.S
+ 0x241 in div_Xsig.S
+ 0x242 in div_Xsig.S
+ */
+/* Record exception n in partial_status; an EX_INTERNAL|code value is additionally logged together with a register dump. */
+asmlinkage void FPU_exception(int n)
+{
+  int i, int_type;
+
+  int_type = 0; /* Needed only to stop compiler warnings */
+  if ( n & EX_INTERNAL )
+    {
+      int_type = n - EX_INTERNAL; /* keep the location code that came with EX_INTERNAL */
+      n = EX_INTERNAL;
+      /* Set lots of exception bits! */
+      partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
+    }
+  else
+    {
+      /* Extract only the bits which we use to set the status word */
+      n &= (SW_Exc_Mask);
+      /* Set the corresponding exception bit */
+      partial_status |= n;
+      /* Set summary bits iff exception isn't masked */
+      if ( partial_status & ~control_word & CW_Exceptions )
+        partial_status |= (SW_Summary | SW_Backward);
+      if ( n & (SW_Stack_Fault | EX_Precision) )
+        {
+          if ( !(n & SW_C1) )
+            /* This bit distinguishes over- from underflow for a stack fault,
+               and roundup from round-down for precision loss. */
+            partial_status &= ~SW_C1;
+        }
+    }
+
+  RE_ENTRANT_CHECK_OFF;
+  if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) ) /* unmasked exception, or internal error */
+    {
+#ifdef PRINT_MESSAGES
+      /* My message from the sponsor */
+      printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n");
+#endif /* PRINT_MESSAGES */
+
+      /* Get a name string for error reporting */
+      for (i=0; exception_names[i].type; i++)
+        if ( (exception_names[i].type & n) == exception_names[i].type ) /* every bit of the entry is present in n */
+          break;
+
+      if (exception_names[i].type)
+        {
+#ifdef PRINT_MESSAGES
+          printk("FP Exception: %s!\n", exception_names[i].name);
+#endif /* PRINT_MESSAGES */
+        }
+      else
+        printk("FPU emulator: Unknown Exception: 0x%04x!\n", n); /* reached the { 0, NULL } terminator */
+
+      if ( n == EX_INTERNAL )
+        {
+          printk("FPU emulator: Internal error type 0x%04x\n", int_type);
+          FPU_printall();
+        }
+#ifdef PRINT_MESSAGES
+      else
+        FPU_printall();
+#endif /* PRINT_MESSAGES */
+
+      /*
+       * The 80486 generates an interrupt on the next non-control FPU
+       * instruction. So we need some means of flagging it.
+       * We use the ES (Error Summary) bit for this.
+       */
+    }
+  RE_ENTRANT_CHECK_ON;
+
+#ifdef __DEBUG__
+  math_abort(FPU_info,SIGFPE);
+#endif /* __DEBUG__ */
+
+}
+
+
+/* Real operation attempted on a NaN. */
+/* Returns < 0 if the exception is unmasked */
+int real_1op_NaN(FPU_REG *a)
+{
+  int signalling, isNaN;
+
+  isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
+
+  /* A NaN is treated as signalling when bit 0x40000000 of sigh
+     (the quiet bit set further down) is clear. */
+  signalling = isNaN && !(a->sigh & 0x40000000);
+
+  if ( !signalling )
+    {
+      if ( !isNaN ) /* pseudo-NaN, or other unsupported? */
+        {
+          if ( control_word & CW_Invalid )
+            {
+              /* Masked response */
+              reg_copy(&CONST_QNaN, a);
+            }
+          EXCEPTION(EX_Invalid);
+          return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+        }
+      return TAG_Special; /* quiet NaN: operand left untouched, no exception raised */
+    }
+
+  if ( control_word & CW_Invalid )
+    {
+      /* The masked response */
+      if ( !(a->sigh & 0x80000000) ) /* pseudo-NaN ? */
+        {
+          reg_copy(&CONST_QNaN, a);
+        }
+      /* ensure a Quiet NaN */
+      a->sigh |= 0x40000000;
+    }
+
+  EXCEPTION(EX_Invalid);
+
+  return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+}
+
+
+/* Real operation attempted on two operands, one a NaN.
*/
+/* Returns < 0 if the exception is unmasked (FPU_Exception is OR-ed into the tag); the result NaN is written to st(deststnr). */
+int real_2op_NaN(FPU_REG const *b, u_char tagb,
+                 int deststnr,
+                 FPU_REG const *defaultNaN)
+{
+  FPU_REG *dest = &st(deststnr);
+  FPU_REG const *a = dest; /* first operand is the destination register itself */
+  u_char taga = FPU_gettagi(deststnr);
+  FPU_REG const *x;
+  int signalling, unsupported;
+
+  if ( taga == TAG_Special )
+    taga = FPU_Special(a);
+  if ( tagb == TAG_Special )
+    tagb = FPU_Special(b);
+
+  /* TW_NaN is also used for unsupported data types. */
+  unsupported = ((taga == TW_NaN)
+                 && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000)))
+    || ((tagb == TW_NaN)
+        && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
+  if ( unsupported )
+    {
+      if ( control_word & CW_Invalid )
+        {
+          /* Masked response */
+          FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
+        }
+      EXCEPTION(EX_Invalid);
+      return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+    }
+
+  if (taga == TW_NaN)
+    {
+      x = a;
+      if (tagb == TW_NaN)
+        {
+          signalling = !(a->sigh & b->sigh & 0x40000000); /* signalling if either quiet bit is clear */
+          if ( significand(b) > significand(a) )
+            x = b;
+          else if ( significand(b) == significand(a) )
+            {
+              /* The default result for the case of two "equal" NaNs (signs may
+                 differ) is chosen to reproduce 80486 behaviour */
+              x = defaultNaN;
+            }
+        }
+      else
+        {
+          /* return the quiet version of the NaN in a */
+          signalling = !(a->sigh & 0x40000000);
+        }
+    }
+  else
+#ifdef PARANOID
+    if (tagb == TW_NaN)
+#endif /* PARANOID */
+    {
+      signalling = !(b->sigh & 0x40000000);
+      x = b;
+    }
+#ifdef PARANOID
+  else
+    {
+      signalling = 0;
+      EXCEPTION(EX_INTERNAL|0x113);
+      x = &CONST_QNaN;
+    }
+#endif /* PARANOID */
+
+  if ( (!signalling) || (control_word & CW_Invalid) )
+    {
+      if ( ! x )
+        x = b; /* of the values assigned above only defaultNaN can be NULL (given a non-NULL b) */
+
+      if ( !(x->sigh & 0x80000000) ) /* pseudo-NaN ? */
+        x = &CONST_QNaN;
+
+      FPU_copy_to_regi(x, TAG_Special, deststnr);
+
+      if ( !signalling )
+        return TAG_Special; /* quiet operands: no exception raised */
+
+      /* ensure a Quiet NaN */
+      dest->sigh |= 0x40000000;
+    }
+
+  EXCEPTION(EX_Invalid);
+
+  return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+}
+
+
+/* Invalid arith operation on Valid registers */
+/* Returns < 0 if the exception is unmasked */
+asmlinkage int arith_invalid(int deststnr)
+{
+
+  EXCEPTION(EX_Invalid);
+
+  if ( control_word & CW_Invalid )
+    {
+      /* The masked response */
+      FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
+    }
+
+  return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid; /* NOTE(review): TAG_Valid is returned even after a QNaN was stored - confirm callers expect this */
+
+}
+
+
+/* Divide a finite number by zero */
+asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign)
+{
+  FPU_REG *dest = &st(deststnr);
+  int tag = TAG_Valid;
+
+  if ( control_word & CW_ZeroDiv )
+    {
+      /* The masked response */
+      FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
+      setsign(dest, sign);
+      tag = TAG_Special;
+    }
+
+  EXCEPTION(EX_ZeroDiv);
+
+  return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
+
+}
+
+
+/* This may be called often, so keep it lean */
+int set_precision_flag(int flags)
+{
+  if ( control_word & CW_Precision )
+    {
+      partial_status &= ~(SW_C1 & flags);
+      partial_status |= flags; /* The masked response */
+      return 0;
+    }
+  else
+    {
+      EXCEPTION(flags);
+      return 1; /* exception was unmasked */
+    }
+}
+
+
+/* This may be called often, so keep it lean */
+asmlinkage void set_precision_flag_up(void)
+{
+  if ( control_word & CW_Precision )
+    partial_status |= (SW_Precision | SW_C1); /* The masked response */
+  else
+    EXCEPTION(EX_Precision | SW_C1);
+}
+
+
+/* This may be called often, so keep it lean */
+asmlinkage void set_precision_flag_down(void)
+{
+  if ( control_word & CW_Precision )
+    { /* The masked response */
+      partial_status &= ~SW_C1;
+      partial_status |= SW_Precision;
+    }
+  else
+    EXCEPTION(EX_Precision);
+}
+
+/* Flag a denormal operand; returns TAG_Special, with FPU_Exception OR-ed in when the exception is unmasked. */
+asmlinkage int denormal_operand(void)
+{
+  if ( control_word & CW_Denormal )
+    { /* The masked response */
+      partial_status |= SW_Denorm_Op;
+      return TAG_Special;
+    }
+  else
+    {
+      EXCEPTION(EX_Denormal);
+      return TAG_Special | FPU_Exception;
+    }
+}
+
+/* Overflow of *dest: masked -> *dest = CONST_INF and precision loss is also raised; unmasked -> exponent adjusted by -3 * 2^13. */
+asmlinkage int arith_overflow(FPU_REG *dest)
+{
+  int tag = TAG_Valid;
+
+  if ( control_word & CW_Overflow )
+    {
+      /* The masked response */
+/* ###### The response here depends upon the rounding mode */
+      reg_copy(&CONST_INF, dest);
+      tag = TAG_Special;
+    }
+  else
+    {
+      /* Subtract the magic number from the exponent */
+      addexponent(dest, (-3 * (1 << 13)));
+    }
+
+  EXCEPTION(EX_Overflow);
+  if ( control_word & CW_Overflow )
+    {
+      /* The overflow exception is masked. */
+      /* By definition, precision is lost.
+         The roundup bit (C1) is also set because we have
+         "rounded" upwards to Infinity. */
+      EXCEPTION(EX_Precision | SW_C1);
+      return tag;
+    }
+
+  return tag;
+
+}
+
+/* Underflow of *dest: masked -> zero (exponent16 <= EXP_UNDER - 63) or stdexp(dest), then precision loss is raised; unmasked -> exponent raised. */
+asmlinkage int arith_underflow(FPU_REG *dest)
+{
+  int tag = TAG_Valid;
+
+  if ( control_word & CW_Underflow )
+    {
+      /* The masked response */
+      if ( exponent16(dest) <= EXP_UNDER - 63 )
+        {
+          reg_copy(&CONST_Z, dest);
+          partial_status &= ~SW_C1; /* Round down. */
+          tag = TAG_Zero;
+        }
+      else
+        {
+          stdexp(dest);
+        }
+    }
+  else
+    {
+      /* Add the magic number to the exponent. */
+      addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
+    }
+
+  EXCEPTION(EX_Underflow);
+  if ( control_word & CW_Underflow )
+    {
+      /* The underflow exception is masked. */
+      EXCEPTION(EX_Precision);
+      return tag;
+    }
+
+  return tag;
+
+}
+
+/* Stack overflow: the masked response decrements top and stores CONST_QNaN via FPU_copy_to_reg0(); EX_StackOver is raised either way. */
+void FPU_stack_overflow(void)
+{
+
+  if ( control_word & CW_Invalid )
+    {
+      /* The masked response */
+      top--;
+      FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+    }
+
+  EXCEPTION(EX_StackOver);
+
+  return;
+
+}
+
+/* Stack underflow: the masked response stores CONST_QNaN via FPU_copy_to_reg0(); EX_StackUnder is raised either way. */
+void FPU_stack_underflow(void)
+{
+
+  if ( control_word & CW_Invalid )
+    {
+      /* The masked response */
+      FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+    }
+
+  EXCEPTION(EX_StackUnder);
+
+  return;
+
+}
+
+/* As FPU_stack_underflow(), but the masked response stores CONST_QNaN in register i. */
+void FPU_stack_underflow_i(int i)
+{
+
+  if ( control_word & CW_Invalid )
+    {
+      /* The masked response */
+      FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
+    }
+
+  EXCEPTION(EX_StackUnder);
+
+  return;
+
+}
+
+/* As FPU_stack_underflow_i(), but the masked response also calls FPU_pop(). */
+void FPU_stack_underflow_pop(int i)
+{
+
+  if ( control_word & CW_Invalid )
+    {
+      /* The masked response */
+      FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
+      FPU_pop();
+    }
+
+  EXCEPTION(EX_StackUnder);
+
+  return;
+
+}
+
diff --git a/arch/i386/math-emu/exception.h b/arch/i386/math-emu/exception.h
new file mode 100644
index 000000000000..b463f21a811e
--- /dev/null
+++ b/arch/i386/math-emu/exception.h
@@ -0,0 +1,53 @@
+/*---------------------------------------------------------------------------+
+ | exception.h |
+ | |
+ | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia.
E-mail billm@vaxc.cc.monash.edu.au | + | | + +---------------------------------------------------------------------------*/ + +#ifndef _EXCEPTION_H_ +#define _EXCEPTION_H_ + + +#ifdef __ASSEMBLY__ +#define Const_(x) $##x +#else +#define Const_(x) x +#endif + +#ifndef SW_C1 +#include "fpu_emu.h" +#endif /* SW_C1 */ + +#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */ +#define EX_ErrorSummary Const_(0x0080) /* Error summary status */ +/* Special exceptions: */ +#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */ +#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */ +#define EX_StackUnder Const_(0x0041) /* stack underflow */ +/* Exception flags: */ +#define EX_Precision Const_(0x0020) /* loss of precision */ +#define EX_Underflow Const_(0x0010) /* underflow */ +#define EX_Overflow Const_(0x0008) /* overflow */ +#define EX_ZeroDiv Const_(0x0004) /* divide by zero */ +#define EX_Denormal Const_(0x0002) /* denormalized operand */ +#define EX_Invalid Const_(0x0001) /* invalid operation */ + + +#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1)) +#define PRECISION_LOST_DOWN Const_(EX_Precision) + + +#ifndef __ASSEMBLY__ + +#ifdef DEBUG +#define EXCEPTION(x) { printk("exception in %s at line %d\n", \ + __FILE__, __LINE__); FPU_exception(x); } +#else +#define EXCEPTION(x) FPU_exception(x) +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* _EXCEPTION_H_ */ diff --git a/arch/i386/math-emu/fpu_arith.c b/arch/i386/math-emu/fpu_arith.c new file mode 100644 index 000000000000..6972dec01af6 --- /dev/null +++ b/arch/i386/math-emu/fpu_arith.c @@ -0,0 +1,174 @@ +/*---------------------------------------------------------------------------+ + | fpu_arith.c | + | | + | Code to implement the FPU register/register arithmetic instructions | + | | + | Copyright (C) 1992,1993,1997 | + | W. 
Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+#include "status_w.h"
+
+/* Handlers for the "op st,st(i)" forms: each clears C1, then calls the arithmetic routine with FPU_rm selecting st(i). */
+void fadd__(void)
+{
+  /* fadd st,st(i) */
+  int i = FPU_rm;
+  clear_C1();
+  FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
+}
+
+
+void fmul__(void)
+{
+  /* fmul st,st(i) */
+  int i = FPU_rm;
+  clear_C1();
+  FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
+}
+
+
+
+void fsub__(void)
+{
+  /* fsub st,st(i) */
+  clear_C1();
+  FPU_sub(0, FPU_rm, control_word);
+}
+
+
+void fsubr_(void)
+{
+  /* fsubr st,st(i) */
+  clear_C1();
+  FPU_sub(REV, FPU_rm, control_word);
+}
+
+
+void fdiv__(void)
+{
+  /* fdiv st,st(i) */
+  clear_C1();
+  FPU_div(0, FPU_rm, control_word);
+}
+
+
+void fdivr_(void)
+{
+  /* fdivr st,st(i) */
+  clear_C1();
+  FPU_div(REV, FPU_rm, control_word);
+}
+
+
+/* Handlers for the "op st(i),st" forms; the sub/div variants pass DEST_RM in the flags argument. */
+void fadd_i(void)
+{
+  /* fadd st(i),st */
+  int i = FPU_rm;
+  clear_C1();
+  FPU_add(&st(i), FPU_gettagi(i), i, control_word);
+}
+
+
+void fmul_i(void)
+{
+  /* fmul st(i),st */
+  clear_C1();
+  FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
+}
+
+
+void fsubri(void)
+{
+  /* fsubr st(i),st */
+  clear_C1();
+  FPU_sub(DEST_RM, FPU_rm, control_word);
+}
+
+
+void fsub_i(void)
+{
+  /* fsub st(i),st */
+  clear_C1();
+  FPU_sub(REV|DEST_RM, FPU_rm, control_word);
+}
+
+
+void fdivri(void)
+{
+  /* fdivr st(i),st */
+  clear_C1();
+  FPU_div(DEST_RM, FPU_rm, control_word);
+}
+
+
+void fdiv_i(void)
+{
+  /* fdiv st(i),st */
+  clear_C1();
+  FPU_div(REV|DEST_RM, FPU_rm, control_word);
+}
+
+
+/* Popping forms: FPU_pop() is called only when the arithmetic routine returned a non-negative value. */
+void faddp_(void)
+{
+  /* faddp st(i),st */
+  int i = FPU_rm;
+  clear_C1();
+  if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 )
+    FPU_pop();
+}
+
+
+void fmulp_(void)
+{
+  /* fmulp st(i),st */
+  clear_C1();
+  if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 )
+    FPU_pop();
+}
+
+
+
+void fsubrp(void)
+{
+  /* fsubrp st(i),st */
+  clear_C1();
+  if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 )
+    FPU_pop();
+}
+
+
+void fsubp_(void)
+{
+  /* fsubp st(i),st */
+  clear_C1();
+  if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 )
+    FPU_pop();
+}
+
+
+void fdivrp(void)
+{
+  /* fdivrp st(i),st */
+  clear_C1();
+  if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 )
+    FPU_pop();
+}
+
+
+void fdivp_(void)
+{
+  /* fdivp st(i),st */
+  clear_C1();
+  if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 )
+    FPU_pop();
+}
diff --git a/arch/i386/math-emu/fpu_asm.h b/arch/i386/math-emu/fpu_asm.h
new file mode 100644
index 000000000000..9ba12416df12
--- /dev/null
+++ b/arch/i386/math-emu/fpu_asm.h
@@ -0,0 +1,32 @@
+/*---------------------------------------------------------------------------+
+ | fpu_asm.h |
+ | |
+ | Copyright (C) 1992,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _FPU_ASM_H_
+#define _FPU_ASM_H_
+
+#include <linux/linkage.h>
+
+#define EXCEPTION FPU_exception /* assembler code names the C handler directly */
+
+/* PARAMn is the operand at 4*(n+1)(%ebp), i.e. relative to the frame pointer of the assembler routine. */
+#define PARAM1 8(%ebp)
+#define PARAM2 12(%ebp)
+#define PARAM3 16(%ebp)
+#define PARAM4 20(%ebp)
+#define PARAM5 24(%ebp)
+#define PARAM6 28(%ebp)
+#define PARAM7 32(%ebp)
+
+#define SIGL_OFFSET 0
+#define EXP(x) 8(x) /* byte offset 8 from x */
+#define SIG(x) SIGL_OFFSET##(x)
+#define SIGL(x) SIGL_OFFSET##(x) /* byte offset 0 from x */
+#define SIGH(x) 4(x) /* byte offset 4 from x */
+
+#endif /* _FPU_ASM_H_ */
diff --git a/arch/i386/math-emu/fpu_aux.c b/arch/i386/math-emu/fpu_aux.c
new file mode 100644
index 000000000000..20886cfb9f76
--- /dev/null
+++ b/arch/i386/math-emu/fpu_aux.c
@@ -0,0 +1,204 @@
+/*---------------------------------------------------------------------------+
+ | fpu_aux.c |
+ | |
+ | Code to implement some of the FPU auxiliary instructions. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W.
Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "fpu_emu.h"
+#include "status_w.h"
+#include "control_w.h"
+
+/* Handler with an empty body, for opcodes that need no action. */
+static void fnop(void)
+{
+}
+/* fclex: clear the listed exception, summary, stack-fault and SW_Backward bits in partial_status. */
+static void fclex(void)
+{
+  partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision|
+                      SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op|
+                      SW_Invalid);
+  no_ip_update = 1;
+}
+
+/* Needs to be externally visible */
+void finit(void)
+{
+  control_word = 0x037f; /* NOTE(review): presumably "all exceptions masked" - confirm the bit layout in control_w.h */
+  partial_status = 0;
+  top = 0; /* We don't keep top in the status word internally. */
+  fpu_tag_word = 0xffff; /* 0xffff packs eight 2-bit fields of 3, the value of TAG_Empty in fpu_emu.h */
+  /* The behaviour is different from that detailed in
+     Section 15.1.6 of the Intel manual */
+  operand_address.offset = 0;
+  operand_address.selector = 0;
+  instruction_address.offset = 0;
+  instruction_address.selector = 0;
+  instruction_address.opcode = 0;
+  no_ip_update = 1;
+}
+
+/*
+ * These are nops on the i387..
+ */
+#define feni fnop
+#define fdisi fnop
+#define fsetpm fnop
+/* Indexed by FPU_rm (0..7): feni, fdisi, fclex, finit, fsetpm; the remaining slots are illegal. */
+static FUNC const finit_table[] = {
+  feni, fdisi, fclex, finit,
+  fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
+};
+
+void finit_(void)
+{
+  (finit_table[FPU_rm])();
+}
+
+/* Store status_word() through a short pointer at FPU_EAX, i.e. into its first two bytes only. */
+static void fstsw_ax(void)
+{
+  *(short *) &FPU_EAX = status_word();
+  no_ip_update = 1;
+}
+/* Indexed by FPU_rm: only entry 0 is valid. */
+static FUNC const fstsw_table[] = {
+  fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
+  FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
+};
+
+void fstsw_(void)
+{
+  (fstsw_table[FPU_rm])();
+}
+
+/* Indexed by FPU_rm: only entry 0 (fnop) is valid. */
+static FUNC const fp_nop_table[] = {
+  fnop, FPU_illegal, FPU_illegal, FPU_illegal,
+  FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
+};
+
+void fp_nop(void)
+{
+  (fp_nop_table[FPU_rm])();
+}
+
+
+void fld_i_(void)
+{
+  FPU_REG *st_new_ptr; /* NOTE(review): never assigned in this function - presumably set inside the STACK_OVERFLOW macro; confirm */
+  int i;
+  u_char tag;
+
+  if ( STACK_OVERFLOW )
+    { FPU_stack_overflow(); return; }
+
+  /* fld st(i) */
+  i = FPU_rm;
+  if ( NOT_EMPTY(i) )
+    {
+      reg_copy(&st(i), st_new_ptr);
+      tag = FPU_gettagi(i); /* read the tag before push() changes the stack top */
+      push();
+      FPU_settag0(tag);
+    }
+  else
+    {
+      if ( control_word & CW_Invalid )
+        {
+          /* The masked response */
+          FPU_stack_underflow(); /* this also raises EX_StackUnder itself */
+        }
+      else
+        EXCEPTION(EX_StackUnder);
+    }
+
+}
+
+/* Exchange st(0) with st(i), including their 2-bit tags; an empty operand takes the stack-underflow path instead. */
+void fxch_i(void)
+{
+  /* fxch st(i) */
+  FPU_REG t;
+  int i = FPU_rm;
+  FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
+  long tag_word = fpu_tag_word;
+  int regnr = top & 7, regnri = ((regnr + i) & 7); /* tag-word slots of st(0) and st(i) */
+  u_char st0_tag = (tag_word >> (regnr*2)) & 3; /* two tag bits per register */
+  u_char sti_tag = (tag_word >> (regnri*2)) & 3;
+
+  if ( st0_tag == TAG_Empty )
+    {
+      if ( sti_tag == TAG_Empty )
+        {
+          FPU_stack_underflow();
+          FPU_stack_underflow_i(i);
+          return;
+        }
+      if ( control_word & CW_Invalid )
+        {
+          /* Masked response */
+          FPU_copy_to_reg0(sti_ptr, sti_tag);
+        }
+      FPU_stack_underflow_i(i);
+      return;
+    }
+  if ( sti_tag == TAG_Empty )
+    {
+      if ( control_word & CW_Invalid )
+        {
+          /* Masked response */
+          FPU_copy_to_regi(st0_ptr, st0_tag, i);
+        }
+      FPU_stack_underflow();
+      return;
+    }
+  clear_C1();
+
+  reg_copy(st0_ptr, &t); /* three-way swap through the temporary t */
+  reg_copy(sti_ptr, st0_ptr);
+  reg_copy(&t, sti_ptr);
+
+  tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2)); /* clear both tag fields ... */
+  tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2)); /* ... and write them back swapped */
+  fpu_tag_word = tag_word;
+}
+
+
+void ffree_(void)
+{
+  /* ffree st(i) */
+  FPU_settagi(FPU_rm, TAG_Empty);
+}
+
+
+void ffreep(void)
+{
+  /* ffree st(i) + pop - unofficial code */
+  FPU_settagi(FPU_rm, TAG_Empty);
+  FPU_pop();
+}
+
+
+void fst_i_(void)
+{
+  /* fst st(i) */
+  FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
+}
+
+
+void fstp_i(void)
+{
+  /* fstp st(i) */
+  FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
+  FPU_pop();
+}
+
diff --git a/arch/i386/math-emu/fpu_emu.h b/arch/i386/math-emu/fpu_emu.h
new file mode 100644
index 000000000000..d62b20a3e660
--- /dev/null
+++ b/arch/i386/math-emu/fpu_emu.h
@@ -0,0 +1,217 @@
+/*---------------------------------------------------------------------------+
+ | fpu_emu.h |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ +---------------------------------------------------------------------------*/
+
+
+#ifndef _FPU_EMU_H_
+#define _FPU_EMU_H_
+
+/*
+ * Define PECULIAR_486 to get a closer approximation to 80486 behaviour,
+ * rather than behaviour which appears to be cleaner.
+ * This is a matter of opinion: for all I know, the 80486 may simply
+ * be complying with the IEEE spec. Maybe one day I'll get to see the
+ * spec...
+ */
+#define PECULIAR_486
+
+#ifdef __ASSEMBLY__
+#include "fpu_asm.h"
+#define Const(x) $##x
+#else
+#define Const(x) x
+#endif
+
+#define EXP_BIAS Const(0)
+#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */
+#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */
+#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but
+                                        still a 16 bit nr. */
+#define EXP_Infinity EXP_OVER
+#define EXP_NaN EXP_OVER
+
+#define EXTENDED_Ebias Const(0x3fff)
+#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */
+
+#define SIGN_POS Const(0)
+#define SIGN_NEG Const(0x80)
+
+#define SIGN_Positive Const(0)
+#define SIGN_Negative Const(0x8000)
+
+
+/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */
+/* The following fold to 2 (Special) in the Tag Word */
+#define TW_Denormal Const(4) /* De-normal */
+#define TW_Infinity Const(5) /* + or - infinity */
+#define TW_NaN Const(6) /* Not a Number */
+#define TW_Unsupported Const(7) /* Not supported by an 80486 */
+
+#define TAG_Valid Const(0) /* valid */
+#define TAG_Zero Const(1) /* zero */
+#define TAG_Special Const(2) /* De-normal, + or - infinity,
+                                or Not a Number */
+#define TAG_Empty Const(3) /* empty */
+
+#define LOADED_DATA Const(10101) /* Special st() number to identify
+                                    loaded data (not on stack). */
+
+/* A few flags (must be >= 0x10). */
+#define REV 0x10
+#define DEST_RM 0x20
+#define LOADED 0x40
+
+#define FPU_Exception Const(0x80000000) /* Added to tag returns.
*/ + + +#ifndef __ASSEMBLY__ + +#include "fpu_system.h" + +#include <asm/sigcontext.h> /* for struct _fpstate */ +#include <asm/math_emu.h> +#include <linux/linkage.h> + +/* +#define RE_ENTRANT_CHECKING + */ + +#ifdef RE_ENTRANT_CHECKING +extern u_char emulating; +# define RE_ENTRANT_CHECK_OFF emulating = 0 +# define RE_ENTRANT_CHECK_ON emulating = 1 +#else +# define RE_ENTRANT_CHECK_OFF +# define RE_ENTRANT_CHECK_ON +#endif /* RE_ENTRANT_CHECKING */ + +#define FWAIT_OPCODE 0x9b +#define OP_SIZE_PREFIX 0x66 +#define ADDR_SIZE_PREFIX 0x67 +#define PREFIX_CS 0x2e +#define PREFIX_DS 0x3e +#define PREFIX_ES 0x26 +#define PREFIX_SS 0x36 +#define PREFIX_FS 0x64 +#define PREFIX_GS 0x65 +#define PREFIX_REPE 0xf3 +#define PREFIX_REPNE 0xf2 +#define PREFIX_LOCK 0xf0 +#define PREFIX_CS_ 1 +#define PREFIX_DS_ 2 +#define PREFIX_ES_ 3 +#define PREFIX_FS_ 4 +#define PREFIX_GS_ 5 +#define PREFIX_SS_ 6 +#define PREFIX_DEFAULT 7 + +struct address { + unsigned int offset; + unsigned int selector:16; + unsigned int opcode:11; + unsigned int empty:5; +}; +struct fpu__reg { + unsigned sigl; + unsigned sigh; + short exp; +}; + +typedef void (*FUNC)(void); +typedef struct fpu__reg FPU_REG; +typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag); +typedef struct { u_char address_size, operand_size, segment; } + overrides; +/* This structure is 32 bits: */ +typedef struct { overrides override; + u_char default_mode; } fpu_addr_modes; +/* PROTECTED has a restricted meaning in the emulator; it is used + to signal that the emulator needs to do special things to ensure + that protection is respected in a segmented model. 
*/ +#define PROTECTED 4 +#define SIXTEEN 1 /* We rely upon this being 1 (true) */ +#define VM86 SIXTEEN +#define PM16 (SIXTEEN | PROTECTED) +#define SEG32 PROTECTED +extern u_char const data_sizes_16[32]; + +#define register_base ((u_char *) registers ) +#define fpu_register(x) ( * ((FPU_REG *)( register_base + 10 * (x & 7) )) ) +#define st(x) ( * ((FPU_REG *)( register_base + 10 * ((top+x) & 7) )) ) + +#define STACK_OVERFLOW (FPU_stackoverflow(&st_new_ptr)) +#define NOT_EMPTY(i) (!FPU_empty_i(i)) + +#define NOT_EMPTY_ST0 (st0_tag ^ TAG_Empty) + +#define poppop() { FPU_pop(); FPU_pop(); } + +/* push() does not affect the tags */ +#define push() { top--; } + +#define signbyte(a) (((u_char *)(a))[9]) +#define getsign(a) (signbyte(a) & 0x80) +#define setsign(a,b) { if (b) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; } +#define copysign(a,b) { if (getsign(a)) signbyte(b) |= 0x80; \ + else signbyte(b) &= 0x7f; } +#define changesign(a) { signbyte(a) ^= 0x80; } +#define setpositive(a) { signbyte(a) &= 0x7f; } +#define setnegative(a) { signbyte(a) |= 0x80; } +#define signpositive(a) ( (signbyte(a) & 0x80) == 0 ) +#define signnegative(a) (signbyte(a) & 0x80) + +static inline void reg_copy(FPU_REG const *x, FPU_REG *y) +{ + *(short *)&(y->exp) = *(const short *)&(x->exp); + *(long long *)&(y->sigl) = *(const long long *)&(x->sigl); +} + +#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias) +#define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \ + ((y) + EXTENDED_Ebias) & 0x7fff; } +#define exponent16(x) (*(short *)&((x)->exp)) +#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); } +#define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); } +#define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; } + +#define isdenormal(ptr) (exponent(ptr) == EXP_BIAS+EXP_UNDER) + +#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] ) + + +/*----- Prototypes for functions written in assembler -----*/ +/* extern void reg_move(FPU_REG *a, 
FPU_REG *b); */ + +asmlinkage int FPU_normalize(FPU_REG *x); +asmlinkage int FPU_normalize_nuo(FPU_REG *x); +asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2, + FPU_REG *answ, unsigned int control_w, u_char sign, + int expa, int expb); +asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2, + FPU_REG *answ, unsigned int control_w, u_char sign, + int expon); +asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2, + FPU_REG *answ, unsigned int control_w, u_char sign); +asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2, + FPU_REG *answ, unsigned int control_w, u_char sign, + int expa, int expb); +asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2, + unsigned int control_w, u_char sign); +asmlinkage unsigned FPU_shrx(void *l, unsigned x); +asmlinkage unsigned FPU_shrxs(void *v, unsigned x); +asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y); +asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy, + unsigned int control_w, u_char sign); + +#ifndef MAKING_PROTO +#include "fpu_proto.h" +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* _FPU_EMU_H_ */ diff --git a/arch/i386/math-emu/fpu_entry.c b/arch/i386/math-emu/fpu_entry.c new file mode 100644 index 000000000000..d93f16ef828f --- /dev/null +++ b/arch/i386/math-emu/fpu_entry.c @@ -0,0 +1,760 @@ +/*---------------------------------------------------------------------------+ + | fpu_entry.c | + | | + | The entry functions for wm-FPU-emu | + | | + | Copyright (C) 1992,1993,1994,1996,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | See the files "README" and "COPYING" for further copyright and warranty | + | information. | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Note: | + | The file contains code which accesses user memory. 
| + | Emulator static data may change when user memory is accessed, due to | + | other processes using the emulator while swapping is in progress. | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | math_emulate(), restore_i387_soft() and save_i387_soft() are the only | + | entry points for wm-FPU-emu. | + +---------------------------------------------------------------------------*/ + +#include <linux/signal.h> +#include <linux/ptrace.h> + +#include <asm/uaccess.h> +#include <asm/desc.h> + +#include "fpu_system.h" +#include "fpu_emu.h" +#include "exception.h" +#include "control_w.h" +#include "status_w.h" + +#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ + +#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ + +/* WARNING: These codes are not documented by Intel in their 80486 manual + and may not work on FPU clones or later Intel FPUs. */ + +/* Changes to support the un-doc codes provided by Linus Torvalds. 
*/ + +#define _d9_d8_ fstp_i /* unofficial code (19) */ +#define _dc_d0_ fcom_st /* unofficial code (14) */ +#define _dc_d8_ fcompst /* unofficial code (1c) */ +#define _dd_c8_ fxch_i /* unofficial code (0d) */ +#define _de_d0_ fcompst /* unofficial code (16) */ +#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ +#define _df_c8_ fxch_i /* unofficial code (0f) */ +#define _df_d0_ fstp_i /* unofficial code (17) */ +#define _df_d8_ fstp_i /* unofficial code (1f) */ + +static FUNC const st_instr_table[64] = { + fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, + fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, + fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, + fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, + fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, + fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, + fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, + fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, +}; + +#else /* Support only documented FPU op-codes */ + +static FUNC const st_instr_table[64] = { + fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, + fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, + fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, + fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, + fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, + fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, + fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, + fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, +}; + +#endif /* NO_UNDOC_CODE */ + + +#define _NONE_ 0 /* Take no special action */ +#define _REG0_ 1 /* Need to check for not empty st(0) */ +#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ +#define _REGi_ 0 /* Uses st(rm) 
*/ +#define _PUSH_ 3 /* Need to check for space to push onto stack */ +#define _null_ 4 /* Function illegal or not implemented */ +#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */ +#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */ +#define _REGIc 0 /* Compare st(0) and st(rm) */ +#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ + +#ifndef NO_UNDOC_CODE + +/* Un-documented FPU op-codes supported by default. (see above) */ + +static u_char const type_table[64] = { + _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, + _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, + _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, + _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, + _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, + _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ +}; + +#else /* Support only documented FPU op-codes */ + +static u_char const type_table[64] = { + _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, + _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, + _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, + _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, + _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ +}; + +#endif /* NO_UNDOC_CODE */ + + +#ifdef RE_ENTRANT_CHECKING +u_char emulating=0; +#endif /* RE_ENTRANT_CHECKING */ + +static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, + overrides *override); + +asmlinkage void math_emulate(long arg) +{ + u_char FPU_modrm, byte1; + unsigned short code; + fpu_addr_modes 
addr_modes; + int unmasked; + FPU_REG loaded_data; + FPU_REG *st0_ptr; + u_char loaded_tag, st0_tag; + void __user *data_address; + struct address data_sel_off; + struct address entry_sel_off; + unsigned long code_base = 0; + unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ + struct desc_struct code_descriptor; + +#ifdef RE_ENTRANT_CHECKING + if ( emulating ) + { + printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); + } + RE_ENTRANT_CHECK_ON; +#endif /* RE_ENTRANT_CHECKING */ + + if (!used_math()) + { + finit(); + set_used_math(); + } + + SETUP_DATA_AREA(arg); + + FPU_ORIG_EIP = FPU_EIP; + + if ( (FPU_EFLAGS & 0x00020000) != 0 ) + { + /* Virtual 8086 mode */ + addr_modes.default_mode = VM86; + FPU_EIP += code_base = FPU_CS << 4; + code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */ + } + else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS ) + { + addr_modes.default_mode = 0; + } + else if ( FPU_CS == __KERNEL_CS ) + { + printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP); + panic("Math emulation needed in kernel"); + } + else + { + + if ( (FPU_CS & 4) != 4 ) /* Must be in the LDT */ + { + /* Can only handle segmented addressing via the LDT + for now, and it must be 16 bit */ + printk("FPU emulator: Unsupported addressing mode\n"); + math_abort(FPU_info, SIGILL); + } + + code_descriptor = LDT_DESCRIPTOR(FPU_CS); + if ( SEG_D_SIZE(code_descriptor) ) + { + /* The above test may be wrong, the book is not clear */ + /* Segmented 32 bit protected mode */ + addr_modes.default_mode = SEG32; + } + else + { + /* 16 bit protected mode */ + addr_modes.default_mode = PM16; + } + FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); + code_limit = code_base + + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor) + - 1; + if ( code_limit < code_base ) code_limit = 0xffffffff; + } + + FPU_lookahead = 1; + if (current->ptrace & PT_PTRACED) + FPU_lookahead = 0; + + if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP, + 
&addr_modes.override) ) + { + RE_ENTRANT_CHECK_OFF; + printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n" + "FPU emulator: self-modifying code! (emulation impossible)\n", + byte1); + RE_ENTRANT_CHECK_ON; + EXCEPTION(EX_INTERNAL|0x126); + math_abort(FPU_info,SIGILL); + } + +do_another_FPU_instruction: + + no_ip_update = 0; + + FPU_EIP++; /* We have fetched the prefix and first code bytes. */ + + if ( addr_modes.default_mode ) + { + /* This checks for the minimum instruction bytes. + We also need to check any extra (address mode) code access. */ + if ( FPU_EIP > code_limit ) + math_abort(FPU_info,SIGSEGV); + } + + if ( (byte1 & 0xf8) != 0xd8 ) + { + if ( byte1 == FWAIT_OPCODE ) + { + if (partial_status & SW_Summary) + goto do_the_FPU_interrupt; + else + goto FPU_fwait_done; + } +#ifdef PARANOID + EXCEPTION(EX_INTERNAL|0x128); + math_abort(FPU_info,SIGILL); +#endif /* PARANOID */ + } + + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP); + RE_ENTRANT_CHECK_ON; + FPU_EIP++; + + if (partial_status & SW_Summary) + { + /* Ignore the error for now if the current instruction is a no-wait + control instruction */ + /* The 80486 manual contradicts itself on this topic, + but a real 80486 uses the following instructions: + fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex. + */ + code = (FPU_modrm << 8) | byte1; + if ( ! ( (((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */ + (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv, + fnstsw */ + ((code & 0xc000) != 0xc000))) ) ) + { + /* + * We need to simulate the action of the kernel to FPU + * interrupts here. + */ + do_the_FPU_interrupt: + + FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. 
*/ + + RE_ENTRANT_CHECK_OFF; + current->thread.trap_no = 16; + current->thread.error_code = 0; + send_sig(SIGFPE, current, 1); + return; + } + } + + entry_sel_off.offset = FPU_ORIG_EIP; + entry_sel_off.selector = FPU_CS; + entry_sel_off.opcode = (byte1 << 8) | FPU_modrm; + + FPU_rm = FPU_modrm & 7; + + if ( FPU_modrm < 0300 ) + { + /* All of these instructions use the mod/rm byte to get a data address */ + + if ( (addr_modes.default_mode & SIXTEEN) + ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) ) + data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off, + addr_modes); + else + data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off, + addr_modes); + + if ( addr_modes.default_mode ) + { + if ( FPU_EIP-1 > code_limit ) + math_abort(FPU_info,SIGSEGV); + } + + if ( !(byte1 & 1) ) + { + unsigned short status1 = partial_status; + + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + + /* Stack underflow has priority */ + if ( NOT_EMPTY_ST0 ) + { + if ( addr_modes.default_mode & PROTECTED ) + { + /* This table works for 16 and 32 bit protected mode */ + if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] ) + math_abort(FPU_info,SIGSEGV); + } + + unmasked = 0; /* Do this here to stop compiler warnings. */ + switch ( (byte1 >> 1) & 3 ) + { + case 0: + unmasked = FPU_load_single((float __user *)data_address, + &loaded_data); + loaded_tag = unmasked & 0xff; + unmasked &= ~0xff; + break; + case 1: + loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); + break; + case 2: + unmasked = FPU_load_double((double __user *)data_address, + &loaded_data); + loaded_tag = unmasked & 0xff; + unmasked &= ~0xff; + break; + case 3: + default: /* Used here to suppress gcc warnings. */ + loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); + break; + } + + /* No more access to user memory, it is safe + to use static data now */ + + /* NaN operands have the next priority. 
*/ + /* We have to delay looking at st(0) until after + loading the data, because that data might contain an SNaN */ + if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) || + ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) ) + { + /* Restore the status word; we might have loaded a + denormal. */ + partial_status = status1; + if ( (FPU_modrm & 0x30) == 0x10 ) + { + /* fcom or fcomp */ + EXCEPTION(EX_Invalid); + setcc(SW_C3 | SW_C2 | SW_C0); + if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) + FPU_pop(); /* fcomp, masked, so we pop. */ + } + else + { + if ( loaded_tag == TAG_Special ) + loaded_tag = FPU_Special(&loaded_data); +#ifdef PECULIAR_486 + /* This is not really needed, but gives behaviour + identical to an 80486 */ + if ( (FPU_modrm & 0x28) == 0x20 ) + /* fdiv or fsub */ + real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data); + else +#endif /* PECULIAR_486 */ + /* fadd, fdivr, fmul, or fsubr */ + real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr); + } + goto reg_mem_instr_done; + } + + if ( unmasked && !((FPU_modrm & 0x30) == 0x10) ) + { + /* Is not a comparison instruction. 
*/ + if ( (FPU_modrm & 0x38) == 0x38 ) + { + /* fdivr */ + if ( (st0_tag == TAG_Zero) && + ((loaded_tag == TAG_Valid) + || (loaded_tag == TAG_Special + && isdenormal(&loaded_data))) ) + { + if ( FPU_divide_by_zero(0, getsign(&loaded_data)) + < 0 ) + { + /* We use the fact here that the unmasked + exception in the loaded data was for a + denormal operand */ + /* Restore the state of the denormal op bit */ + partial_status &= ~SW_Denorm_Op; + partial_status |= status1 & SW_Denorm_Op; + } + else + setsign(st0_ptr, getsign(&loaded_data)); + } + } + goto reg_mem_instr_done; + } + + switch ( (FPU_modrm >> 3) & 7 ) + { + case 0: /* fadd */ + clear_C1(); + FPU_add(&loaded_data, loaded_tag, 0, control_word); + break; + case 1: /* fmul */ + clear_C1(); + FPU_mul(&loaded_data, loaded_tag, 0, control_word); + break; + case 2: /* fcom */ + FPU_compare_st_data(&loaded_data, loaded_tag); + break; + case 3: /* fcomp */ + if ( !FPU_compare_st_data(&loaded_data, loaded_tag) + && !unmasked ) + FPU_pop(); + break; + case 4: /* fsub */ + clear_C1(); + FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word); + break; + case 5: /* fsubr */ + clear_C1(); + FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word); + break; + case 6: /* fdiv */ + clear_C1(); + FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word); + break; + case 7: /* fdivr */ + clear_C1(); + if ( st0_tag == TAG_Zero ) + partial_status = status1; /* Undo any denorm tag, + zero-divide has priority. 
*/ + FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word); + break; + } + } + else + { + if ( (FPU_modrm & 0x30) == 0x10 ) + { + /* The instruction is fcom or fcomp */ + EXCEPTION(EX_StackUnder); + setcc(SW_C3 | SW_C2 | SW_C0); + if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) + FPU_pop(); /* fcomp */ + } + else + FPU_stack_underflow(); + } + reg_mem_instr_done: + operand_address = data_sel_off; + } + else + { + if ( !(no_ip_update = + FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1, + addr_modes, data_address)) ) + { + operand_address = data_sel_off; + } + } + + } + else + { + /* None of these instructions access user memory */ + u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7); + +#ifdef PECULIAR_486 + /* This is supposed to be undefined, but a real 80486 seems + to do this: */ + operand_address.offset = 0; + operand_address.selector = FPU_DS; +#endif /* PECULIAR_486 */ + + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + switch ( type_table[(int) instr_index] ) + { + case _NONE_: /* also _REGIc: _REGIn */ + break; + case _REG0_: + if ( !NOT_EMPTY_ST0 ) + { + FPU_stack_underflow(); + goto FPU_instruction_done; + } + break; + case _REGIi: + if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) + { + FPU_stack_underflow_i(FPU_rm); + goto FPU_instruction_done; + } + break; + case _REGIp: + if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) + { + FPU_stack_underflow_pop(FPU_rm); + goto FPU_instruction_done; + } + break; + case _REGI_: + if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) + { + FPU_stack_underflow(); + goto FPU_instruction_done; + } + break; + case _PUSH_: /* Only used by the fld st(i) instruction */ + break; + case _null_: + FPU_illegal(); + goto FPU_instruction_done; + default: + EXCEPTION(EX_INTERNAL|0x111); + goto FPU_instruction_done; + } + (*st_instr_table[(int) instr_index])(); + +FPU_instruction_done: + ; + } + + if ( ! 
no_ip_update ) + instruction_address = entry_sel_off; + +FPU_fwait_done: + +#ifdef DEBUG + RE_ENTRANT_CHECK_OFF; + FPU_printall(); + RE_ENTRANT_CHECK_ON; +#endif /* DEBUG */ + + if (FPU_lookahead && !need_resched()) + { + FPU_ORIG_EIP = FPU_EIP - code_base; + if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP, + &addr_modes.override) ) + goto do_another_FPU_instruction; + } + + if ( addr_modes.default_mode ) + FPU_EIP -= code_base; + + RE_ENTRANT_CHECK_OFF; +} + + +/* Support for prefix bytes is not yet complete. To properly handle + all prefix bytes, further changes are needed in the emulator code + which accesses user address space. Access to separate segments is + important for msdos emulation. */ +static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, + overrides *override) +{ + u_char byte; + u_char __user *ip = *fpu_eip; + + *override = (overrides) { 0, 0, PREFIX_DEFAULT }; /* defaults */ + + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(byte, ip); + RE_ENTRANT_CHECK_ON; + + while ( 1 ) + { + switch ( byte ) + { + case ADDR_SIZE_PREFIX: + override->address_size = ADDR_SIZE_PREFIX; + goto do_next_byte; + + case OP_SIZE_PREFIX: + override->operand_size = OP_SIZE_PREFIX; + goto do_next_byte; + + case PREFIX_CS: + override->segment = PREFIX_CS_; + goto do_next_byte; + case PREFIX_ES: + override->segment = PREFIX_ES_; + goto do_next_byte; + case PREFIX_SS: + override->segment = PREFIX_SS_; + goto do_next_byte; + case PREFIX_FS: + override->segment = PREFIX_FS_; + goto do_next_byte; + case PREFIX_GS: + override->segment = PREFIX_GS_; + goto do_next_byte; + case PREFIX_DS: + override->segment = PREFIX_DS_; + goto do_next_byte; + +/* lock is not a valid prefix for FPU instructions, + let the cpu handle it to generate a SIGILL. */ +/* case PREFIX_LOCK: */ + + /* rep.. 
prefixes have no meaning for FPU instructions */ + case PREFIX_REPE: + case PREFIX_REPNE: + + do_next_byte: + ip++; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(byte, ip); + RE_ENTRANT_CHECK_ON; + break; + case FWAIT_OPCODE: + *Byte = byte; + return 1; + default: + if ( (byte & 0xf8) == 0xd8 ) + { + *Byte = byte; + *fpu_eip = ip; + return 1; + } + else + { + /* Not a valid sequence of prefix bytes followed by + an FPU instruction. */ + *Byte = byte; /* Needed for error message. */ + return 0; + } + } + } +} + + +void math_abort(struct info * info, unsigned int signal) +{ + FPU_EIP = FPU_ORIG_EIP; + current->thread.trap_no = 16; + current->thread.error_code = 0; + send_sig(signal,current,1); + RE_ENTRANT_CHECK_OFF; + __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4)); +#ifdef PARANOID + printk("ERROR: wm-FPU-emu math_abort failed!\n"); +#endif /* PARANOID */ +} + + + +#define S387 ((struct i387_soft_struct *)s387) +#define sstatus_word() \ + ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) + +int restore_i387_soft(void *s387, struct _fpstate __user *buf) +{ + u_char __user *d = (u_char __user *)buf; + int offset, other, i, tags, regnr, tag, newtop; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10); + if (__copy_from_user(&S387->cwd, d, 7*4)) + return -1; + RE_ENTRANT_CHECK_ON; + + d += 7*4; + + S387->ftop = (S387->swd >> SW_Top_Shift) & 7; + offset = (S387->ftop & 7) * 10; + other = 80 - offset; + + RE_ENTRANT_CHECK_OFF; + /* Copy all registers in stack order. */ + if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other)) + return -1; + if ( offset ) + if (__copy_from_user((u_char *)&S387->st_space, d+other, offset)) + return -1; + RE_ENTRANT_CHECK_ON; + + /* The tags may need to be corrected now. 
*/
+  tags = S387->twd;
+  newtop = S387->ftop;
+  for ( i = 0; i < 8; i++ )
+    {
+      regnr = (i+newtop) & 7;
+      if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty )
+        {
+          /* The loaded data over-rides all other cases. */
+          tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr));
+          tags &= ~(3 << (regnr*2));
+          tags |= (tag & 3) << (regnr*2);
+        }
+    }
+  S387->twd = tags;
+
+  return 0;
+}
+
+
+/*
+ * save_i387_soft() - copy the emulator's FPU state out to user space.
+ *
+ * s387: the soft-FPU state (accessed through the S387 macro).
+ * buf:  user buffer; 7*4 + 8*10 bytes are written to it.
+ *
+ * The first 7*4 bytes are copied starting at S387->cwd.  The 8 ten-byte
+ * registers follow in stack order, i.e. beginning with the register at
+ * index ftop and wrapping round to the start of st_space.
+ *
+ * With PECULIAR_486 defined, reserved bits of the stored words are forced
+ * in S387 itself before the copy, so the state in *s387 is modified.
+ *
+ * Returns 1 on success, -1 if any copy to user space fails.
+ */
+int save_i387_soft(void *s387, struct _fpstate __user * buf)
+{
+  u_char __user *d = (u_char __user *)buf;
+  int offset = (S387->ftop & 7) * 10, other = 80 - offset;
+
+  RE_ENTRANT_CHECK_OFF;
+  FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10);
+#ifdef PECULIAR_486
+  S387->cwd &= ~0xe080;
+  /* An 80486 sets nearly all of the reserved bits to 1. */
+  S387->cwd |= 0xffff0040;
+  S387->swd = sstatus_word() | 0xffff0000;
+  S387->twd |= 0xffff0000;
+  S387->fcs &= ~0xf8000000;
+  S387->fos |= 0xffff0000;
+#endif /* PECULIAR_486 */
+  /* Check this copy like the register copies below; a failure here used
+     to be silently ignored. */
+  if (__copy_to_user(d, &S387->cwd, 7*4))
+    return -1;
+  RE_ENTRANT_CHECK_ON;
+
+  d += 7*4;
+
+  RE_ENTRANT_CHECK_OFF;
+  /* Copy all registers in stack order. */
+  if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other))
+    return -1;
+  if ( offset )
+    {
+      /* The terminating ';' was missing here; the code only built because
+         RE_ENTRANT_CHECK_ON expands to nothing unless RE_ENTRANT_CHECKING
+         is defined. */
+      if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset))
+        return -1;
+    }
+  RE_ENTRANT_CHECK_ON;
+
+  return 1;
+}
diff --git a/arch/i386/math-emu/fpu_etc.c b/arch/i386/math-emu/fpu_etc.c
new file mode 100644
index 000000000000..e3b5d465587f
--- /dev/null
+++ b/arch/i386/math-emu/fpu_etc.c
@@ -0,0 +1,143 @@
+/*---------------------------------------------------------------------------+
+ | fpu_etc.c |
+ | |
+ | Implement a few FPU instructions. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia.
E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "fpu_system.h" +#include "exception.h" +#include "fpu_emu.h" +#include "status_w.h" +#include "reg_constant.h" + + +static void fchs(FPU_REG *st0_ptr, u_char st0tag) +{ + if ( st0tag ^ TAG_Empty ) + { + signbyte(st0_ptr) ^= SIGN_NEG; + clear_C1(); + } + else + FPU_stack_underflow(); +} + + +static void fabs(FPU_REG *st0_ptr, u_char st0tag) +{ + if ( st0tag ^ TAG_Empty ) + { + setpositive(st0_ptr); + clear_C1(); + } + else + FPU_stack_underflow(); +} + + +static void ftst_(FPU_REG *st0_ptr, u_char st0tag) +{ + switch (st0tag) + { + case TAG_Zero: + setcc(SW_C3); + break; + case TAG_Valid: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + break; + case TAG_Special: + switch ( FPU_Special(st0_ptr) ) + { + case TW_Denormal: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + if ( denormal_operand() < 0 ) + { +#ifdef PECULIAR_486 + /* This is weird! */ + if (getsign(st0_ptr) == SIGN_POS) + setcc(SW_C3); +#endif /* PECULIAR_486 */ + return; + } + break; + case TW_NaN: + setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ + EXCEPTION(EX_Invalid); + break; + case TW_Infinity: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + break; + default: + setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ + EXCEPTION(EX_INTERNAL|0x14); + break; + } + break; + case TAG_Empty: + setcc(SW_C0|SW_C2|SW_C3); + EXCEPTION(EX_StackUnder); + break; + } +} + + +static void fxam(FPU_REG *st0_ptr, u_char st0tag) +{ + int c = 0; + switch (st0tag) + { + case TAG_Empty: + c = SW_C3|SW_C0; + break; + case TAG_Zero: + c = SW_C3; + break; + case TAG_Valid: + c = SW_C2; + break; + case TAG_Special: + switch ( FPU_Special(st0_ptr) ) + { + case TW_Denormal: + c = SW_C2|SW_C3; /* Denormal */ + break; + case TW_NaN: + /* We also use NaN for unsupported types. 
*/ + if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) ) + c = SW_C0; + break; + case TW_Infinity: + c = SW_C2|SW_C0; + break; + } + } + if ( getsign(st0_ptr) == SIGN_NEG ) + c |= SW_C1; + setcc(c); +} + + +static FUNC_ST0 const fp_etc_table[] = { + fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal, + ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal +}; + +void FPU_etc(void) +{ + (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0()); +} diff --git a/arch/i386/math-emu/fpu_proto.h b/arch/i386/math-emu/fpu_proto.h new file mode 100644 index 000000000000..37a8a7fe7e2b --- /dev/null +++ b/arch/i386/math-emu/fpu_proto.h @@ -0,0 +1,140 @@ +#ifndef _FPU_PROTO_H +#define _FPU_PROTO_H + +/* errors.c */ +extern void FPU_illegal(void); +extern void FPU_printall(void); +asmlinkage void FPU_exception(int n); +extern int real_1op_NaN(FPU_REG *a); +extern int real_2op_NaN(FPU_REG const *b, u_char tagb, int deststnr, + FPU_REG const *defaultNaN); +asmlinkage int arith_invalid(int deststnr); +asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign); +extern int set_precision_flag(int flags); +asmlinkage void set_precision_flag_up(void); +asmlinkage void set_precision_flag_down(void); +asmlinkage int denormal_operand(void); +asmlinkage int arith_overflow(FPU_REG *dest); +asmlinkage int arith_underflow(FPU_REG *dest); +extern void FPU_stack_overflow(void); +extern void FPU_stack_underflow(void); +extern void FPU_stack_underflow_i(int i); +extern void FPU_stack_underflow_pop(int i); +/* fpu_arith.c */ +extern void fadd__(void); +extern void fmul__(void); +extern void fsub__(void); +extern void fsubr_(void); +extern void fdiv__(void); +extern void fdivr_(void); +extern void fadd_i(void); +extern void fmul_i(void); +extern void fsubri(void); +extern void fsub_i(void); +extern void fdivri(void); +extern void fdiv_i(void); +extern void faddp_(void); +extern void fmulp_(void); +extern void fsubrp(void); +extern void fsubp_(void); +extern void fdivrp(void); 
+extern void fdivp_(void); +/* fpu_aux.c */ +extern void finit(void); +extern void finit_(void); +extern void fstsw_(void); +extern void fp_nop(void); +extern void fld_i_(void); +extern void fxch_i(void); +extern void ffree_(void); +extern void ffreep(void); +extern void fst_i_(void); +extern void fstp_i(void); +/* fpu_entry.c */ +asmlinkage extern void math_emulate(long arg); +extern void math_abort(struct info *info, unsigned int signal); +/* fpu_etc.c */ +extern void FPU_etc(void); +/* fpu_tags.c */ +extern int FPU_gettag0(void); +extern int FPU_gettagi(int stnr); +extern int FPU_gettag(int regnr); +extern void FPU_settag0(int tag); +extern void FPU_settagi(int stnr, int tag); +extern void FPU_settag(int regnr, int tag); +extern int FPU_Special(FPU_REG const *ptr); +extern int isNaN(FPU_REG const *ptr); +extern void FPU_pop(void); +extern int FPU_empty_i(int stnr); +extern int FPU_stackoverflow(FPU_REG **st_new_ptr); +extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr); +extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag); +extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag); +/* fpu_trig.c */ +extern void FPU_triga(void); +extern void FPU_trigb(void); +/* get_address.c */ +extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, + struct address *addr, fpu_addr_modes addr_modes); +extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, + struct address *addr, fpu_addr_modes addr_modes); +/* load_store.c */ +extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes, + void __user *data_address); +/* poly_2xm1.c */ +extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result); +/* poly_atan.c */ +extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr, + u_char st1_tag); +/* poly_l2.c */ +extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign); +extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1, + FPU_REG *d); +/* poly_sin.c 
*/ +extern void poly_sine(FPU_REG *st0_ptr); +extern void poly_cos(FPU_REG *st0_ptr); +/* poly_tan.c */ +extern void poly_tan(FPU_REG *st0_ptr); +/* reg_add_sub.c */ +extern int FPU_add(FPU_REG const *b, u_char tagb, int destrnr, int control_w); +extern int FPU_sub(int flags, int rm, int control_w); +/* reg_compare.c */ +extern int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag); +extern void fcom_st(void); +extern void fcompst(void); +extern void fcompp(void); +extern void fucom_(void); +extern void fucomp(void); +extern void fucompp(void); +/* reg_constant.c */ +extern void fconst(void); +/* reg_ld_str.c */ +extern int FPU_load_extended(long double __user *s, int stnr); +extern int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data); +extern int FPU_load_single(float __user *single, FPU_REG *loaded_data); +extern int FPU_load_int64(long long __user *_s); +extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data); +extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data); +extern int FPU_load_bcd(u_char __user *s); +extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, + long double __user *d); +extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat); +extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single); +extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d); +extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d); +extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); +extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); +extern int FPU_round_to_int(FPU_REG *r, u_char tag); +extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s); +extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address); +extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d); +extern void fsave(fpu_addr_modes addr_modes, u_char 
__user *data_address); +extern int FPU_tagof(FPU_REG *ptr); +/* reg_mul.c */ +extern int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w); + +extern int FPU_div(int flags, int regrm, int control_w); +/* reg_convert.c */ +extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x); +#endif /* _FPU_PROTO_H */ + diff --git a/arch/i386/math-emu/fpu_system.h b/arch/i386/math-emu/fpu_system.h new file mode 100644 index 000000000000..bf26341c8bde --- /dev/null +++ b/arch/i386/math-emu/fpu_system.h @@ -0,0 +1,89 @@ +/*---------------------------------------------------------------------------+ + | fpu_system.h | + | | + | Copyright (C) 1992,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@suburbia.net | + | | + +---------------------------------------------------------------------------*/ + +#ifndef _FPU_SYSTEM_H +#define _FPU_SYSTEM_H + +/* system dependent definitions */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +/* This sets the pointer FPU_info to point to the argument part + of the stack frame of math_emulate() */ +#define SETUP_DATA_AREA(arg) FPU_info = (struct info *) &arg + +/* s is always from a cpu register, and the cpu does bounds checking + * during register load --> no further bounds checks needed */ +#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3]) +#define SEG_D_SIZE(x) ((x).b & (3 << 21)) +#define SEG_G_BIT(x) ((x).b & (1 << 23)) +#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 
4096 : 1) +#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23))) +#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \ + | (((s).b & 0xff) << 16) | ((s).a >> 16)) +#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff)) +#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11)) +#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9)) +#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ + == (1 << 10)) + +#define I387 (current->thread.i387) +#define FPU_info (I387.soft.info) + +#define FPU_CS (*(unsigned short *) &(FPU_info->___cs)) +#define FPU_SS (*(unsigned short *) &(FPU_info->___ss)) +#define FPU_DS (*(unsigned short *) &(FPU_info->___ds)) +#define FPU_EAX (FPU_info->___eax) +#define FPU_EFLAGS (FPU_info->___eflags) +#define FPU_EIP (FPU_info->___eip) +#define FPU_ORIG_EIP (FPU_info->___orig_eip) + +#define FPU_lookahead (I387.soft.lookahead) + +/* nz if ip_offset and cs_selector are not to be set for the current + instruction. */ +#define no_ip_update (*(u_char *)&(I387.soft.no_update)) +#define FPU_rm (*(u_char *)&(I387.soft.rm)) + +/* Number of bytes of data which can be legally accessed by the current + instruction. This only needs to hold a number <= 108, so a byte will do. */ +#define access_limit (*(u_char *)&(I387.soft.alimit)) + +#define partial_status (I387.soft.swd) +#define control_word (I387.soft.cwd) +#define fpu_tag_word (I387.soft.twd) +#define registers (I387.soft.st_space) +#define top (I387.soft.ftop) + +#define instruction_address (*(struct address *)&I387.soft.fip) +#define operand_address (*(struct address *)&I387.soft.foo) + +#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \ + math_abort(FPU_info,SIGSEGV) + +#undef FPU_IGNORE_CODE_SEGV +#ifdef FPU_IGNORE_CODE_SEGV +/* access_ok() is very expensive, and causes the emulator to run + about 20% slower if applied to the code. 
Anyway, errors due to bad + code addresses should be much rarer than errors due to bad data + addresses. */ +#define FPU_code_access_ok(z) +#else +/* A simpler test than access_ok() can probably be done for + FPU_code_access_ok() because the only possible error is to step + past the upper boundary of a legal code area. */ +#define FPU_code_access_ok(z) FPU_access_ok(VERIFY_READ,(void __user *)FPU_EIP,z) +#endif + +#define FPU_get_user(x,y) get_user((x),(y)) +#define FPU_put_user(x,y) put_user((x),(y)) + +#endif diff --git a/arch/i386/math-emu/fpu_tags.c b/arch/i386/math-emu/fpu_tags.c new file mode 100644 index 000000000000..cb436fe20e4c --- /dev/null +++ b/arch/i386/math-emu/fpu_tags.c @@ -0,0 +1,127 @@ +/*---------------------------------------------------------------------------+ + | fpu_tags.c | + | | + | Set FPU register tags. | + | | + | Copyright (C) 1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@jacobi.maths.monash.edu.au | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "fpu_emu.h" +#include "fpu_system.h" +#include "exception.h" + + +void FPU_pop(void) +{ + fpu_tag_word |= 3 << ((top & 7)*2); + top++; +} + + +int FPU_gettag0(void) +{ + return (fpu_tag_word >> ((top & 7)*2)) & 3; +} + + +int FPU_gettagi(int stnr) +{ + return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3; +} + + +int FPU_gettag(int regnr) +{ + return (fpu_tag_word >> ((regnr & 7)*2)) & 3; +} + + +void FPU_settag0(int tag) +{ + int regnr = top; + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr*2)); + fpu_tag_word |= (tag & 3) << (regnr*2); +} + + +void FPU_settagi(int stnr, int tag) +{ + int regnr = stnr+top; + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr*2)); + fpu_tag_word |= (tag & 3) << (regnr*2); +} + + +void FPU_settag(int regnr, int tag) +{ + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr*2)); + fpu_tag_word |= (tag & 3) << (regnr*2); +} + + +int FPU_Special(FPU_REG const *ptr) +{ + int exp = 
exponent(ptr); + + if ( exp == EXP_BIAS+EXP_UNDER ) + return TW_Denormal; + else if ( exp != EXP_BIAS+EXP_OVER ) + return TW_NaN; + else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) ) + return TW_Infinity; + return TW_NaN; +} + + +int isNaN(FPU_REG const *ptr) +{ + return ( (exponent(ptr) == EXP_BIAS+EXP_OVER) + && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) ); +} + + +int FPU_empty_i(int stnr) +{ + int regnr = (top+stnr) & 7; + + return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty; +} + + +int FPU_stackoverflow(FPU_REG **st_new_ptr) +{ + *st_new_ptr = &st(-1); + + return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty; +} + + +void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr) +{ + reg_copy(r, &st(stnr)); + FPU_settagi(stnr, tag); +} + +void FPU_copy_to_reg1(FPU_REG const *r, u_char tag) +{ + reg_copy(r, &st(1)); + FPU_settagi(1, tag); +} + +void FPU_copy_to_reg0(FPU_REG const *r, u_char tag) +{ + int regnr = top; + regnr &= 7; + + reg_copy(r, &st(0)); + + fpu_tag_word &= ~(3 << (regnr*2)); + fpu_tag_word |= (tag & 3) << (regnr*2); +} diff --git a/arch/i386/math-emu/fpu_trig.c b/arch/i386/math-emu/fpu_trig.c new file mode 100644 index 000000000000..403cbde1d425 --- /dev/null +++ b/arch/i386/math-emu/fpu_trig.c @@ -0,0 +1,1845 @@ +/*---------------------------------------------------------------------------+ + | fpu_trig.c | + | | + | Implementation of the FPU "transcendental" functions. | + | | + | Copyright (C) 1992,1993,1994,1997,1999 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. 
E-mail billm@melbpc.org.au | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "fpu_system.h" +#include "exception.h" +#include "fpu_emu.h" +#include "status_w.h" +#include "control_w.h" +#include "reg_constant.h" + +static void rem_kernel(unsigned long long st0, unsigned long long *y, + unsigned long long st1, + unsigned long long q, int n); + +#define BETTER_THAN_486 + +#define FCOS 4 + +/* Used only by fptan, fsin, fcos, and fsincos. */ +/* This routine produces very accurate results, similar to + using a value of pi with more than 128 bits precision. */ +/* Limited measurements show no results worse than 64 bit precision + except for the results for arguments close to 2^63, where the + precision of the result sometimes degrades to about 63.9 bits */ +static int trig_arg(FPU_REG *st0_ptr, int even) +{ + FPU_REG tmp; + u_char tmptag; + unsigned long long q; + int old_cw = control_word, saved_status = partial_status; + int tag, st0_tag = TAG_Valid; + + if ( exponent(st0_ptr) >= 63 ) + { + partial_status |= SW_C2; /* Reduction incomplete. */ + return -1; + } + + control_word &= ~CW_RC; + control_word |= RC_CHOP; + + setpositive(st0_ptr); + tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f, + SIGN_POS); + + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow + to 2^64 */ + q = significand(&tmp); + if ( q ) + { + rem_kernel(significand(st0_ptr), + &significand(&tmp), + significand(&CONST_PI2), + q, exponent(st0_ptr) - exponent(&CONST_PI2)); + setexponent16(&tmp, exponent(&CONST_PI2)); + st0_tag = FPU_normalize(&tmp); + FPU_copy_to_reg0(&tmp, st0_tag); + } + + if ( (even && !(q & 1)) || (!even && (q & 1)) ) + { + st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION); + +#ifdef BETTER_THAN_486 + /* So far, the results are exact but based upon a 64 bit + precision approximation to pi/2. 
The technique used + now is equivalent to using an approximation to pi/2 which + is accurate to about 128 bits. */ + if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) ) + { + /* This code gives the effect of having pi/2 to better than + 128 bits precision. */ + + significand(&tmp) = q + 1; + setexponent16(&tmp, 63); + FPU_normalize(&tmp); + tmptag = + FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS, + exponent(&CONST_PI2extra) + exponent(&tmp)); + setsign(&tmp, getsign(&CONST_PI2extra)); + st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION); + if ( signnegative(st0_ptr) ) + { + /* CONST_PI2extra is negative, so the result of the addition + can be negative. This means that the argument is actually + in a different quadrant. The correction is always < pi/2, + so it can't overflow into yet another quadrant. */ + setpositive(st0_ptr); + q++; + } + } +#endif /* BETTER_THAN_486 */ + } +#ifdef BETTER_THAN_486 + else + { + /* So far, the results are exact but based upon a 64 bit + precision approximation to pi/2. The technique used + now is equivalent to using an approximation to pi/2 which + is accurate to about 128 bits. */ + if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)) + || (q > 1) ) + { + /* This code gives the effect of having p/2 to better than + 128 bits precision. */ + + significand(&tmp) = q; + setexponent16(&tmp, 63); + FPU_normalize(&tmp); /* This must return TAG_Valid */ + tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, + SIGN_POS, + exponent(&CONST_PI2extra) + exponent(&tmp)); + setsign(&tmp, getsign(&CONST_PI2extra)); + st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp, + FULL_PRECISION); + if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) && + ((st0_ptr->sigh > CONST_PI2.sigh) + || ((st0_ptr->sigh == CONST_PI2.sigh) + && (st0_ptr->sigl > CONST_PI2.sigl))) ) + { + /* CONST_PI2extra is negative, so the result of the + subtraction can be larger than pi/2. 
This means + that the argument is actually in a different quadrant. + The correction is always < pi/2, so it can't overflow + into yet another quadrant. */ + st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, + FULL_PRECISION); + q++; + } + } + } +#endif /* BETTER_THAN_486 */ + + FPU_settag0(st0_tag); + control_word = old_cw; + partial_status = saved_status & ~SW_C2; /* Reduction complete. */ + + return (q & 3) | even; +} + + +/* Convert a long to register */ +static void convert_l2reg(long const *arg, int deststnr) +{ + int tag; + long num = *arg; + u_char sign; + FPU_REG *dest = &st(deststnr); + + if (num == 0) + { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + return; + } + + if (num > 0) + { sign = SIGN_POS; } + else + { num = -num; sign = SIGN_NEG; } + + dest->sigh = num; + dest->sigl = 0; + setexponent16(dest, 31); + tag = FPU_normalize(dest); + FPU_settagi(deststnr, tag); + setsign(dest, sign); + return; +} + + +static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag) +{ + if ( st0_tag == TAG_Empty ) + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + else if ( st0_tag == TW_NaN ) + real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */ +#ifdef PARANOID + else + EXCEPTION(EX_INTERNAL|0x0112); +#endif /* PARANOID */ +} + + +static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag) +{ + int isNaN; + + switch ( st0_tag ) + { + case TW_NaN: + isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000); + if ( isNaN && !(st0_ptr->sigh & 0x40000000) ) /* Signaling ? 
*/ + { + EXCEPTION(EX_Invalid); + if ( control_word & CW_Invalid ) + { + /* The masked response */ + /* Convert to a QNaN */ + st0_ptr->sigh |= 0x40000000; + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + } + } + else if ( isNaN ) + { + /* A QNaN */ + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + } + else + { + /* pseudoNaN or other unsupported */ + EXCEPTION(EX_Invalid); + if ( control_word & CW_Invalid ) + { + /* The masked response */ + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + push(); + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + } + } + break; /* return with a NaN in st(0) */ +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL|0x0112); +#endif /* PARANOID */ + } +} + + +/*---------------------------------------------------------------------------*/ + +static void f2xm1(FPU_REG *st0_ptr, u_char tag) +{ + FPU_REG a; + + clear_C1(); + + if ( tag == TAG_Valid ) + { + /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */ + if ( exponent(st0_ptr) < 0 ) + { + denormal_arg: + + FPU_to_exp16(st0_ptr, &a); + + /* poly_2xm1(x) requires 0 < st(0) < 1. 
*/ + poly_2xm1(getsign(st0_ptr), &a, st0_ptr); + } + set_precision_flag_up(); /* 80486 appears to always do this */ + return; + } + + if ( tag == TAG_Zero ) + return; + + if ( tag == TAG_Special ) + tag = FPU_Special(st0_ptr); + + switch ( tag ) + { + case TW_Denormal: + if ( denormal_operand() < 0 ) + return; + goto denormal_arg; + case TW_Infinity: + if ( signnegative(st0_ptr) ) + { + /* -infinity gives -1 (p16-10) */ + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setnegative(st0_ptr); + } + return; + default: + single_arg_error(st0_ptr, tag); + } +} + + +static void fptan(FPU_REG *st0_ptr, u_char st0_tag) +{ + FPU_REG *st_new_ptr; + int q; + u_char arg_sign = getsign(st0_ptr); + + /* Stack underflow has higher priority */ + if ( st0_tag == TAG_Empty ) + { + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + if ( control_word & CW_Invalid ) + { + st_new_ptr = &st(-1); + push(); + FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + } + return; + } + + if ( STACK_OVERFLOW ) + { FPU_stack_overflow(); return; } + + if ( st0_tag == TAG_Valid ) + { + if ( exponent(st0_ptr) > -40 ) + { + if ( (q = trig_arg(st0_ptr, 0)) == -1 ) + { + /* Operand is out of range */ + return; + } + + poly_tan(st0_ptr); + setsign(st0_ptr, (q & 1) ^ (arg_sign != 0)); + set_precision_flag_up(); /* We do not really know if up or down */ + } + else + { + /* For a small arg, the result == the argument */ + /* Underflow may happen */ + + denormal_arg: + + FPU_to_exp16(st0_ptr, st0_ptr); + + st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); + FPU_settag0(st0_tag); + } + push(); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + return; + } + + if ( st0_tag == TAG_Zero ) + { + push(); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setcc(0); + return; + } + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + + if ( st0_tag == TW_Denormal ) + { + if ( denormal_operand() < 0 ) + return; + + goto denormal_arg; + } + + if ( st0_tag == TW_Infinity ) + { + /* The 80486 treats infinity 
as an invalid operand */ + if ( arith_invalid(0) >= 0 ) + { + st_new_ptr = &st(-1); + push(); + arith_invalid(0); + } + return; + } + + single_arg_2_error(st0_ptr, st0_tag); +} + + +static void fxtract(FPU_REG *st0_ptr, u_char st0_tag) +{ + FPU_REG *st_new_ptr; + u_char sign; + register FPU_REG *st1_ptr = st0_ptr; /* anticipate */ + + if ( STACK_OVERFLOW ) + { FPU_stack_overflow(); return; } + + clear_C1(); + + if ( st0_tag == TAG_Valid ) + { + long e; + + push(); + sign = getsign(st1_ptr); + reg_copy(st1_ptr, st_new_ptr); + setexponent16(st_new_ptr, exponent(st_new_ptr)); + + denormal_arg: + + e = exponent16(st_new_ptr); + convert_l2reg(&e, 1); + setexponentpos(st_new_ptr, 0); + setsign(st_new_ptr, sign); + FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */ + return; + } + else if ( st0_tag == TAG_Zero ) + { + sign = getsign(st0_ptr); + + if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 ) + return; + + push(); + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(st_new_ptr, sign); + return; + } + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + + if ( st0_tag == TW_Denormal ) + { + if (denormal_operand() < 0 ) + return; + + push(); + sign = getsign(st1_ptr); + FPU_to_exp16(st1_ptr, st_new_ptr); + goto denormal_arg; + } + else if ( st0_tag == TW_Infinity ) + { + sign = getsign(st0_ptr); + setpositive(st0_ptr); + push(); + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + setsign(st_new_ptr, sign); + return; + } + else if ( st0_tag == TW_NaN ) + { + if ( real_1op_NaN(st0_ptr) < 0 ) + return; + + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + return; + } + else if ( st0_tag == TAG_Empty ) + { + /* Is this the correct behaviour? 
*/ + if ( control_word & EX_Invalid ) + { + FPU_stack_underflow(); + push(); + FPU_stack_underflow(); + } + else + EXCEPTION(EX_StackUnder); + } +#ifdef PARANOID + else + EXCEPTION(EX_INTERNAL | 0x119); +#endif /* PARANOID */ +} + + +static void fdecstp(void) +{ + clear_C1(); + top--; +} + +static void fincstp(void) +{ + clear_C1(); + top++; +} + + +static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag) +{ + int expon; + + clear_C1(); + + if ( st0_tag == TAG_Valid ) + { + u_char tag; + + if (signnegative(st0_ptr)) + { + arith_invalid(0); /* sqrt(negative) is invalid */ + return; + } + + /* make st(0) in [1.0 .. 4.0) */ + expon = exponent(st0_ptr); + + denormal_arg: + + setexponent16(st0_ptr, (expon & 1)); + + /* Do the computation, the sign of the result will be positive. */ + tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS); + addexponent(st0_ptr, expon >> 1); + FPU_settag0(tag); + return; + } + + if ( st0_tag == TAG_Zero ) + return; + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + + if ( st0_tag == TW_Infinity ) + { + if ( signnegative(st0_ptr) ) + arith_invalid(0); /* sqrt(-Infinity) is invalid */ + return; + } + else if ( st0_tag == TW_Denormal ) + { + if (signnegative(st0_ptr)) + { + arith_invalid(0); /* sqrt(negative) is invalid */ + return; + } + + if ( denormal_operand() < 0 ) + return; + + FPU_to_exp16(st0_ptr, st0_ptr); + + expon = exponent16(st0_ptr); + + goto denormal_arg; + } + + single_arg_error(st0_ptr, st0_tag); + +} + + +static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) +{ + int flags, tag; + + if ( st0_tag == TAG_Valid ) + { + u_char sign; + + denormal_arg: + + sign = getsign(st0_ptr); + + if (exponent(st0_ptr) > 63) + return; + + if ( st0_tag == TW_Denormal ) + { + if (denormal_operand() < 0 ) + return; + } + + /* Fortunately, this can't overflow to 2^64 */ + if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) ) + set_precision_flag(flags); + + setexponent16(st0_ptr, 63); + tag = FPU_normalize(st0_ptr); + 
setsign(st0_ptr, sign); + FPU_settag0(tag); + return; + } + + if ( st0_tag == TAG_Zero ) + return; + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + + if ( st0_tag == TW_Denormal ) + goto denormal_arg; + else if ( st0_tag == TW_Infinity ) + return; + else + single_arg_error(st0_ptr, st0_tag); +} + + +static int fsin(FPU_REG *st0_ptr, u_char tag) +{ + u_char arg_sign = getsign(st0_ptr); + + if ( tag == TAG_Valid ) + { + int q; + + if ( exponent(st0_ptr) > -40 ) + { + if ( (q = trig_arg(st0_ptr, 0)) == -1 ) + { + /* Operand is out of range */ + return 1; + } + + poly_sine(st0_ptr); + + if (q & 2) + changesign(st0_ptr); + + setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign); + + /* We do not really know if up or down */ + set_precision_flag_up(); + return 0; + } + else + { + /* For a small arg, the result == the argument */ + set_precision_flag_up(); /* Must be up. */ + return 0; + } + } + + if ( tag == TAG_Zero ) + { + setcc(0); + return 0; + } + + if ( tag == TAG_Special ) + tag = FPU_Special(st0_ptr); + + if ( tag == TW_Denormal ) + { + if ( denormal_operand() < 0 ) + return 1; + + /* For a small arg, the result == the argument */ + /* Underflow may happen */ + FPU_to_exp16(st0_ptr, st0_ptr); + + tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); + + FPU_settag0(tag); + + return 0; + } + else if ( tag == TW_Infinity ) + { + /* The 80486 treats infinity as an invalid operand */ + arith_invalid(0); + return 1; + } + else + { + single_arg_error(st0_ptr, tag); + return 1; + } +} + + +static int f_cos(FPU_REG *st0_ptr, u_char tag) +{ + u_char st0_sign; + + st0_sign = getsign(st0_ptr); + + if ( tag == TAG_Valid ) + { + int q; + + if ( exponent(st0_ptr) > -40 ) + { + if ( (exponent(st0_ptr) < 0) + || ((exponent(st0_ptr) == 0) + && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) ) + { + poly_cos(st0_ptr); + + /* We do not really know if up or down */ + set_precision_flag_down(); + + return 0; + } + else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 ) + { + 
poly_sine(st0_ptr); + + if ((q+1) & 2) + changesign(st0_ptr); + + /* We do not really know if up or down */ + set_precision_flag_down(); + + return 0; + } + else + { + /* Operand is out of range */ + return 1; + } + } + else + { + denormal_arg: + + setcc(0); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); +#ifdef PECULIAR_486 + set_precision_flag_down(); /* 80486 appears to do this. */ +#else + set_precision_flag_up(); /* Must be up. */ +#endif /* PECULIAR_486 */ + return 0; + } + } + else if ( tag == TAG_Zero ) + { + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setcc(0); + return 0; + } + + if ( tag == TAG_Special ) + tag = FPU_Special(st0_ptr); + + if ( tag == TW_Denormal ) + { + if ( denormal_operand() < 0 ) + return 1; + + goto denormal_arg; + } + else if ( tag == TW_Infinity ) + { + /* The 80486 treats infinity as an invalid operand */ + arith_invalid(0); + return 1; + } + else + { + single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */ + return 1; + } +} + + +static void fcos(FPU_REG *st0_ptr, u_char st0_tag) +{ + f_cos(st0_ptr, st0_tag); +} + + +static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) +{ + FPU_REG *st_new_ptr; + FPU_REG arg; + u_char tag; + + /* Stack underflow has higher priority */ + if ( st0_tag == TAG_Empty ) + { + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + if ( control_word & CW_Invalid ) + { + st_new_ptr = &st(-1); + push(); + FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + } + return; + } + + if ( STACK_OVERFLOW ) + { FPU_stack_overflow(); return; } + + if ( st0_tag == TAG_Special ) + tag = FPU_Special(st0_ptr); + else + tag = st0_tag; + + if ( tag == TW_NaN ) + { + single_arg_2_error(st0_ptr, TW_NaN); + return; + } + else if ( tag == TW_Infinity ) + { + /* The 80486 treats infinity as an invalid operand */ + if ( arith_invalid(0) >= 0 ) + { + /* Masked response */ + push(); + arith_invalid(0); + } + return; + } + + reg_copy(st0_ptr, &arg); + if ( !fsin(st0_ptr, st0_tag) ) + { + push(); + FPU_copy_to_reg0(&arg, 
st0_tag); + f_cos(&st(0), st0_tag); + } + else + { + /* An error, so restore st(0) */ + FPU_copy_to_reg0(&arg, st0_tag); + } +} + + +/*---------------------------------------------------------------------------*/ +/* The following all require two arguments: st(0) and st(1) */ + +/* A lean, mean kernel for the fprem instructions. This relies upon + the division and rounding to an integer in do_fprem giving an + exact result. Because of this, rem_kernel() needs to deal only with + the least significant 64 bits, the more significant bits of the + result must be zero. + */ +static void rem_kernel(unsigned long long st0, unsigned long long *y, + unsigned long long st1, + unsigned long long q, int n) +{ + int dummy; + unsigned long long x; + + x = st0 << n; + + /* Do the required multiplication and subtraction in the one operation */ + + /* lsw x -= lsw st1 * lsw q */ + asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1" + :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]), + "=a" (dummy) + :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0]) + :"%dx"); + /* msw x -= msw st1 * lsw q */ + asm volatile ("mull %3; subl %%eax,%0" + :"=m" (((unsigned *)&x)[1]), "=a" (dummy) + :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0]) + :"%dx"); + /* msw x -= lsw st1 * msw q */ + asm volatile ("mull %3; subl %%eax,%0" + :"=m" (((unsigned *)&x)[1]), "=a" (dummy) + :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1]) + :"%dx"); + + *y = x; +} + + +/* Remainder of st(0) / st(1) */ +/* This routine produces exact results, i.e. there is never any + rounding or truncation, etc of the result. 
*/ +static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round) +{ + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + + if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) + { + FPU_REG tmp, st0, st1; + u_char st0_sign, st1_sign; + u_char tmptag; + int tag; + int old_cw; + int expdif; + long long q; + unsigned short saved_status; + int cc; + + fprem_valid: + /* Convert registers for internal use. */ + st0_sign = FPU_to_exp16(st0_ptr, &st0); + st1_sign = FPU_to_exp16(st1_ptr, &st1); + expdif = exponent16(&st0) - exponent16(&st1); + + old_cw = control_word; + cc = 0; + + /* We want the status following the denorm tests, but don't want + the status changed by the arithmetic operations. */ + saved_status = partial_status; + control_word &= ~CW_RC; + control_word |= RC_CHOP; + + if ( expdif < 64 ) + { + /* This should be the most common case */ + + if ( expdif > -2 ) + { + u_char sign = st0_sign ^ st1_sign; + tag = FPU_u_div(&st0, &st1, &tmp, + PR_64_BITS | RC_CHOP | 0x3f, + sign); + setsign(&tmp, sign); + + if ( exponent(&tmp) >= 0 ) + { + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't + overflow to 2^64 */ + q = significand(&tmp); + + rem_kernel(significand(&st0), + &significand(&tmp), + significand(&st1), + q, expdif); + + setexponent16(&tmp, exponent16(&st1)); + } + else + { + reg_copy(&st0, &tmp); + q = 0; + } + + if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) ) + { + /* We may need to subtract st(1) once more, + to get a result <= 1/2 of st(1). */ + unsigned long long x; + expdif = exponent16(&st1) - exponent16(&tmp); + if ( expdif <= 1 ) + { + if ( expdif == 0 ) + x = significand(&st1) - significand(&tmp); + else /* expdif is 1 */ + x = (significand(&st1) << 1) - significand(&tmp); + if ( (x < significand(&tmp)) || + /* or equi-distant (from 0 & st(1)) and q is odd */ + ((x == significand(&tmp)) && (q & 1) ) ) + { + st0_sign = ! 
st0_sign; + significand(&tmp) = x; + q++; + } + } + } + + if (q & 4) cc |= SW_C0; + if (q & 2) cc |= SW_C3; + if (q & 1) cc |= SW_C1; + } + else + { + control_word = old_cw; + setcc(0); + return; + } + } + else + { + /* There is a large exponent difference ( >= 64 ) */ + /* To make much sense, the code in this section should + be done at high precision. */ + int exp_1, N; + u_char sign; + + /* prevent overflow here */ + /* N is 'a number between 32 and 63' (p26-113) */ + reg_copy(&st0, &tmp); + tmptag = st0_tag; + N = (expdif & 0x0000001f) + 32; /* This choice gives results + identical to an AMD 486 */ + setexponent16(&tmp, N); + exp_1 = exponent16(&st1); + setexponent16(&st1, 0); + expdif -= N; + + sign = getsign(&tmp) ^ st1_sign; + tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f, + sign); + setsign(&tmp, sign); + + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't + overflow to 2^64 */ + + rem_kernel(significand(&st0), + &significand(&tmp), + significand(&st1), + significand(&tmp), + exponent(&tmp) + ); + setexponent16(&tmp, exp_1 + expdif); + + /* It is possible for the operation to be complete here. + What does the IEEE standard say? The Intel 80486 manual + implies that the operation will never be completed at this + point, and the behaviour of a real 80486 confirms this. + */ + if ( !(tmp.sigh | tmp.sigl) ) + { + /* The result is zero */ + control_word = old_cw; + partial_status = saved_status; + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(&st0, st0_sign); +#ifdef PECULIAR_486 + setcc(SW_C2); +#else + setcc(0); +#endif /* PECULIAR_486 */ + return; + } + cc = SW_C2; + } + + control_word = old_cw; + partial_status = saved_status; + tag = FPU_normalize_nuo(&tmp); + reg_copy(&tmp, st0_ptr); + + /* The only condition to be looked for is underflow, + and it can occur here only if underflow is unmasked. 
*/ + if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero) + && !(control_word & CW_Underflow) ) + { + setcc(cc); + tag = arith_underflow(st0_ptr); + setsign(st0_ptr, st0_sign); + FPU_settag0(tag); + return; + } + else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) ) + { + stdexp(st0_ptr); + setsign(st0_ptr, st0_sign); + } + else + { + tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign); + } + FPU_settag0(tag); + setcc(cc); + + return; + } + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + if ( st1_tag == TAG_Special ) + st1_tag = FPU_Special(st1_ptr); + + if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) + { + if ( denormal_operand() < 0 ) + return; + goto fprem_valid; + } + else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) + { + FPU_stack_underflow(); + return; + } + else if ( st0_tag == TAG_Zero ) + { + if ( st1_tag == TAG_Valid ) + { + setcc(0); return; + } + else if ( st1_tag == TW_Denormal ) + { + if ( denormal_operand() < 0 ) + return; + setcc(0); return; + } + else if ( st1_tag == TAG_Zero ) + { arith_invalid(0); return; } /* fprem(?,0) always invalid */ + else if ( st1_tag == TW_Infinity ) + { setcc(0); return; } + } + else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) + { + if ( st1_tag == TAG_Zero ) + { + arith_invalid(0); /* fprem(Valid,Zero) is invalid */ + return; + } + else if ( st1_tag != TW_NaN ) + { + if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal)) + && (denormal_operand() < 0) ) + return; + + if ( st1_tag == TW_Infinity ) + { + /* fprem(Valid,Infinity) is o.k. */ + setcc(0); return; + } + } + } + else if ( st0_tag == TW_Infinity ) + { + if ( st1_tag != TW_NaN ) + { + arith_invalid(0); /* fprem(Infinity,?) is invalid */ + return; + } + } + + /* One of the registers must contain a NaN if we got here. 
*/ + +#ifdef PARANOID + if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) ) + EXCEPTION(EX_INTERNAL | 0x118); +#endif /* PARANOID */ + + real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr); + +} + + +/* ST(1) <- ST(1) * log ST; pop ST */ +static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag) +{ + FPU_REG *st1_ptr = &st(1), exponent; + u_char st1_tag = FPU_gettagi(1); + u_char sign; + int e, tag; + + clear_C1(); + + if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) ) + { + both_valid: + /* Both regs are Valid or Denormal */ + if ( signpositive(st0_ptr) ) + { + if ( st0_tag == TW_Denormal ) + FPU_to_exp16(st0_ptr, st0_ptr); + else + /* Convert st(0) for internal use. */ + setexponent16(st0_ptr, exponent(st0_ptr)); + + if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) ) + { + /* Special case. The result can be precise. */ + u_char esign; + e = exponent16(st0_ptr); + if ( e >= 0 ) + { + exponent.sigh = e; + esign = SIGN_POS; + } + else + { + exponent.sigh = -e; + esign = SIGN_NEG; + } + exponent.sigl = 0; + setexponent16(&exponent, 31); + tag = FPU_normalize_nuo(&exponent); + stdexp(&exponent); + setsign(&exponent, esign); + tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION); + if ( tag >= 0 ) + FPU_settagi(1, tag); + } + else + { + /* The usual case */ + sign = getsign(st1_ptr); + if ( st1_tag == TW_Denormal ) + FPU_to_exp16(st1_ptr, st1_ptr); + else + /* Convert st(1) for internal use. 
*/ + setexponent16(st1_ptr, exponent(st1_ptr)); + poly_l2(st0_ptr, st1_ptr, sign); + } + } + else + { + /* negative */ + if ( arith_invalid(1) < 0 ) + return; + } + + FPU_pop(); + + return; + } + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + if ( st1_tag == TAG_Special ) + st1_tag = FPU_Special(st1_ptr); + + if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) + { + FPU_stack_underflow_pop(1); + return; + } + else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) ) + { + if ( st0_tag == TAG_Zero ) + { + if ( st1_tag == TAG_Zero ) + { + /* Both args zero is invalid */ + if ( arith_invalid(1) < 0 ) + return; + } + else + { + u_char sign; + sign = getsign(st1_ptr)^SIGN_NEG; + if ( FPU_divide_by_zero(1, sign) < 0 ) + return; + + setsign(st1_ptr, sign); + } + } + else if ( st1_tag == TAG_Zero ) + { + /* st(1) contains zero, st(0) valid <> 0 */ + /* Zero is the valid answer */ + sign = getsign(st1_ptr); + + if ( signnegative(st0_ptr) ) + { + /* log(negative) */ + if ( arith_invalid(1) < 0 ) + return; + } + else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + else + { + if ( exponent(st0_ptr) < 0 ) + sign ^= SIGN_NEG; + + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + setsign(st1_ptr, sign); + } + } + else + { + /* One or both operands are denormals. 
*/ + if ( denormal_operand() < 0 ) + return; + goto both_valid; + } + } + else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) ) + { + if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) + return; + } + /* One or both arg must be an infinity */ + else if ( st0_tag == TW_Infinity ) + { + if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) ) + { + /* log(-infinity) or 0*log(infinity) */ + if ( arith_invalid(1) < 0 ) + return; + } + else + { + u_char sign = getsign(st1_ptr); + + if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + FPU_copy_to_reg1(&CONST_INF, TAG_Special); + setsign(st1_ptr, sign); + } + } + /* st(1) must be infinity here */ + else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) + && ( signpositive(st0_ptr) ) ) + { + if ( exponent(st0_ptr) >= 0 ) + { + if ( (exponent(st0_ptr) == 0) && + (st0_ptr->sigh == 0x80000000) && + (st0_ptr->sigl == 0) ) + { + /* st(0) holds 1.0 */ + /* infinity*log(1) */ + if ( arith_invalid(1) < 0 ) + return; + } + /* else st(0) is positive and > 1.0 */ + } + else + { + /* st(0) is positive and < 1.0 */ + + if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + changesign(st1_ptr); + } + } + else + { + /* st(0) must be zero or negative */ + if ( st0_tag == TAG_Zero ) + { + /* This should be invalid, but a real 80486 is happy with it. 
*/ + +#ifndef PECULIAR_486 + sign = getsign(st1_ptr); + if ( FPU_divide_by_zero(1, sign) < 0 ) + return; +#endif /* PECULIAR_486 */ + + changesign(st1_ptr); + } + else if ( arith_invalid(1) < 0 ) /* log(negative) */ + return; + } + + FPU_pop(); +} + + +static void fpatan(FPU_REG *st0_ptr, u_char st0_tag) +{ + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + int tag; + + clear_C1(); + if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) + { + valid_atan: + + poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag); + + FPU_pop(); + + return; + } + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + if ( st1_tag == TAG_Special ) + st1_tag = FPU_Special(st1_ptr); + + if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) + { + if ( denormal_operand() < 0 ) + return; + + goto valid_atan; + } + else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) + { + FPU_stack_underflow_pop(1); + return; + } + else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) ) + { + if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 ) + FPU_pop(); + return; + } + else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) ) + { + u_char sign = getsign(st1_ptr); + if ( st0_tag == TW_Infinity ) + { + if ( st1_tag == TW_Infinity ) + { + if ( signpositive(st0_ptr) ) + { + FPU_copy_to_reg1(&CONST_PI4, TAG_Valid); + } + else + { + setpositive(st1_ptr); + tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr, + FULL_PRECISION, SIGN_POS, + exponent(&CONST_PI4), exponent(&CONST_PI2)); + if ( tag >= 0 ) + FPU_settagi(1, tag); + } + } + else + { + if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + if ( signpositive(st0_ptr) ) + { + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + setsign(st1_ptr, sign); /* An 80486 preserves the sign */ + FPU_pop(); + return; + } + else + { + FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + } + } + } + else + { + /* 
st(1) is infinity, st(0) not infinity */ + if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); + } + setsign(st1_ptr, sign); + } + else if ( st1_tag == TAG_Zero ) + { + /* st(0) must be valid or zero */ + u_char sign = getsign(st1_ptr); + + if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + if ( signpositive(st0_ptr) ) + { + /* An 80486 preserves the sign */ + FPU_pop(); + return; + } + + FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + setsign(st1_ptr, sign); + } + else if ( st0_tag == TAG_Zero ) + { + /* st(1) must be TAG_Valid here */ + u_char sign = getsign(st1_ptr); + + if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); + setsign(st1_ptr, sign); + } +#ifdef PARANOID + else + EXCEPTION(EX_INTERNAL | 0x125); +#endif /* PARANOID */ + + FPU_pop(); + set_precision_flag_up(); /* We do not really know if up or down */ +} + + +static void fprem(FPU_REG *st0_ptr, u_char st0_tag) +{ + do_fprem(st0_ptr, st0_tag, RC_CHOP); +} + + +static void fprem1(FPU_REG *st0_ptr, u_char st0_tag) +{ + do_fprem(st0_ptr, st0_tag, RC_RND); +} + + +static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag) +{ + u_char sign, sign1; + FPU_REG *st1_ptr = &st(1), a, b; + u_char st1_tag = FPU_gettagi(1); + + clear_C1(); + if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) + { + valid_yl2xp1: + + sign = getsign(st0_ptr); + sign1 = getsign(st1_ptr); + + FPU_to_exp16(st0_ptr, &a); + FPU_to_exp16(st1_ptr, &b); + + if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) ) + return; + + FPU_pop(); + return; + } + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + if ( st1_tag == TAG_Special ) + st1_tag = FPU_Special(st1_ptr); + + if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) + { + if ( denormal_operand() < 0 ) + return; + + 
goto valid_yl2xp1; + } + else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) ) + { + FPU_stack_underflow_pop(1); + return; + } + else if ( st0_tag == TAG_Zero ) + { + switch ( st1_tag ) + { + case TW_Denormal: + if ( denormal_operand() < 0 ) + return; + + case TAG_Zero: + case TAG_Valid: + setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr)); + FPU_copy_to_reg1(st0_ptr, st0_tag); + break; + + case TW_Infinity: + /* Infinity*log(1) */ + if ( arith_invalid(1) < 0 ) + return; + break; + + case TW_NaN: + if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) + return; + break; + + default: +#ifdef PARANOID + EXCEPTION(EX_INTERNAL | 0x116); + return; +#endif /* PARANOID */ + break; + } + } + else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) + { + switch ( st1_tag ) + { + case TAG_Zero: + if ( signnegative(st0_ptr) ) + { + if ( exponent(st0_ptr) >= 0 ) + { + /* st(0) holds <= -1.0 */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ + changesign(st1_ptr); +#else + if ( arith_invalid(1) < 0 ) + return; +#endif /* PECULIAR_486 */ + } + else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + else + changesign(st1_ptr); + } + else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + break; + + case TW_Infinity: + if ( signnegative(st0_ptr) ) + { + if ( (exponent(st0_ptr) >= 0) && + !((st0_ptr->sigh == 0x80000000) && + (st0_ptr->sigl == 0)) ) + { + /* st(0) holds < -1.0 */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). 
*/ + changesign(st1_ptr); +#else + if ( arith_invalid(1) < 0 ) return; +#endif /* PECULIAR_486 */ + } + else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + else + changesign(st1_ptr); + } + else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + break; + + case TW_NaN: + if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) + return; + } + + } + else if ( st0_tag == TW_NaN ) + { + if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) + return; + } + else if ( st0_tag == TW_Infinity ) + { + if ( st1_tag == TW_NaN ) + { + if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) + return; + } + else if ( signnegative(st0_ptr) ) + { +#ifndef PECULIAR_486 + /* This should have higher priority than denormals, but... */ + if ( arith_invalid(1) < 0 ) /* log(-infinity) */ + return; +#endif /* PECULIAR_486 */ + if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; +#ifdef PECULIAR_486 + /* Denormal operands actually get higher priority */ + if ( arith_invalid(1) < 0 ) /* log(-infinity) */ + return; +#endif /* PECULIAR_486 */ + } + else if ( st1_tag == TAG_Zero ) + { + /* log(infinity) */ + if ( arith_invalid(1) < 0 ) + return; + } + + /* st(1) must be valid here. */ + + else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + /* The Manual says that log(Infinity) is invalid, but a real + 80486 sensibly says that it is o.k. 
*/ + else + { + u_char sign = getsign(st1_ptr); + FPU_copy_to_reg1(&CONST_INF, TAG_Special); + setsign(st1_ptr, sign); + } + } +#ifdef PARANOID + else + { + EXCEPTION(EX_INTERNAL | 0x117); + return; + } +#endif /* PARANOID */ + + FPU_pop(); + return; + +} + + +static void fscale(FPU_REG *st0_ptr, u_char st0_tag) +{ + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + int old_cw = control_word; + u_char sign = getsign(st0_ptr); + + clear_C1(); + if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) + { + long scale; + FPU_REG tmp; + + /* Convert register for internal use. */ + setexponent16(st0_ptr, exponent(st0_ptr)); + + valid_scale: + + if ( exponent(st1_ptr) > 30 ) + { + /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */ + + if ( signpositive(st1_ptr) ) + { + EXCEPTION(EX_Overflow); + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + } + else + { + EXCEPTION(EX_Underflow); + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + } + setsign(st0_ptr, sign); + return; + } + + control_word &= ~CW_RC; + control_word |= RC_CHOP; + reg_copy(st1_ptr, &tmp); + FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */ + control_word = old_cw; + scale = signnegative(st1_ptr) ? 
-tmp.sigl : tmp.sigl; + scale += exponent16(st0_ptr); + + setexponent16(st0_ptr, scale); + + /* Use FPU_round() to properly detect under/overflow etc */ + FPU_round(st0_ptr, 0, 0, control_word, sign); + + return; + } + + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + if ( st1_tag == TAG_Special ) + st1_tag = FPU_Special(st1_ptr); + + if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) + { + switch ( st1_tag ) + { + case TAG_Valid: + /* st(0) must be a denormal */ + if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */ + goto valid_scale; + + case TAG_Zero: + if ( st0_tag == TW_Denormal ) + denormal_operand(); + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) + return; + + if ( signpositive(st1_ptr) ) + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + else + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(st0_ptr, sign); + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } + else if ( st0_tag == TAG_Zero ) + { + switch ( st1_tag ) + { + case TAG_Valid: + case TAG_Zero: + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if ( signpositive(st1_ptr) ) + arith_invalid(0); /* Zero scaled by +Infinity */ + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } + else if ( st0_tag == TW_Infinity ) + { + switch ( st1_tag ) + { + case TAG_Valid: + case TAG_Zero: + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if ( signnegative(st1_ptr) ) + arith_invalid(0); /* Infinity scaled by -Infinity */ + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } + else if ( st0_tag == TW_NaN ) + { + if ( st1_tag != TAG_Empty ) + { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; } + } + +#ifdef PARANOID + if 
( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) ) + { + EXCEPTION(EX_INTERNAL | 0x115); + return; + } +#endif + + /* At least one of st(0), st(1) must be empty */ + FPU_stack_underflow(); + +} + + +/*---------------------------------------------------------------------------*/ + +static FUNC_ST0 const trig_table_a[] = { + f2xm1, fyl2x, fptan, fpatan, + fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp +}; + +void FPU_triga(void) +{ + (trig_table_a[FPU_rm])(&st(0), FPU_gettag0()); +} + + +static FUNC_ST0 const trig_table_b[] = + { + fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos + }; + +void FPU_trigb(void) +{ + (trig_table_b[FPU_rm])(&st(0), FPU_gettag0()); +} diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c new file mode 100644 index 000000000000..91175738e948 --- /dev/null +++ b/arch/i386/math-emu/get_address.c @@ -0,0 +1,449 @@ +/*---------------------------------------------------------------------------+ + | get_address.c | + | | + | Get the effective address from an FPU instruction. | + | | + | Copyright (C) 1992,1993,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Note: | + | The file contains code which accesses user memory. | + | Emulator static data may change when user memory is accessed, due to | + | other processes using the emulator while swapping is in progress. 
| + +---------------------------------------------------------------------------*/ + + +#include <linux/stddef.h> + +#include <asm/uaccess.h> +#include <asm/desc.h> + +#include "fpu_system.h" +#include "exception.h" +#include "fpu_emu.h" + + +#define FPU_WRITE_BIT 0x10 + +static int reg_offset[] = { + offsetof(struct info,___eax), + offsetof(struct info,___ecx), + offsetof(struct info,___edx), + offsetof(struct info,___ebx), + offsetof(struct info,___esp), + offsetof(struct info,___ebp), + offsetof(struct info,___esi), + offsetof(struct info,___edi) +}; + +#define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info)) + +static int reg_offset_vm86[] = { + offsetof(struct info,___cs), + offsetof(struct info,___vm86_ds), + offsetof(struct info,___vm86_es), + offsetof(struct info,___vm86_fs), + offsetof(struct info,___vm86_gs), + offsetof(struct info,___ss), + offsetof(struct info,___vm86_ds) + }; + +#define VM86_REG_(x) (*(unsigned short *) \ + (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) + +/* These are dummy, fs and gs are not saved on the stack. */ +#define ___FS ___ds +#define ___GS ___ds + +static int reg_offset_pm[] = { + offsetof(struct info,___cs), + offsetof(struct info,___ds), + offsetof(struct info,___es), + offsetof(struct info,___FS), + offsetof(struct info,___GS), + offsetof(struct info,___ss), + offsetof(struct info,___ds) + }; + +#define PM_REG_(x) (*(unsigned short *) \ + (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info)) + + +/* Decode the SIB byte. 
This function assumes mod != 0 */ +static int sib(int mod, unsigned long *fpu_eip) +{ + u_char ss,index,base; + long offset; + + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */ + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + ss = base >> 6; + index = (base >> 3) & 7; + base &= 7; + + if ((mod == 0) && (base == 5)) + offset = 0; /* No base register */ + else + offset = REG_(base); + + if (index == 4) + { + /* No index register */ + /* A non-zero ss is illegal */ + if ( ss ) + EXCEPTION(EX_Invalid); + } + else + { + offset += (REG_(index)) << ss; + } + + if (mod == 1) + { + /* 8 bit signed displacement */ + long displacement; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(displacement, (signed char __user *) (*fpu_eip)); + offset += displacement; + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + } + else if (mod == 2 || base == 5) /* The second condition also has mod==0 */ + { + /* 32 bit displacement */ + long displacement; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(displacement, (long __user *) (*fpu_eip)); + offset += displacement; + RE_ENTRANT_CHECK_ON; + (*fpu_eip) += 4; + } + + return offset; +} + + +static unsigned long vm86_segment(u_char segment, + struct address *addr) +{ + segment--; +#ifdef PARANOID + if ( segment > PREFIX_SS_ ) + { + EXCEPTION(EX_INTERNAL|0x130); + math_abort(FPU_info,SIGSEGV); + } +#endif /* PARANOID */ + addr->selector = VM86_REG_(segment); + return (unsigned long)VM86_REG_(segment) << 4; +} + + +/* This should work for 16 and 32 bit protected mode. 
*/ +static long pm_address(u_char FPU_modrm, u_char segment, + struct address *addr, long offset) +{ + struct desc_struct descriptor; + unsigned long base_address, limit, address, seg_top; + unsigned short selector; + + segment--; + +#ifdef PARANOID + /* segment is unsigned, so this also detects if segment was 0: */ + if ( segment > PREFIX_SS_ ) + { + EXCEPTION(EX_INTERNAL|0x132); + math_abort(FPU_info,SIGSEGV); + } +#endif /* PARANOID */ + + switch ( segment ) + { + /* fs and gs aren't used by the kernel, so they still have their + user-space values. */ + case PREFIX_FS_-1: + /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register + in the assembler statement. */ + + __asm__("mov %%fs,%0":"=r" (selector)); + addr->selector = selector; + break; + case PREFIX_GS_-1: + /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register + in the assembler statement. */ + __asm__("mov %%gs,%0":"=r" (selector)); + addr->selector = selector; + break; + default: + addr->selector = PM_REG_(segment); + } + + descriptor = LDT_DESCRIPTOR(PM_REG_(segment)); + base_address = SEG_BASE_ADDR(descriptor); + address = base_address + offset; + limit = base_address + + (SEG_LIMIT(descriptor)+1) * SEG_GRANULARITY(descriptor) - 1; + if ( limit < base_address ) limit = 0xffffffff; + + if ( SEG_EXPAND_DOWN(descriptor) ) + { + if ( SEG_G_BIT(descriptor) ) + seg_top = 0xffffffff; + else + { + seg_top = base_address + (1 << 20); + if ( seg_top < base_address ) seg_top = 0xffffffff; + } + access_limit = + (address <= limit) || (address >= seg_top) ? 0 : + ((seg_top-address) >= 255 ? 255 : seg_top-address); + } + else + { + access_limit = + (address > limit) || (address < base_address) ? 0 : + ((limit-address) >= 254 ? 
255 : limit-address+1); + } + if ( SEG_EXECUTE_ONLY(descriptor) || + (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT)) ) + { + access_limit = 0; + } + return address; +} + + +/* + MOD R/M byte: MOD == 3 has a special use for the FPU + SIB byte used iff R/M = 100b + + 7 6 5 4 3 2 1 0 + ..... ......... ......... + MOD OPCODE(2) R/M + + + SIB byte + + 7 6 5 4 3 2 1 0 + ..... ......... ......... + SS INDEX BASE + +*/ + +void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, + struct address *addr, + fpu_addr_modes addr_modes) +{ + u_char mod; + unsigned rm = FPU_modrm & 7; + long *cpu_reg_ptr; + int address = 0; /* Initialized just to stop compiler warnings. */ + + /* Memory accessed via the cs selector is write protected + in `non-segmented' 32 bit protected mode. */ + if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) + && (addr_modes.override.segment == PREFIX_CS_) ) + { + math_abort(FPU_info,SIGSEGV); + } + + addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. 
*/ + + mod = (FPU_modrm >> 6) & 3; + + if (rm == 4 && mod != 3) + { + address = sib(mod, fpu_eip); + } + else + { + cpu_reg_ptr = & REG_(rm); + switch (mod) + { + case 0: + if (rm == 5) + { + /* Special case: disp32 */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(address, (unsigned long __user *) (*fpu_eip)); + (*fpu_eip) += 4; + RE_ENTRANT_CHECK_ON; + addr->offset = address; + return (void __user *) address; + } + else + { + address = *cpu_reg_ptr; /* Just return the contents + of the cpu register */ + addr->offset = address; + return (void __user *) address; + } + case 1: + /* 8 bit signed displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(address, (signed char __user *) (*fpu_eip)); + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + break; + case 2: + /* 32 bit displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(address, (long __user *) (*fpu_eip)); + (*fpu_eip) += 4; + RE_ENTRANT_CHECK_ON; + break; + case 3: + /* Not legal for the FPU */ + EXCEPTION(EX_Invalid); + } + address += *cpu_reg_ptr; + } + + addr->offset = address; + + switch ( addr_modes.default_mode ) + { + case 0: + break; + case VM86: + address += vm86_segment(addr_modes.override.segment, addr); + break; + case PM16: + case SEG32: + address = pm_address(FPU_modrm, addr_modes.override.segment, + addr, address); + break; + default: + EXCEPTION(EX_INTERNAL|0x133); + } + + return (void __user *)address; +} + + +void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, + struct address *addr, + fpu_addr_modes addr_modes) +{ + u_char mod; + unsigned rm = FPU_modrm & 7; + int address = 0; /* Default used for mod == 0 */ + + /* Memory accessed via the cs selector is write protected + in `non-segmented' 32 bit protected mode. 
*/ + if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) + && (addr_modes.override.segment == PREFIX_CS_) ) + { + math_abort(FPU_info,SIGSEGV); + } + + addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ + + mod = (FPU_modrm >> 6) & 3; + + switch (mod) + { + case 0: + if (rm == 6) + { + /* Special case: disp16 */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(2); + FPU_get_user(address, (unsigned short __user *) (*fpu_eip)); + (*fpu_eip) += 2; + RE_ENTRANT_CHECK_ON; + goto add_segment; + } + break; + case 1: + /* 8 bit signed displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(address, (signed char __user *) (*fpu_eip)); + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + break; + case 2: + /* 16 bit displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(2); + FPU_get_user(address, (unsigned short __user *) (*fpu_eip)); + (*fpu_eip) += 2; + RE_ENTRANT_CHECK_ON; + break; + case 3: + /* Not legal for the FPU */ + EXCEPTION(EX_Invalid); + break; + } + switch ( rm ) + { + case 0: + address += FPU_info->___ebx + FPU_info->___esi; + break; + case 1: + address += FPU_info->___ebx + FPU_info->___edi; + break; + case 2: + address += FPU_info->___ebp + FPU_info->___esi; + if ( addr_modes.override.segment == PREFIX_DEFAULT ) + addr_modes.override.segment = PREFIX_SS_; + break; + case 3: + address += FPU_info->___ebp + FPU_info->___edi; + if ( addr_modes.override.segment == PREFIX_DEFAULT ) + addr_modes.override.segment = PREFIX_SS_; + break; + case 4: + address += FPU_info->___esi; + break; + case 5: + address += FPU_info->___edi; + break; + case 6: + address += FPU_info->___ebp; + if ( addr_modes.override.segment == PREFIX_DEFAULT ) + addr_modes.override.segment = PREFIX_SS_; + break; + case 7: + address += FPU_info->___ebx; + break; + } + + add_segment: + address &= 0xffff; + + addr->offset = address; + + switch ( addr_modes.default_mode ) + { + case 0: + break; + case VM86: + address += 
vm86_segment(addr_modes.override.segment, addr); + break; + case PM16: + case SEG32: + address = pm_address(FPU_modrm, addr_modes.override.segment, + addr, address); + break; + default: + EXCEPTION(EX_INTERNAL|0x131); + } + + return (void __user *)address ; +} diff --git a/arch/i386/math-emu/load_store.c b/arch/i386/math-emu/load_store.c new file mode 100644 index 000000000000..85314be2fef8 --- /dev/null +++ b/arch/i386/math-emu/load_store.c @@ -0,0 +1,270 @@ +/*---------------------------------------------------------------------------+ + | load_store.c | + | | + | This file contains most of the code to interpret the FPU instructions | + | which load and store from user memory. | + | | + | Copyright (C) 1992,1993,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Note: | + | The file contains code which accesses user memory. | + | Emulator static data may change when user memory is accessed, due to | + | other processes using the emulator while swapping is in progress. 
| + +---------------------------------------------------------------------------*/ + +#include <asm/uaccess.h> + +#include "fpu_system.h" +#include "exception.h" +#include "fpu_emu.h" +#include "status_w.h" +#include "control_w.h" + + +#define _NONE_ 0 /* st0_ptr etc not needed */ +#define _REG0_ 1 /* Will be storing st(0) */ +#define _PUSH_ 3 /* Need to check for space to push onto stack */ +#define _null_ 4 /* Function illegal or not implemented */ + +#define pop_0() { FPU_settag0(TAG_Empty); top++; } + + +static u_char const type_table[32] = { + _PUSH_, _PUSH_, _PUSH_, _PUSH_, + _null_, _null_, _null_, _null_, + _REG0_, _REG0_, _REG0_, _REG0_, + _REG0_, _REG0_, _REG0_, _REG0_, + _NONE_, _null_, _NONE_, _PUSH_, + _NONE_, _PUSH_, _null_, _PUSH_, + _NONE_, _null_, _NONE_, _REG0_, + _NONE_, _REG0_, _NONE_, _REG0_ + }; + +u_char const data_sizes_16[32] = { + 4, 4, 8, 2, 0, 0, 0, 0, + 4, 4, 8, 2, 4, 4, 8, 2, + 14, 0, 94, 10, 2, 10, 0, 8, + 14, 0, 94, 10, 2, 10, 2, 8 +}; + +static u_char const data_sizes_32[32] = { + 4, 4, 8, 2, 0, 0, 0, 0, + 4, 4, 8, 2, 4, 4, 8, 2, + 28, 0,108, 10, 2, 10, 0, 8, + 28, 0,108, 10, 2, 10, 2, 8 +}; + +int FPU_load_store(u_char type, fpu_addr_modes addr_modes, + void __user *data_address) +{ + FPU_REG loaded_data; + FPU_REG *st0_ptr; + u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ + u_char loaded_tag; + + st0_ptr = NULL; /* Initialized just to stop compiler warnings. 
*/ + + if ( addr_modes.default_mode & PROTECTED ) + { + if ( addr_modes.default_mode == SEG32 ) + { + if ( access_limit < data_sizes_32[type] ) + math_abort(FPU_info,SIGSEGV); + } + else if ( addr_modes.default_mode == PM16 ) + { + if ( access_limit < data_sizes_16[type] ) + math_abort(FPU_info,SIGSEGV); + } +#ifdef PARANOID + else + EXCEPTION(EX_INTERNAL|0x140); +#endif /* PARANOID */ + } + + switch ( type_table[type] ) + { + case _NONE_: + break; + case _REG0_: + st0_ptr = &st(0); /* Some of these instructions pop after + storing */ + st0_tag = FPU_gettag0(); + break; + case _PUSH_: + { + if ( FPU_gettagi(-1) != TAG_Empty ) + { FPU_stack_overflow(); return 0; } + top--; + st0_ptr = &st(0); + } + break; + case _null_: + FPU_illegal(); + return 0; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL|0x141); + return 0; +#endif /* PARANOID */ + } + + switch ( type ) + { + case 000: /* fld m32real */ + clear_C1(); + loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data); + if ( (loaded_tag == TAG_Special) + && isNaN(&loaded_data) + && (real_1op_NaN(&loaded_data) < 0) ) + { + top++; + break; + } + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 001: /* fild m32int */ + clear_C1(); + loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 002: /* fld m64real */ + clear_C1(); + loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data); + if ( (loaded_tag == TAG_Special) + && isNaN(&loaded_data) + && (real_1op_NaN(&loaded_data) < 0) ) + { + top++; + break; + } + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 003: /* fild m16int */ + clear_C1(); + loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 010: /* fst m32real */ + clear_C1(); + FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address); + break; + case 011: /* fist m32int */ + clear_C1(); + 
FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address); + break; + case 012: /* fst m64real */ + clear_C1(); + FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address); + break; + case 013: /* fist m16int */ + clear_C1(); + FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address); + break; + case 014: /* fstp m32real */ + clear_C1(); + if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) ) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 015: /* fistp m32int */ + clear_C1(); + if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) ) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 016: /* fstp m64real */ + clear_C1(); + if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) ) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 017: /* fistp m16int */ + clear_C1(); + if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) ) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 020: /* fldenv m14/28byte */ + fldenv(addr_modes, (u_char __user *)data_address); + /* Ensure that the values just loaded are not changed by + fix-up operations. */ + return 1; + case 022: /* frstor m94/108byte */ + frstor(addr_modes, (u_char __user *)data_address); + /* Ensure that the values just loaded are not changed by + fix-up operations. 
*/ + return 1; + case 023: /* fbld m80dec */ + clear_C1(); + loaded_tag = FPU_load_bcd((u_char __user *)data_address); + FPU_settag0(loaded_tag); + break; + case 024: /* fldcw */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, data_address, 2); + FPU_get_user(control_word, (unsigned short __user *) data_address); + RE_ENTRANT_CHECK_ON; + if ( partial_status & ~control_word & CW_Exceptions ) + partial_status |= (SW_Summary | SW_Backward); + else + partial_status &= ~(SW_Summary | SW_Backward); +#ifdef PECULIAR_486 + control_word |= 0x40; /* An 80486 appears to always set this bit */ +#endif /* PECULIAR_486 */ + return 1; + case 025: /* fld m80real */ + clear_C1(); + loaded_tag = FPU_load_extended((long double __user *)data_address, 0); + FPU_settag0(loaded_tag); + break; + case 027: /* fild m64int */ + clear_C1(); + loaded_tag = FPU_load_int64((long long __user *)data_address); + FPU_settag0(loaded_tag); + break; + case 030: /* fstenv m14/28byte */ + fstenv(addr_modes, (u_char __user *)data_address); + return 1; + case 032: /* fsave */ + fsave(addr_modes, (u_char __user *)data_address); + return 1; + case 033: /* fbstp m80dec */ + clear_C1(); + if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) ) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 034: /* fstcw m16int */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,data_address,2); + FPU_put_user(control_word, (unsigned short __user *) data_address); + RE_ENTRANT_CHECK_ON; + return 1; + case 035: /* fstp m80real */ + clear_C1(); + if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) ) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 036: /* fstsw m2byte */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,data_address,2); + FPU_put_user(status_word(),(unsigned short __user *) data_address); + RE_ENTRANT_CHECK_ON; + return 1; + case 037: /* fistp 
m64int */ + clear_C1(); + if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) ) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + } + return 0; +} diff --git a/arch/i386/math-emu/mul_Xsig.S b/arch/i386/math-emu/mul_Xsig.S new file mode 100644 index 000000000000..717785a53eb4 --- /dev/null +++ b/arch/i386/math-emu/mul_Xsig.S @@ -0,0 +1,176 @@ +/*---------------------------------------------------------------------------+ + | mul_Xsig.S | + | | + | Multiply a 12 byte fixed point number by another fixed point number. | + | | + | Copyright (C) 1992,1994,1995 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@jacobi.maths.monash.edu.au | + | | + | Call from C as: | + | void mul32_Xsig(Xsig *x, unsigned b) | + | | + | void mul64_Xsig(Xsig *x, unsigned long long *b) | + | | + | void mul_Xsig_Xsig(Xsig *x, unsigned *b) | + | | + | The result is neither rounded nor normalized, and the ls bit or so may | + | be wrong. 
| + | | + +---------------------------------------------------------------------------*/ + .file "mul_Xsig.S" + + +#include "fpu_emu.h" + +.text +ENTRY(mul32_Xsig) /* Multiply the 96-bit Xsig at PARAM1 by the 32-bit value PARAM2, in place. The running result is built in three stack slots (-12, -8, -4 off %ebp = lsw, midw, msw) and copied back at the end. Only the upper 96 bits of the 128-bit product are kept: the low half of (lsl * b) is discarded. */ + pushl %ebp + movl %esp,%ebp + subl $16,%esp + pushl %esi + + movl PARAM1,%esi + movl PARAM2,%ecx + + xor %eax,%eax + movl %eax,-4(%ebp) + movl %eax,-8(%ebp) + + movl (%esi),%eax /* lsl of Xsig */ + mull %ecx /* b (the 32-bit multiplier); only the high half in %edx is kept */ + movl %edx,-12(%ebp) + + movl 4(%esi),%eax /* midl of Xsig */ + mull %ecx /* b (the 32-bit multiplier) */ + addl %eax,-12(%ebp) + adcl %edx,-8(%ebp) + adcl $0,-4(%ebp) + + movl 8(%esi),%eax /* msl of Xsig */ + mull %ecx /* b (the 32-bit multiplier) */ + addl %eax,-8(%ebp) + adcl %edx,-4(%ebp) + + movl -12(%ebp),%eax + movl %eax,(%esi) + movl -8(%ebp),%eax + movl %eax,4(%esi) + movl -4(%ebp),%eax + movl %eax,8(%esi) + + popl %esi + leave + ret + + +ENTRY(mul64_Xsig) /* Multiply the 96-bit Xsig at PARAM1 by the 64-bit value PARAM2 points to, in place, keeping the upper 96 bits of the product. The lsl(Xsig) * lsl(b) product is never formed, and only the high halves (%edx) of lsl * msl and midl * lsl are added, so carries out of the discarded low-order part are lost. */ + pushl %ebp + movl %esp,%ebp + subl $16,%esp + pushl %esi + + movl PARAM1,%esi + movl PARAM2,%ecx + + xor %eax,%eax + movl %eax,-4(%ebp) + movl %eax,-8(%ebp) + + movl (%esi),%eax /* lsl of Xsig */ + mull 4(%ecx) /* msl of b */ + movl %edx,-12(%ebp) + + movl 4(%esi),%eax /* midl of Xsig */ + mull (%ecx) /* lsl of b */ + addl %edx,-12(%ebp) + adcl $0,-8(%ebp) + adcl $0,-4(%ebp) + + movl 4(%esi),%eax /* midl of Xsig */ + mull 4(%ecx) /* msl of b */ + addl %eax,-12(%ebp) + adcl %edx,-8(%ebp) + adcl $0,-4(%ebp) + + movl 8(%esi),%eax /* msl of Xsig */ + mull (%ecx) /* lsl of b */ + addl %eax,-12(%ebp) + adcl %edx,-8(%ebp) + adcl $0,-4(%ebp) + + movl 8(%esi),%eax /* msl of Xsig */ + mull 4(%ecx) /* msl of b */ + addl %eax,-8(%ebp) + adcl %edx,-4(%ebp) + + movl -12(%ebp),%eax + movl %eax,(%esi) + movl -8(%ebp),%eax + movl %eax,4(%esi) + movl -4(%ebp),%eax + movl %eax,8(%esi) + + popl %esi + leave + ret + + + +ENTRY(mul_Xsig_Xsig) /* Multiply the 96-bit Xsig at PARAM1 by the 96-bit Xsig PARAM2 points to, in place, keeping the upper 96 bits of the product. The three lowest-order partial products (lsl * lsl, lsl * midl, midl * lsl) are skipped, and only the high halves (%edx) of the next three are added. */ + pushl %ebp + movl %esp,%ebp + subl $16,%esp + pushl %esi + + movl PARAM1,%esi + movl PARAM2,%ecx + + xor %eax,%eax + movl %eax,-4(%ebp) + movl %eax,-8(%ebp) + + movl (%esi),%eax /* lsl of Xsig */ + mull 8(%ecx) /* msl of b */ + movl
%edx,-12(%ebp) + + movl 4(%esi),%eax /* midl of Xsig */ + mull 4(%ecx) /* midl of b */ + addl %edx,-12(%ebp) + adcl $0,-8(%ebp) + adcl $0,-4(%ebp) + + movl 8(%esi),%eax /* msl of Xsig */ + mull (%ecx) /* lsl of b */ + addl %edx,-12(%ebp) + adcl $0,-8(%ebp) + adcl $0,-4(%ebp) + + movl 4(%esi),%eax /* midl of Xsig */ + mull 8(%ecx) /* msl of b */ + addl %eax,-12(%ebp) + adcl %edx,-8(%ebp) + adcl $0,-4(%ebp) + + movl 8(%esi),%eax /* msl of Xsig */ + mull 4(%ecx) /* midl of b */ + addl %eax,-12(%ebp) + adcl %edx,-8(%ebp) + adcl $0,-4(%ebp) + + movl 8(%esi),%eax /* msl of Xsig */ + mull 8(%ecx) /* msl of b */ + addl %eax,-8(%ebp) + adcl %edx,-4(%ebp) + + movl -12(%ebp),%edx + movl %edx,(%esi) + movl -8(%ebp),%edx + movl %edx,4(%esi) + movl -4(%ebp),%edx + movl %edx,8(%esi) + + popl %esi + leave + ret + diff --git a/arch/i386/math-emu/poly.h b/arch/i386/math-emu/poly.h new file mode 100644 index 000000000000..4db798114923 --- /dev/null +++ b/arch/i386/math-emu/poly.h @@ -0,0 +1,121 @@ +/*---------------------------------------------------------------------------+ + | poly.h | + | | + | Header file for the FPU-emu poly*.c source files. | + | | + | Copyright (C) 1994,1999 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@melbpc.org.au | + | | + | Declarations and definitions for functions operating on Xsig (12-byte | + | extended-significand) quantities. | + | | + +---------------------------------------------------------------------------*/ + +#ifndef _POLY_H +#define _POLY_H + +/* This 12-byte structure is used to improve the accuracy of computation + of transcendental functions. + Intended to be used to get results better than 8-byte computation + allows. 9-byte would probably be sufficient.
+ */ +typedef struct { + unsigned long lsw; + unsigned long midw; + unsigned long msw; +} Xsig; /* fields are declared least significant word first */ + +asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b, + unsigned long long *result); +asmlinkage void polynomial_Xsig(Xsig *, const unsigned long long *x, + const unsigned long long terms[], const int n); + +asmlinkage void mul32_Xsig(Xsig *, const unsigned long mult); +asmlinkage void mul64_Xsig(Xsig *, const unsigned long long *mult); +asmlinkage void mul_Xsig_Xsig(Xsig *dest, const Xsig *mult); + +asmlinkage void shr_Xsig(Xsig *, const int n); +asmlinkage int round_Xsig(Xsig *); +asmlinkage int norm_Xsig(Xsig *); +asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest); /* NOTE(review): dest is declared const, yet callers in the poly_*.c files read their result from the object passed as dest; the implementation is not visible here - confirm whether the const is intended. */ + +/* Macro to extract the most significant 32 bits from a long long. + The argument is expanded without parentheses, so pass a plain lvalue. */ +#define LL_MSW(x) (((unsigned long *)&x)[1]) + +/* Macro to initialize an Xsig struct. Arguments are given most significant + word first (a = msw, b = midw, c = lsw), the reverse of the field order. */ +#define MK_XSIG(a,b,c) { c, b, a } + +/* Macro to access the 8 ms bytes of an Xsig as a long long (midw and msw + viewed together). The argument is expanded without parentheses. */ +#define XSIG_LL(x) (*(unsigned long long *)&x.midw) + + +/* + Need to run gcc with optimizations on to get these to + actually be in-line. + */ + +/* Multiply two fixed-point 32 bit numbers, producing a 32 bit result. + The answer is the ms word of the product. */ +/* Some versions of gcc make it difficult to stop eax from being clobbered. + Merely specifying that it is used doesn't work... + */ +static inline unsigned long mul_32_32(const unsigned long arg1, + const unsigned long arg2) +{ + int retval; /* NOTE(review): declared int although the function returns unsigned long - presumably harmless where both are 32 bits wide; confirm. */ + asm volatile ("mull %2; movl %%edx,%%eax" \ + :"=a" (retval) \ + :"0" (arg1), "g" (arg2) \ + :"dx"); + return retval; +} + + +/* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow.
*/ +static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2) +{ /* One 96-bit add done as addl on the low word followed by adcl on the two higher words; a carry out of the top word is not reported. */ + asm volatile ("movl %1,%%edi; movl %2,%%esi;\n" + "movl (%%esi),%%eax; addl %%eax,(%%edi);\n" + "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n" + "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n" + :"=g" (*dest):"g" (dest), "g" (x2) + :"ax","si","di"); +} + + +/* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */ +/* Note: the constraints in the asm statement didn't always work properly + with gcc 2.5.8. Changing from using edi to using ecx got around the + problem, but keep fingers crossed! */ +static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp) +{ /* Same 96-bit add as add_Xsig_Xsig. If it carries out of the top word (the jnc is not taken), the sum is rotated right one bit through the carry flag and *exp is incremented. %eax is left as 1 or 0 accordingly, but no output operand captures it. */ + asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n" + "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n" + "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n" + "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n" + "jnc 0f;\n" + "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n" + "movl %4,%%ecx; incl (%%ecx)\n" + "movl $1,%%eax; jmp 1f;\n" + "0: xorl %%eax,%%eax;\n" + "1:\n" + :"=g" (*exp), "=g" (*dest) + :"g" (dest), "g" (x2), "g" (exp) + :"cx","si","ax"); +} + + +/* Negate (subtract from 1.0) the 12 byte Xsig */ +/* This is faster in a loop on my 386 than using the "neg" instruction. */ +static inline void negate_Xsig(Xsig *x) +{ /* Computes 0 - *x across all three words (subl on the low word, then sbbl to propagate the borrow) and stores each word back in place. */ + asm volatile("movl %1,%%esi;\n" + "xorl %%ecx,%%ecx;\n" + "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n" + "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n" + "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n" + :"=g" (*x):"g" (x):"si","ax","cx"); +} + +#endif /* _POLY_H */ diff --git a/arch/i386/math-emu/poly_2xm1.c b/arch/i386/math-emu/poly_2xm1.c new file mode 100644 index 000000000000..9766ad5e9743 --- /dev/null +++ b/arch/i386/math-emu/poly_2xm1.c @@ -0,0 +1,156 @@ +/*---------------------------------------------------------------------------+ + | poly_2xm1.c | + | | + | Function to compute 2^x-1 by a polynomial approximation.
| + | | + | Copyright (C) 1992,1993,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "reg_constant.h" +#include "fpu_emu.h" +#include "fpu_system.h" +#include "control_w.h" +#include "poly.h" + + +#define HIPOWER 11 +static const unsigned long long lterms[HIPOWER] = +{ + 0x0000000000000000LL, /* This term done separately as 12 bytes */ + 0xf5fdeffc162c7543LL, + 0x1c6b08d704a0bfa6LL, + 0x0276556df749cc21LL, + 0x002bb0ffcf14f6b8LL, + 0x0002861225ef751cLL, + 0x00001ffcbfcd5422LL, + 0x00000162c005d5f1LL, + 0x0000000da96ccb1bLL, + 0x0000000078d1b897LL, + 0x000000000422b029LL +}; + +static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194); + +/* Four slices: 0.0 : 0.25 : 0.50 : 0.75 : 1.0, + These numbers are 2^(1/4), 2^(1/2), and 2^(3/4) + */ +static const Xsig shiftterm0 = MK_XSIG(0, 0, 0); +static const Xsig shiftterm1 = MK_XSIG(0x9837f051, 0x8db8a96f, 0x46ad2318); +static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3); +static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9); + +static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1, + &shiftterm2, &shiftterm3 }; + + +/*--- poly_2xm1() -----------------------------------------------------------+ + | Requires st(0) which is TAG_Valid and < 1. | + +---------------------------------------------------------------------------*/ +int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result) +{ /* Returns 0 after storing the rounded result in st(0) and setting its tag. With PARANOID defined, returns 1 without storing anything when exponent16(arg) >= 0. */ + long int exponent, shift; /* shift indexes shiftterm[]: 0 means no range reduction, 1 to 3 mean 0.25, 0.5 or 0.75 was subtracted from the argument */ + unsigned long long Xll; + Xsig accumulator, Denom, argSignif; + u_char tag; + + exponent = exponent16(arg); + +#ifdef PARANOID + if ( exponent >= 0 ) /* Don't want a |number| >= 1.0 */ + { + /* Number negative, too large, or not Valid.
*/ + EXCEPTION(EX_INTERNAL|0x127); + return 1; + } +#endif /* PARANOID */ + + argSignif.lsw = 0; + XSIG_LL(argSignif) = Xll = significand(arg); + + if ( exponent == -1 ) + { + shift = (argSignif.msw & 0x40000000) ? 3 : 2; + /* subtract 0.5 or 0.75 */ + exponent -= 2; + XSIG_LL(argSignif) <<= 2; + Xll <<= 2; + } + else if ( exponent == -2 ) + { + shift = 1; + /* subtract 0.25 */ + exponent--; + XSIG_LL(argSignif) <<= 1; + Xll <<= 1; + } + else + shift = 0; + + if ( exponent < -2 ) + { + /* Shift the argument right by the required places. */ + if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U ) + Xll++; /* round up */ + } + + accumulator.lsw = accumulator.midw = accumulator.msw = 0; + polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1); + mul_Xsig_Xsig(&accumulator, &argSignif); + shr_Xsig(&accumulator, 3); + + mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */ + add_two_Xsig(&accumulator, &argSignif, &exponent); + + if ( shift ) + { + /* The argument is large, use the identity: + f(x+a) = f(a) * (f(x) + 1) - 1; + */ + shr_Xsig(&accumulator, - exponent); + accumulator.msw |= 0x80000000; /* add 1.0 */ + mul_Xsig_Xsig(&accumulator, shiftterm[shift]); + accumulator.msw &= 0x3fffffff; /* subtract 1.0 */ + exponent = 1; + } + + if ( sign != SIGN_POS ) + { + /* The argument is negative, use the identity: + f(-x) = -f(x) / (1 + f(x)) + */ + Denom.lsw = accumulator.lsw; + XSIG_LL(Denom) = XSIG_LL(accumulator); + if ( exponent < 0 ) + shr_Xsig(&Denom, - exponent); + else if ( exponent > 0 ) + { + /* exponent must be 1 here */ + XSIG_LL(Denom) <<= 1; + if ( Denom.lsw & 0x80000000 ) + XSIG_LL(Denom) |= 1; + (Denom.lsw) <<= 1; + } + Denom.msw |= 0x80000000; /* add 1.0 */ + div_Xsig(&accumulator, &Denom, &accumulator); + } + + /* Convert to 64 bit signed-compatible */ + exponent += round_Xsig(&accumulator); + + result = &st(0); /* the result pointer passed in by the caller is discarded here */ + significand(result) = XSIG_LL(accumulator); + setexponent16(result, exponent); + + tag = FPU_round(result, 1, 0, FULL_PRECISION, sign); +
setsign(result, sign); + FPU_settag0(tag); + + return 0; + +} diff --git a/arch/i386/math-emu/poly_atan.c b/arch/i386/math-emu/poly_atan.c new file mode 100644 index 000000000000..82f702952f69 --- /dev/null +++ b/arch/i386/math-emu/poly_atan.c @@ -0,0 +1,229 @@ +/*---------------------------------------------------------------------------+ + | poly_atan.c | + | | + | Compute the arctan of a FPU_REG, using a polynomial approximation. | + | | + | Copyright (C) 1992,1993,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "reg_constant.h" +#include "fpu_emu.h" +#include "fpu_system.h" +#include "status_w.h" +#include "control_w.h" +#include "poly.h" + + +#define HIPOWERon 6 /* odd poly, negative terms */ +static const unsigned long long oddnegterms[HIPOWERon] = +{ + 0x0000000000000000LL, /* Dummy (not for - 1.0) */ + 0x015328437f756467LL, + 0x0005dda27b73dec6LL, + 0x0000226bf2bfb91aLL, + 0x000000ccc439c5f7LL, + 0x0000000355438407LL +} ; + +#define HIPOWERop 6 /* odd poly, positive terms */ +static const unsigned long long oddplterms[HIPOWERop] = +{ +/* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */ + 0x0db55a71875c9ac2LL, + 0x0029fce2d67880b0LL, + 0x0000dfd3908b4596LL, + 0x00000550fd61dab4LL, + 0x0000001c9422b3f9LL, + 0x000000003e3301e1LL +}; + +static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL; + +static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa); + +static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b); + + +/*--- poly_atan() -----------------------------------------------------------+ + | | + +---------------------------------------------------------------------------*/ +void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, + FPU_REG *st1_ptr, u_char st1_tag) +{ /* The result is written into *st1_ptr and stack entry 1 is retagged. An operand whose tag is not TAG_Valid is first rewritten in place by FPU_to_exp16. */ + u_char transformed, inverted, + sign1, sign2; + int exponent;
+ long int dummy_exp; /* NOTE(review): passed to add_two_Xsig below without being initialised; add_two_Xsig may increment it, and the value is never read afterwards. */ + Xsig accumulator, Numer, Denom, accumulatore, argSignif, + argSq, argSqSq; + u_char tag; + + sign1 = getsign(st0_ptr); + sign2 = getsign(st1_ptr); + if ( st0_tag == TAG_Valid ) + { + exponent = exponent(st0_ptr); + } + else + { + /* This gives non-compatible stack contents... */ + FPU_to_exp16(st0_ptr, st0_ptr); + exponent = exponent16(st0_ptr); + } + if ( st1_tag == TAG_Valid ) + { + exponent -= exponent(st1_ptr); + } + else + { + /* This gives non-compatible stack contents... */ + FPU_to_exp16(st1_ptr, st1_ptr); + exponent -= exponent16(st1_ptr); + } + + if ( (exponent < 0) || ((exponent == 0) && + ((st0_ptr->sigh < st1_ptr->sigh) || + ((st0_ptr->sigh == st1_ptr->sigh) && + (st0_ptr->sigl < st1_ptr->sigl))) ) ) + { + inverted = 1; /* st(0) has the smaller magnitude, so the ratio formed is st(0) / st(1) */ + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = significand(st0_ptr); + XSIG_LL(Denom) = significand(st1_ptr); + } + else + { + inverted = 0; + exponent = -exponent; + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = significand(st1_ptr); + XSIG_LL(Denom) = significand(st0_ptr); + } + div_Xsig(&Numer, &Denom, &argSignif); + exponent += norm_Xsig(&argSignif); + + if ( (exponent >= -1) + || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) ) + { + /* The argument is greater than sqrt(2)-1 (=0.414213562...)
*/ + /* Convert the argument by an identity for atan */ + transformed = 1; + + if ( exponent >= 0 ) + { +#ifdef PARANOID + if ( !( (exponent == 0) && + (argSignif.lsw == 0) && (argSignif.midw == 0) && + (argSignif.msw == 0x80000000) ) ) + { + EXCEPTION(EX_INTERNAL|0x104); /* There must be a logic error */ + return; + } +#endif /* PARANOID */ + argSignif.msw = 0; /* Make the transformed arg -> 0.0 */ + } + else + { + Numer.lsw = Denom.lsw = argSignif.lsw; + XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif); + + if ( exponent < -1 ) + shr_Xsig(&Numer, -1-exponent); + negate_Xsig(&Numer); + + shr_Xsig(&Denom, -exponent); + Denom.msw |= 0x80000000; + + div_Xsig(&Numer, &Denom, &argSignif); + + exponent = -1 + norm_Xsig(&argSignif); + } + } + else + { + transformed = 0; + } + + argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw; + argSq.msw = argSignif.msw; + mul_Xsig_Xsig(&argSq, &argSq); + + argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw; + mul_Xsig_Xsig(&argSqSq, &argSqSq); + + accumulatore.lsw = argSq.lsw; + XSIG_LL(accumulatore) = XSIG_LL(argSq); + + shr_Xsig(&argSq, 2*(-1-exponent-1)); + shr_Xsig(&argSqSq, 4*(-1-exponent-1)); + + /* Now have argSq etc with binary point at the left + .1xxxxxxxx */ + + /* Do the basic fixed point polynomial evaluation */ + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), + oddplterms, HIPOWERop-1); + mul64_Xsig(&accumulator, &XSIG_LL(argSq)); + negate_Xsig(&accumulator); + polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1); + negate_Xsig(&accumulator); + add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp); + + mul64_Xsig(&accumulatore, &denomterm); + shr_Xsig(&accumulatore, 1 + 2*(-1-exponent)); + accumulatore.msw |= 0x80000000; + + div_Xsig(&accumulator, &accumulatore, &accumulator); + + mul_Xsig_Xsig(&accumulator, &argSignif); + mul_Xsig_Xsig(&accumulator, &argSq); + + shr_Xsig(&accumulator, 3); +
negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &argSignif); + + if ( transformed ) + { + /* compute pi/4 - accumulator */ + shr_Xsig(&accumulator, -1-exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = -1; + } + + if ( inverted ) + { + /* compute pi/2 - accumulator */ + shr_Xsig(&accumulator, -exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = 0; + } + + if ( sign1 ) + { + /* compute pi - accumulator */ + shr_Xsig(&accumulator, 1 - exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = 1; + } + + exponent += round_Xsig(&accumulator); + + significand(st1_ptr) = XSIG_LL(accumulator); + setexponent16(st1_ptr, exponent); + + tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2); + FPU_settagi(1, tag); + + set_precision_flag_up(); /* We do not really know if up or down, + use this as the default. */ + +} diff --git a/arch/i386/math-emu/poly_l2.c b/arch/i386/math-emu/poly_l2.c new file mode 100644 index 000000000000..dd00e1d5b074 --- /dev/null +++ b/arch/i386/math-emu/poly_l2.c @@ -0,0 +1,272 @@ +/*---------------------------------------------------------------------------+ + | poly_l2.c | + | | + | Compute the base 2 log of a FPU_REG, using a polynomial approximation. | + | | + | Copyright (C) 1992,1993,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + + +#include "exception.h" +#include "reg_constant.h" +#include "fpu_emu.h" +#include "fpu_system.h" +#include "control_w.h" +#include "poly.h" + + +static void log2_kernel(FPU_REG const *arg, u_char argsign, + Xsig *accum_result, long int *expon); + + +/*--- poly_l2() -------------------------------------------------------------+ + | Base 2 logarithm by a polynomial approximation.
| + +---------------------------------------------------------------------------*/ +void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign) +{ /* The result is written into *st1_ptr and stack entry 1 is retagged. The significand of st(1) is multiplied into the accumulated logarithm and its exponent is added, so the stored value is the log term scaled by st(1); the sign passed to FPU_round is sign ^ st1_sign. If the rounded accumulator has a zero msw, CONST_Z is loaded through FPU_copy_to_reg1 instead. */ + long int exponent, expon, expon_expon; + Xsig accumulator, expon_accum, yaccum; + u_char sign, argsign; + FPU_REG x; + int tag; + + exponent = exponent16(st0_ptr); + + /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */ + if ( st0_ptr->sigh > (unsigned)0xb504f334 ) + { + /* Treat as sqrt(2)/2 < st0_ptr < 1 */ + significand(&x) = - significand(st0_ptr); + setexponent16(&x, -1); + exponent++; + argsign = SIGN_NEG; + } + else + { + /* Treat as 1 <= st0_ptr < sqrt(2) */ + x.sigh = st0_ptr->sigh - 0x80000000; + x.sigl = st0_ptr->sigl; + setexponent16(&x, 0); + argsign = SIGN_POS; + } + tag = FPU_normalize_nuo(&x); + + if ( tag == TAG_Zero ) + { + expon = 0; + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + } + else + { + log2_kernel(&x, argsign, &accumulator, &expon); + } + + if ( exponent < 0 ) + { + sign = SIGN_NEG; + exponent = -exponent; + } + else + sign = SIGN_POS; + expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0; /* the integer part of the result, normalised below when it is non-zero */ + if ( exponent ) + { + expon_expon = 31 + norm_Xsig(&expon_accum); + shr_Xsig(&accumulator, expon_expon - expon); + + if ( sign ^ argsign ) + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &expon_accum); + } + else + { + expon_expon = expon; + sign = argsign; + } + + yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr); + mul_Xsig_Xsig(&accumulator, &yaccum); + + expon_expon += round_Xsig(&accumulator); + + if ( accumulator.msw == 0 ) + { + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + return; + } + + significand(st1_ptr) = XSIG_LL(accumulator); + setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1); + + tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign); + FPU_settagi(1, tag); + + set_precision_flag_up(); /* 80486 appears to always do this */ + + return; + +} + + +/*--- poly_l2p1()
-----------------------------------------------------------+ + | Base 2 logarithm by a polynomial approximation. | + | log2(x+1) | + +---------------------------------------------------------------------------*/ +int poly_l2p1(u_char sign0, u_char sign1, + FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest) +{ /* Returns 0, except that when PECULIAR_486 is not defined it returns 1 if arith_invalid(1) reports a negative value for a negative, out-of-range st(0). The computed value is written through dest while the tag is applied to stack entry 1 - presumably dest is st(1); confirm against the caller. */ + u_char tag; + long int exponent; + Xsig accumulator, yaccum; + + if ( exponent16(st0_ptr) < 0 ) + { + log2_kernel(st0_ptr, sign0, &accumulator, &exponent); + + yaccum.lsw = 0; + XSIG_LL(yaccum) = significand(st1_ptr); + mul_Xsig_Xsig(&accumulator, &yaccum); + + exponent += round_Xsig(&accumulator); + + exponent += exponent16(st1_ptr) + 1; + if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER; + + significand(dest) = XSIG_LL(accumulator); + setexponent16(dest, exponent); + + tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1); + FPU_settagi(1, tag); + + if ( tag == TAG_Valid ) + set_precision_flag_up(); /* 80486 appears to always do this */ + } + else + { + /* The magnitude of st0_ptr is far too large. */ + + if ( sign0 != SIGN_POS ) + { + /* Trying to get the log of a negative number. */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative).
*/ + changesign(st1_ptr); +#else + if ( arith_invalid(1) < 0 ) + return 1; +#endif /* PECULIAR_486 */ + } + + /* 80486 appears to do this */ + if ( sign0 == SIGN_NEG ) + set_precision_flag_down(); + else + set_precision_flag_up(); + } + + /* NOTE(review): the else path above never writes through dest, yet the exponent of dest is tested here on both paths - confirm this is intended. */ + if ( exponent(dest) <= EXP_UNDER ) + EXCEPTION(EX_Underflow); + + return 0; + +} + + + + +#undef HIPOWER +#define HIPOWER 10 +static const unsigned long long logterms[HIPOWER] = +{ + 0x2a8eca5705fc2ef0LL, + 0xf6384ee1d01febceLL, + 0x093bb62877cdf642LL, + 0x006985d8a9ec439bLL, + 0x0005212c4f55a9c8LL, + 0x00004326a16927f0LL, + 0x0000038d1d80a0e7LL, + 0x0000003141cc80c6LL, + 0x00000002b1668c9fLL, + 0x000000002c7a46aaLL +}; + +static const unsigned long leadterm = 0xb8000000; + + +/*--- log2_kernel() ---------------------------------------------------------+ + | Base 2 logarithm by a polynomial approximation. | + | log2(x+1) | + +---------------------------------------------------------------------------*/ +static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result, + long int *expon) +{ /* Outputs: the three words of the result are copied into *accum_result and its exponent (plus one) is stored in *expon. arg itself is only read. */ + long int exponent, adj; + unsigned long long Xsq; + Xsig accumulator, Numer, Denom, argSignif, arg_signif; + + exponent = exponent16(arg); + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg); + if ( argsign == SIGN_POS ) + { + shr_Xsig(&Denom, 2 - (1 + exponent)); + Denom.msw |= 0x80000000; + div_Xsig(&Numer, &Denom, &argSignif); + } + else + { + shr_Xsig(&Denom, 1 - (1 + exponent)); + negate_Xsig(&Denom); + if ( Denom.msw & 0x80000000 ) + { + div_Xsig(&Numer, &Denom, &argSignif); + exponent ++; + } + else + { + /* Denom must be 1.0 */ + argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw; + argSignif.msw = Numer.msw; + } + } + +#ifndef PECULIAR_486 + /* Should check here that |local_arg| is within the valid range */ + /* NOTE(review): the check below has an empty body, so nothing is done when the argument is found to be too large. */ + if ( exponent >= -2 ) + { + if ( (exponent > -2) || + (argSignif.msw > (unsigned)0xafb0ccc0) ) + { + /* The argument is too large */ + } + } +#endif /* PECULIAR_486 */ + +
arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif); + adj = norm_Xsig(&argSignif); + accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif); + mul_Xsig_Xsig(&accumulator, &accumulator); + shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj))); + Xsq = XSIG_LL(accumulator); + if ( accumulator.lsw & 0x80000000 ) + Xsq++; + + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + /* Do the basic fixed point polynomial evaluation */ + polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1); + + mul_Xsig_Xsig(&accumulator, &argSignif); + shr_Xsig(&accumulator, 6 - adj); + + mul32_Xsig(&arg_signif, leadterm); + add_two_Xsig(&accumulator, &arg_signif, &exponent); + + *expon = exponent + 1; + accum_result->lsw = accumulator.lsw; + accum_result->midw = accumulator.midw; + accum_result->msw = accumulator.msw; + +} diff --git a/arch/i386/math-emu/poly_sin.c b/arch/i386/math-emu/poly_sin.c new file mode 100644 index 000000000000..a36313fb06f1 --- /dev/null +++ b/arch/i386/math-emu/poly_sin.c @@ -0,0 +1,397 @@ +/*---------------------------------------------------------------------------+ + | poly_sin.c | + | | + | Computation of an approximation of the sin function and the cosine | + | function by a polynomial. | + | | + | Copyright (C) 1992,1993,1994,1997,1999 | + | W.
Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@melbpc.org.au | + | | + | | + +---------------------------------------------------------------------------*/ + + +#include "exception.h" +#include "reg_constant.h" +#include "fpu_emu.h" +#include "fpu_system.h" +#include "control_w.h" +#include "poly.h" + + +/* The _l tables below are used by the small-argument path of poly_sine() and + the large-argument path of poly_cos(); the _h tables further down are used + by the other two paths. */ +#define N_COEFF_P 4 +#define N_COEFF_N 4 + +static const unsigned long long pos_terms_l[N_COEFF_P] = +{ + 0xaaaaaaaaaaaaaaabLL, + 0x00d00d00d00cf906LL, + 0x000006b99159a8bbLL, + 0x000000000d7392e6LL +}; + +static const unsigned long long neg_terms_l[N_COEFF_N] = +{ + 0x2222222222222167LL, + 0x0002e3bc74aab624LL, + 0x0000000b09229062LL, + 0x00000000000c7973LL +}; + + + +#define N_COEFF_PH 4 +#define N_COEFF_NH 4 +static const unsigned long long pos_terms_h[N_COEFF_PH] = +{ + 0x0000000000000000LL, + 0x05b05b05b05b0406LL, + 0x000049f93edd91a9LL, + 0x00000000c9c9ed62LL +}; + +static const unsigned long long neg_terms_h[N_COEFF_NH] = +{ + 0xaaaaaaaaaaaaaa98LL, + 0x001a01a01a019064LL, + 0x0000008f76c68a77LL, + 0x0000000000d58f5eLL +}; + + +/*--- poly_sine() -----------------------------------------------------------+ + | | + +---------------------------------------------------------------------------*/ +void poly_sine(FPU_REG *st0_ptr) +{ /* Builds the result in a local FPU_REG, gives it the sign of the argument, and loads it into st(0) through FPU_copy_to_reg0 with TAG_Valid. */ + int exponent, echange; + Xsig accumulator, argSqrd, argTo4; + unsigned long fix_up, adj; + unsigned long long fixed_arg; + FPU_REG result; + + exponent = exponent(st0_ptr); + + accumulator.lsw = accumulator.midw = accumulator.msw = 0; + + /* Split into two ranges, for arguments below and above 1.0 */ + /* The boundary between upper and lower is approx 0.88309101259 */ + if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) ) + { + /* The argument is <= 0.88309101259 */ + + argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &significand(st0_ptr)); + shr_Xsig(&argSqrd, 2*(-1-exponent)); + argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw;
+ argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, + N_COEFF_N-1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, + N_COEFF_P-1); + + shr_Xsig(&accumulator, 2); /* Divide by four */ + accumulator.msw |= 0x80000000; /* Add 1.0 */ + + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); + + /* Divide by four, FPU_REG compatible, etc */ + exponent = 3*exponent; + + /* The minimum exponent difference is 3 */ + shr_Xsig(&accumulator, exponent(st0_ptr) - exponent); + + negate_Xsig(&accumulator); + XSIG_LL(accumulator) += significand(st0_ptr); + + echange = round_Xsig(&accumulator); + + setexponentpos(&result, exponent(st0_ptr) + echange); + } + else + { + /* The argument is > 0.88309101259 */ + /* We use sin(st(0)) = cos(pi/2-st(0)) */ + + fixed_arg = significand(st0_ptr); + + if ( exponent == 0 ) + { + /* The argument is >= 1.0 */ + + /* Put the binary point at the left. */ + fixed_arg <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + fixed_arg = 0x921fb54442d18469LL - fixed_arg; + /* There is a special case which arises due to rounding, to fix here.
*/ + if ( fixed_arg == 0xffffffffffffffffLL ) + fixed_arg = 0; + + XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &fixed_arg); + + XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, + N_COEFF_NH-1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, + N_COEFF_PH-1); + negate_Xsig(&accumulator); + + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); + + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); + + add_Xsig_Xsig(&accumulator, &argSqrd); + + shr_Xsig(&accumulator, 1); + + accumulator.lsw |= 1; /* A zero accumulator here would cause problems */ + negate_Xsig(&accumulator); + + /* The basic computation is complete. Now fix the answer to + compensate for the error due to the approximation used for + pi/2 + */ + + /* This has an exponent of -65 */ + fix_up = 0x898cc517; + /* The fix-up needs to be improved for larger args */ + if ( argSqrd.msw & 0xffc00000 ) + { + /* Get about 32 bit precision in these: */ + fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6; + } + fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg)); + + adj = accumulator.lsw; /* temp save */ + accumulator.lsw -= fix_up; + if ( accumulator.lsw > adj ) /* the subtraction wrapped, so borrow from the upper 64 bits */ + XSIG_LL(accumulator) --; + + echange = round_Xsig(&accumulator); + + setexponentpos(&result, echange - 1); + } + + significand(&result) = XSIG_LL(accumulator); + setsign(&result, getsign(st0_ptr)); + FPU_copy_to_reg0(&result, TAG_Valid); + +#ifdef PARANOID + if ( (exponent(&result) >= 0) + && (significand(&result) > 0x8000000000000000LL) ) + { + EXCEPTION(EX_INTERNAL|0x150); + } +#endif /* PARANOID */ + +} + + + +/*--- poly_cos() ------------------------------------------------------------+ + | | + +---------------------------------------------------------------------------*/ +void poly_cos(FPU_REG *st0_ptr)
+{ /* Loads the result into st(0) through FPU_copy_to_reg0. The small-argument path returns early with CONST_1 when the rounded accumulator has a zero msw. With PARANOID defined, an argument above 0xc90fdaa22168c234 at exponent 0, or with any positive exponent, raises EX_Invalid and loads CONST_QNaN instead. */ + FPU_REG result; + long int exponent, exp2, echange; + Xsig accumulator, argSqrd, fix_up, argTo4; + unsigned long long fixed_arg; + +#ifdef PARANOID + if ( (exponent(st0_ptr) > 0) + || ((exponent(st0_ptr) == 0) + && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) ) + { + EXCEPTION(EX_Invalid); + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + return; + } +#endif /* PARANOID */ + + exponent = exponent(st0_ptr); + + accumulator.lsw = accumulator.midw = accumulator.msw = 0; + + if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) ) + { + /* arg is < 0.687705 */ + + argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &significand(st0_ptr)); + + if ( exponent < -1 ) + { + /* shift the argument right by the required places */ + shr_Xsig(&argSqrd, 2*(-1-exponent)); + } + + argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, + N_COEFF_NH-1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, + N_COEFF_PH-1); + negate_Xsig(&accumulator); + + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); + shr_Xsig(&accumulator, -2*(1+exponent)); + + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); + + add_Xsig_Xsig(&accumulator, &argSqrd); + + shr_Xsig(&accumulator, 1); + + /* It doesn't matter if accumulator is all zero here, the + following code will work ok */ + negate_Xsig(&accumulator); + + if ( accumulator.lsw & 0x80000000 ) + XSIG_LL(accumulator) ++; + if ( accumulator.msw == 0 ) + { + /* The result is 1.0 */ + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + return; + } + else + { + significand(&result) = XSIG_LL(accumulator); + + /* will be a valid positive nr with expon = -1 */ + setexponentpos(&result, -1); + } + } + else + { + fixed_arg =
significand(st0_ptr); + + if ( exponent == 0 ) + { + /* The argument is >= 1.0 */ + + /* Put the binary point at the left. */ + fixed_arg <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + fixed_arg = 0x921fb54442d18469LL - fixed_arg; + /* There is a special case which arises due to rounding, to fix here. */ + if ( fixed_arg == 0xffffffffffffffffLL ) + fixed_arg = 0; + + exponent = -1; + exp2 = -1; + + /* A shift is needed here only for a narrow range of arguments, + i.e. for fixed_arg approx 2^-32, but we pick up more... */ + if ( !(LL_MSW(fixed_arg) & 0xffff0000) ) + { + fixed_arg <<= 16; + exponent -= 16; + exp2 -= 16; + } + + XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &fixed_arg); + + if ( exponent < -1 ) + { + /* shift the argument right by the required places */ + shr_Xsig(&argSqrd, 2*(-1-exponent)); + } + + argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, + N_COEFF_N-1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, + N_COEFF_P-1); + + shr_Xsig(&accumulator, 2); /* Divide by four */ + accumulator.msw |= 0x80000000; /* Add 1.0 */ + + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); + + /* Divide by four, FPU_REG compatible, etc */ + exponent = 3*exponent; + + /* The minimum exponent difference is 3 */ + shr_Xsig(&accumulator, exp2 - exponent); + + negate_Xsig(&accumulator); + XSIG_LL(accumulator) += fixed_arg; + + /* The basic computation is complete.
Now fix the answer to + compensate for the error due to the approximation used for + pi/2 + */ + + /* This has an exponent of -65 */ + XSIG_LL(fix_up) = 0x898cc51701b839a2ll; + fix_up.lsw = 0; + + /* The fix-up needs to be improved for larger args */ + if ( argSqrd.msw & 0xffc00000 ) + { + /* Get about 32 bit precision in these: */ + fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2; + fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24; + } + + exp2 += norm_Xsig(&accumulator); + shr_Xsig(&accumulator, 1); /* Prevent overflow */ + exp2++; + shr_Xsig(&fix_up, 65 + exp2); + + add_Xsig_Xsig(&accumulator, &fix_up); + + echange = round_Xsig(&accumulator); + + setexponentpos(&result, exp2 + echange); + significand(&result) = XSIG_LL(accumulator); + } + + FPU_copy_to_reg0(&result, TAG_Valid); + +#ifdef PARANOID + if ( (exponent(&result) >= 0) + && (significand(&result) > 0x8000000000000000LL) ) + { + EXCEPTION(EX_INTERNAL|0x151); + } +#endif /* PARANOID */ + +} diff --git a/arch/i386/math-emu/poly_tan.c b/arch/i386/math-emu/poly_tan.c new file mode 100644 index 000000000000..8df3e03b6e6f --- /dev/null +++ b/arch/i386/math-emu/poly_tan.c @@ -0,0 +1,222 @@ +/*---------------------------------------------------------------------------+ + | poly_tan.c | + | | + | Compute the tan of a FPU_REG, using a polynomial approximation. | + | | + | Copyright (C) 1992,1993,1994,1997,1999 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia.
E-mail billm@melbpc.org.au | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "reg_constant.h" +#include "fpu_emu.h" +#include "fpu_system.h" +#include "control_w.h" +#include "poly.h" + + +#define HiPOWERop 3 /* odd poly, positive terms */ +static const unsigned long long oddplterm[HiPOWERop] = +{ + 0x0000000000000000LL, + 0x0051a1cf08fca228LL, + 0x0000000071284ff7LL +}; + +#define HiPOWERon 2 /* odd poly, negative terms */ +static const unsigned long long oddnegterm[HiPOWERon] = +{ + 0x1291a9a184244e80LL, + 0x0000583245819c21LL +}; + +#define HiPOWERep 2 /* even poly, positive terms */ +static const unsigned long long evenplterm[HiPOWERep] = +{ + 0x0e848884b539e888LL, + 0x00003c7f18b887daLL +}; + +#define HiPOWERen 2 /* even poly, negative terms */ +static const unsigned long long evennegterm[HiPOWERen] = +{ + 0xf1f0200fd51569ccLL, + 0x003afb46105c4432LL +}; + +static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL; + + +/*--- poly_tan() ------------------------------------------------------------+ + | | + +---------------------------------------------------------------------------*/ +void poly_tan(FPU_REG *st0_ptr) +{ + long int exponent; + int invert; + Xsig argSq, argSqSq, accumulatoro, accumulatore, accum, + argSignif, fix_up; + unsigned long adj; + + exponent = exponent(st0_ptr); + +#ifdef PARANOID + if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */ + { arith_invalid(0); return; } /* Need a positive number */ +#endif /* PARANOID */ + + /* Split the problem into two domains, smaller and larger than pi/4 */ + if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) ) + { + /* The argument is greater than (approx) pi/4 */ + invert = 1; + accum.lsw = 0; + XSIG_LL(accum) = significand(st0_ptr); + + if ( exponent == 0 ) + { + /* The argument is >= 1.0 */ + /* Put the binary point at the left. 
*/ + XSIG_LL(accum) <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum); + /* This is a special case which arises due to rounding. */ + if ( XSIG_LL(accum) == 0xffffffffffffffffLL ) + { + FPU_settag0(TAG_Valid); + significand(st0_ptr) = 0x8a51e04daabda360LL; + setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative); + return; + } + + argSignif.lsw = accum.lsw; + XSIG_LL(argSignif) = XSIG_LL(accum); + exponent = -1 + norm_Xsig(&argSignif); + } + else + { + invert = 0; + argSignif.lsw = 0; + XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr); + + if ( exponent < -1 ) + { + /* shift the argument right by the required places */ + if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U ) + XSIG_LL(accum) ++; /* round up */ + } + } + + XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw; + mul_Xsig_Xsig(&argSq, &argSq); + XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw; + mul_Xsig_Xsig(&argSqSq, &argSqSq); + + /* Compute the negative terms for the numerator polynomial */ + accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0; + polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1); + mul_Xsig_Xsig(&accumulatoro, &argSq); + negate_Xsig(&accumulatoro); + /* Add the positive terms */ + polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1); + + + /* Compute the positive terms for the denominator polynomial */ + accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0; + polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1); + mul_Xsig_Xsig(&accumulatore, &argSq); + negate_Xsig(&accumulatore); + /* Add the negative terms */ + polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1); + /* Multiply by arg^2 */ + mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); + mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); + /* de-normalize and divide by 2 */ + shr_Xsig(&accumulatore, 
-2*(1+exponent) + 1); + negate_Xsig(&accumulatore); /* This does 1 - accumulator */ + + /* Now find the ratio. */ + if ( accumulatore.msw == 0 ) + { + /* accumulatoro must contain 1.0 here, (actually, 0) but it + really doesn't matter what value we use because it will + have negligible effect in later calculations + */ + XSIG_LL(accum) = 0x8000000000000000LL; + accum.lsw = 0; + } + else + { + div_Xsig(&accumulatoro, &accumulatore, &accum); + } + + /* Multiply by 1/3 * arg^3 */ + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &twothirds); + shr_Xsig(&accum, -2*(exponent+1)); + + /* tan(arg) = arg + accum */ + add_two_Xsig(&accum, &argSignif, &exponent); + + if ( invert ) + { + /* We now have the value of tan(pi_2 - arg) where pi_2 is an + approximation for pi/2 + */ + /* The next step is to fix the answer to compensate for the + error due to the approximation used for pi/2 + */ + + /* This is (approx) delta, the error in our approx for pi/2 + (see above). It has an exponent of -65 + */ + XSIG_LL(fix_up) = 0x898cc51701b839a2LL; + fix_up.lsw = 0; + + if ( exponent == 0 ) + adj = 0xffffffff; /* We want approx 1.0 here, but + this is close enough. */ + else if ( exponent > -30 ) + { + adj = accum.msw >> -(exponent+1); /* tan */ + adj = mul_32_32(adj, adj); /* tan^2 */ + } + else + adj = 0; + adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */ + + fix_up.msw += adj; + if ( !(fix_up.msw & 0x80000000) ) /* did fix_up overflow ? */ + { + /* Yes, we need to add an msb */ + shr_Xsig(&fix_up, 1); + fix_up.msw |= 0x80000000; + shr_Xsig(&fix_up, 64 + exponent); + } + else + shr_Xsig(&fix_up, 65 + exponent); + + add_two_Xsig(&accum, &fix_up, &exponent); + + /* accum now contains tan(pi/2 - arg). 
+ Use tan(arg) = 1.0 / tan(pi/2 - arg) + */ + accumulatoro.lsw = accumulatoro.midw = 0; + accumulatoro.msw = 0x80000000; + div_Xsig(&accumulatoro, &accum, &accum); + exponent = - exponent - 1; + } + + /* Transfer the result */ + round_Xsig(&accum); + FPU_settag0(TAG_Valid); + significand(st0_ptr) = XSIG_LL(accum); + setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */ + +} diff --git a/arch/i386/math-emu/polynom_Xsig.S b/arch/i386/math-emu/polynom_Xsig.S new file mode 100644 index 000000000000..17315c89ff3d --- /dev/null +++ b/arch/i386/math-emu/polynom_Xsig.S @@ -0,0 +1,135 @@ +/*---------------------------------------------------------------------------+ + | polynomial_Xsig.S | + | | + | Fixed point arithmetic polynomial evaluation. | + | | + | Copyright (C) 1992,1993,1994,1995 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@jacobi.maths.monash.edu.au | + | | + | Call from C as: | + | void polynomial_Xsig(Xsig *accum, unsigned long long x, | + | unsigned long long terms[], int n) | + | | + | Computes: | + | terms[0] + (terms[1] + (terms[2] + ... + (terms[n-1]*x)*x)*x)*x) ... )*x | + | and adds the result to the 12 byte Xsig. | + | The terms[] are each 8 bytes, but all computation is performed to 12 byte | + | precision. | + | | + | This function must be used carefully: most overflow of intermediate | + | results is controlled, but overflow of the result is not. 
| + | | + +---------------------------------------------------------------------------*/ + .file "polynomial_Xsig.S" + +#include "fpu_emu.h" + + +#define TERM_SIZE $8 +#define SUM_MS -20(%ebp) /* sum ms long */ +#define SUM_MIDDLE -24(%ebp) /* sum middle long */ +#define SUM_LS -28(%ebp) /* sum ls long */ +#define ACCUM_MS -4(%ebp) /* accum ms long */ +#define ACCUM_MIDDLE -8(%ebp) /* accum middle long */ +#define ACCUM_LS -12(%ebp) /* accum ls long */ +#define OVERFLOWED -16(%ebp) /* addition overflow flag */ + +.text +ENTRY(polynomial_Xsig) + pushl %ebp + movl %esp,%ebp + subl $32,%esp + pushl %esi + pushl %edi + pushl %ebx + + movl PARAM2,%esi /* x */ + movl PARAM3,%edi /* terms */ + + movl TERM_SIZE,%eax + mull PARAM4 /* n */ + addl %eax,%edi + + movl 4(%edi),%edx /* terms[n] */ + movl %edx,SUM_MS + movl (%edi),%edx /* terms[n] */ + movl %edx,SUM_MIDDLE + xor %eax,%eax + movl %eax,SUM_LS + movb %al,OVERFLOWED + + subl TERM_SIZE,%edi + decl PARAM4 + js L_accum_done + +L_accum_loop: + xor %eax,%eax + movl %eax,ACCUM_MS + movl %eax,ACCUM_MIDDLE + + movl SUM_MIDDLE,%eax + mull (%esi) /* x ls long */ + movl %edx,ACCUM_LS + + movl SUM_MIDDLE,%eax + mull 4(%esi) /* x ms long */ + addl %eax,ACCUM_LS + adcl %edx,ACCUM_MIDDLE + adcl $0,ACCUM_MS + + movl SUM_MS,%eax + mull (%esi) /* x ls long */ + addl %eax,ACCUM_LS + adcl %edx,ACCUM_MIDDLE + adcl $0,ACCUM_MS + + movl SUM_MS,%eax + mull 4(%esi) /* x ms long */ + addl %eax,ACCUM_MIDDLE + adcl %edx,ACCUM_MS + + testb $0xff,OVERFLOWED + jz L_no_overflow + + movl (%esi),%eax + addl %eax,ACCUM_MIDDLE + movl 4(%esi),%eax + adcl %eax,ACCUM_MS /* This could overflow too */ + +L_no_overflow: + +/* + * Now put the sum of next term and the accumulator + * into the sum register + */ + movl ACCUM_LS,%eax + addl (%edi),%eax /* term ls long */ + movl %eax,SUM_LS + movl ACCUM_MIDDLE,%eax + adcl (%edi),%eax /* term ls long */ + movl %eax,SUM_MIDDLE + movl ACCUM_MS,%eax + adcl 4(%edi),%eax /* term ms long */ + movl %eax,SUM_MS + sbbb 
%al,%al + movb %al,OVERFLOWED /* Used in the next iteration */ + + subl TERM_SIZE,%edi + decl PARAM4 + jns L_accum_loop + +L_accum_done: + movl PARAM1,%edi /* accum */ + movl SUM_LS,%eax + addl %eax,(%edi) + movl SUM_MIDDLE,%eax + adcl %eax,4(%edi) + movl SUM_MS,%eax + adcl %eax,8(%edi) + + popl %ebx + popl %edi + popl %esi + leave + ret diff --git a/arch/i386/math-emu/reg_add_sub.c b/arch/i386/math-emu/reg_add_sub.c new file mode 100644 index 000000000000..7cd3b37ac084 --- /dev/null +++ b/arch/i386/math-emu/reg_add_sub.c @@ -0,0 +1,374 @@ +/*---------------------------------------------------------------------------+ + | reg_add_sub.c | + | | + | Functions to add or subtract two registers and put the result in a third. | + | | + | Copyright (C) 1992,1993,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | For each function, the destination may be any FPU_REG, including one of | + | the source FPU_REGs. | + | Each function returns 0 if the answer is o.k., otherwise a non-zero | + | value is returned, indicating either an exception condition or an | + | internal error. | + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "reg_constant.h" +#include "fpu_emu.h" +#include "control_w.h" +#include "fpu_system.h" + +static +int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, + FPU_REG const *b, u_char tagb, u_char signb, + FPU_REG *dest, int deststnr, int control_w); + +/* + Operates on st(0) and st(n), or on st(0) and temporary data. + The destination must be one of the source st(x). 
+ */ +int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w) +{ + FPU_REG *a = &st(0); + FPU_REG *dest = &st(deststnr); + u_char signb = getsign(b); + u_char taga = FPU_gettag0(); + u_char signa = getsign(a); + u_char saved_sign = getsign(dest); + int diff, tag, expa, expb; + + if ( !(taga | tagb) ) + { + expa = exponent(a); + expb = exponent(b); + + valid_add: + /* Both registers are valid */ + if (!(signa ^ signb)) + { + /* signs are the same */ + tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb); + } + else + { + /* The signs are different, so do a subtraction */ + diff = expa - expb; + if (!diff) + { + diff = a->sigh - b->sigh; /* This works only if the ms bits + are identical. */ + if (!diff) + { + diff = a->sigl > b->sigl; + if (!diff) + diff = -(a->sigl < b->sigl); + } + } + + if (diff > 0) + { + tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); + } + else if ( diff < 0 ) + { + tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa); + } + else + { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + /* sign depends upon rounding mode */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? 
SIGN_POS : SIGN_NEG); + return TAG_Zero; + } + } + + if ( tag < 0 ) + { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; + } + + if ( taga == TAG_Special ) + taga = FPU_Special(a); + if ( tagb == TAG_Special ) + tagb = FPU_Special(b); + + if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + || ((taga == TW_Denormal) && (tagb == TAG_Valid)) + || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) + { + FPU_REG x, y; + + if ( denormal_operand() < 0 ) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + a = &x; + b = &y; + expa = exponent16(a); + expb = exponent16(b); + goto valid_add; + } + + if ( (taga == TW_NaN) || (tagb == TW_NaN) ) + { + if ( deststnr == 0 ) + return real_2op_NaN(b, tagb, deststnr, a); + else + return real_2op_NaN(a, taga, deststnr, a); + } + + return add_sub_specials(a, taga, signa, b, tagb, signb, + dest, deststnr, control_w); +} + + +/* Subtract b from a. (a-b) -> dest */ +int FPU_sub(int flags, int rm, int control_w) +{ + FPU_REG const *a, *b; + FPU_REG *dest; + u_char taga, tagb, signa, signb, saved_sign, sign; + int diff, tag = 0, expa, expb, deststnr; + + a = &st(0); + taga = FPU_gettag0(); + + deststnr = 0; + if ( flags & LOADED ) + { + b = (FPU_REG *)rm; + tagb = flags & 0x0f; + } + else + { + b = &st(rm); + tagb = FPU_gettagi(rm); + + if ( flags & DEST_RM ) + deststnr = rm; + } + + signa = getsign(a); + signb = getsign(b); + + if ( flags & REV ) + { + signa ^= SIGN_NEG; + signb ^= SIGN_NEG; + } + + dest = &st(deststnr); + saved_sign = getsign(dest); + + if ( !(taga | tagb) ) + { + expa = exponent(a); + expb = exponent(b); + + valid_subtract: + /* Both registers are valid */ + + diff = expa - expb; + + if (!diff) + { + diff = a->sigh - b->sigh; /* Works only if ms bits are identical */ + if (!diff) + { + diff = a->sigl > b->sigl; + if (!diff) + diff = -(a->sigl < b->sigl); + } + } + + switch ( (((int)signa)*2 + signb) / SIGN_NEG ) + { + case 0: /* P - P */ + case 3: /* N - N 
*/ + if (diff > 0) + { + /* |a| > |b| */ + tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); + } + else if ( diff == 0 ) + { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + + /* sign depends upon rounding mode */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + return TAG_Zero; + } + else + { + sign = signa ^ SIGN_NEG; + tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa); + } + break; + case 1: /* P - N */ + tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb); + break; + case 2: /* N - P */ + tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb); + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL|0x111); + return -1; +#endif + } + if ( tag < 0 ) + { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; + } + + if ( taga == TAG_Special ) + taga = FPU_Special(a); + if ( tagb == TAG_Special ) + tagb = FPU_Special(b); + + if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + || ((taga == TW_Denormal) && (tagb == TAG_Valid)) + || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) + { + FPU_REG x, y; + + if ( denormal_operand() < 0 ) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + a = &x; + b = &y; + expa = exponent16(a); + expb = exponent16(b); + + goto valid_subtract; + } + + if ( (taga == TW_NaN) || (tagb == TW_NaN) ) + { + FPU_REG const *d1, *d2; + if ( flags & REV ) + { + d1 = b; + d2 = a; + } + else + { + d1 = a; + d2 = b; + } + if ( flags & LOADED ) + return real_2op_NaN(b, tagb, deststnr, d1); + if ( flags & DEST_RM ) + return real_2op_NaN(a, taga, deststnr, d2); + else + return real_2op_NaN(b, tagb, deststnr, d2); + } + + return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG, + dest, deststnr, control_w); +} + + +static +int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, + FPU_REG const *b, u_char tagb, u_char signb, + FPU_REG *dest, int deststnr, int control_w) +{ + if ( ((taga == TW_Denormal) || 
(tagb == TW_Denormal)) + && (denormal_operand() < 0) ) + return FPU_Exception; + + if (taga == TAG_Zero) + { + if (tagb == TAG_Zero) + { + /* Both are zero, result will be zero. */ + u_char different_signs = signa ^ signb; + + FPU_copy_to_regi(a, TAG_Zero, deststnr); + if ( different_signs ) + { + /* Signs are different. */ + /* Sign of answer depends upon rounding mode. */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + } + else + setsign(dest, signa); /* signa may differ from the sign of a. */ + return TAG_Zero; + } + else + { + reg_copy(b, dest); + if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) ) + { + /* A pseudoDenormal, convert it. */ + addexponent(dest, 1); + tagb = TAG_Valid; + } + else if ( tagb > TAG_Empty ) + tagb = TAG_Special; + setsign(dest, signb); /* signb may differ from the sign of b. */ + FPU_settagi(deststnr, tagb); + return tagb; + } + } + else if (tagb == TAG_Zero) + { + reg_copy(a, dest); + if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) ) + { + /* A pseudoDenormal */ + addexponent(dest, 1); + taga = TAG_Valid; + } + else if ( taga > TAG_Empty ) + taga = TAG_Special; + setsign(dest, signa); /* signa may differ from the sign of a. */ + FPU_settagi(deststnr, taga); + return taga; + } + else if (taga == TW_Infinity) + { + if ( (tagb != TW_Infinity) || (signa == signb) ) + { + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, signa); /* signa may differ from the sign of a. */ + return taga; + } + /* Infinity-Infinity is undefined. */ + return arith_invalid(deststnr); + } + else if (tagb == TW_Infinity) + { + FPU_copy_to_regi(b, TAG_Special, deststnr); + setsign(dest, signb); /* signb may differ from the sign of b. 
*/ + return tagb; + } + +#ifdef PARANOID + EXCEPTION(EX_INTERNAL|0x101); +#endif + + return FPU_Exception; +} + diff --git a/arch/i386/math-emu/reg_compare.c b/arch/i386/math-emu/reg_compare.c new file mode 100644 index 000000000000..f37c5b5a35ad --- /dev/null +++ b/arch/i386/math-emu/reg_compare.c @@ -0,0 +1,381 @@ +/*---------------------------------------------------------------------------+ + | reg_compare.c | + | | + | Compare two floating point registers | + | | + | Copyright (C) 1992,1993,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | compare() is the core FPU_REG comparison function | + +---------------------------------------------------------------------------*/ + +#include "fpu_system.h" +#include "exception.h" +#include "fpu_emu.h" +#include "control_w.h" +#include "status_w.h" + + +static int compare(FPU_REG const *b, int tagb) +{ + int diff, exp0, expb; + u_char st0_tag; + FPU_REG *st0_ptr; + FPU_REG x, y; + u_char st0_sign, signb = getsign(b); + + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + st0_sign = getsign(st0_ptr); + + if ( tagb == TAG_Special ) + tagb = FPU_Special(b); + if ( st0_tag == TAG_Special ) + st0_tag = FPU_Special(st0_ptr); + + if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal)) + || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) ) + { + if ( st0_tag == TAG_Zero ) + { + if ( tagb == TAG_Zero ) return COMP_A_eq_B; + if ( tagb == TAG_Valid ) + return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); + if ( tagb == TW_Denormal ) + return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | COMP_Denormal; + } + else if ( tagb == TAG_Zero ) + { + if ( st0_tag == TAG_Valid ) + return ((st0_sign == SIGN_POS) ? 
COMP_A_gt_B : COMP_A_lt_B); + if ( st0_tag == TW_Denormal ) + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | COMP_Denormal; + } + + if ( st0_tag == TW_Infinity ) + { + if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) ) + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + else if ( tagb == TW_Denormal ) + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | COMP_Denormal; + else if ( tagb == TW_Infinity ) + { + /* The 80486 book says that infinities can be equal! */ + return (st0_sign == signb) ? COMP_A_eq_B : + ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + } + /* Fall through to the NaN code */ + } + else if ( tagb == TW_Infinity ) + { + if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) ) + return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); + if ( st0_tag == TW_Denormal ) + return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | COMP_Denormal; + /* Fall through to the NaN code */ + } + + /* The only possibility now should be that one of the arguments + is a NaN */ + if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) ) + { + int signalling = 0, unsupported = 0; + if ( st0_tag == TW_NaN ) + { + signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000; + unsupported = !((exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000)); + } + if ( tagb == TW_NaN ) + { + signalling |= (b->sigh & 0xc0000000) == 0x80000000; + unsupported |= !((exponent(b) == EXP_OVER) + && (b->sigh & 0x80000000)); + } + if ( signalling || unsupported ) + return COMP_No_Comp | COMP_SNaN | COMP_NaN; + else + /* Neither is a signaling NaN */ + return COMP_No_Comp | COMP_NaN; + } + + EXCEPTION(EX_Invalid); + } + + if (st0_sign != signb) + { + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? 
+ COMP_Denormal : 0); + } + + if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) ) + { + FPU_to_exp16(st0_ptr, &x); + FPU_to_exp16(b, &y); + st0_ptr = &x; + b = &y; + exp0 = exponent16(st0_ptr); + expb = exponent16(b); + } + else + { + exp0 = exponent(st0_ptr); + expb = exponent(b); + } + +#ifdef PARANOID + if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid); + if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid); +#endif /* PARANOID */ + + diff = exp0 - expb; + if ( diff == 0 ) + { + diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are + identical */ + if ( diff == 0 ) + { + diff = st0_ptr->sigl > b->sigl; + if ( diff == 0 ) + diff = -(st0_ptr->sigl < b->sigl); + } + } + + if ( diff > 0 ) + { + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + } + if ( diff < 0 ) + { + return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + } + + return COMP_A_eq_B + | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? 
+ COMP_Denormal : 0); + +} + + +/* This function requires that st(0) is not empty */ +int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) +{ + int f = 0, c; + + c = compare(loaded_data, loaded_tag); + + if (c & COMP_NaN) + { + EXCEPTION(EX_Invalid); + f = SW_C3 | SW_C2 | SW_C0; + } + else + switch (c & 7) + { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL|0x121); + f = SW_C3 | SW_C2 | SW_C0; + break; +#endif /* PARANOID */ + } + setcc(f); + if (c & COMP_Denormal) + { + return denormal_operand() < 0; + } + return 0; +} + + +static int compare_st_st(int nr) +{ + int f = 0, c; + FPU_REG *st_ptr; + + if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) + { + setcc(SW_C3 | SW_C2 | SW_C0); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) + { + setcc(SW_C3 | SW_C2 | SW_C0); + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + else + switch (c & 7) + { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL|0x122); + f = SW_C3 | SW_C2 | SW_C0; + break; +#endif /* PARANOID */ + } + setcc(f); + if (c & COMP_Denormal) + { + return denormal_operand() < 0; + } + return 0; +} + + +static int compare_u_st_st(int nr) +{ + int f = 0, c; + FPU_REG *st_ptr; + + if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) + { + setcc(SW_C3 | SW_C2 | SW_C0); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) + { + setcc(SW_C3 | SW_C2 | SW_C0); + if (c & COMP_SNaN) /* This is the only 
difference between + un-ordered and ordinary comparisons */ + { + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + return 0; + } + else + switch (c & 7) + { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL|0x123); + f = SW_C3 | SW_C2 | SW_C0; + break; +#endif /* PARANOID */ + } + setcc(f); + if (c & COMP_Denormal) + { + return denormal_operand() < 0; + } + return 0; +} + +/*---------------------------------------------------------------------------*/ + +void fcom_st(void) +{ + /* fcom st(i) */ + compare_st_st(FPU_rm); +} + + +void fcompst(void) +{ + /* fcomp st(i) */ + if ( !compare_st_st(FPU_rm) ) + FPU_pop(); +} + + +void fcompp(void) +{ + /* fcompp */ + if (FPU_rm != 1) + { + FPU_illegal(); + return; + } + if ( !compare_st_st(1) ) + poppop(); +} + + +void fucom_(void) +{ + /* fucom st(i) */ + compare_u_st_st(FPU_rm); + +} + + +void fucomp(void) +{ + /* fucomp st(i) */ + if ( !compare_u_st_st(FPU_rm) ) + FPU_pop(); +} + + +void fucompp(void) +{ + /* fucompp */ + if (FPU_rm == 1) + { + if ( !compare_u_st_st(1) ) + poppop(); + } + else + FPU_illegal(); +} diff --git a/arch/i386/math-emu/reg_constant.c b/arch/i386/math-emu/reg_constant.c new file mode 100644 index 000000000000..a85015801969 --- /dev/null +++ b/arch/i386/math-emu/reg_constant.c @@ -0,0 +1,120 @@ +/*---------------------------------------------------------------------------+ + | reg_constant.c | + | | + | All of the constant FPU_REGs | + | | + | Copyright (C) 1992,1993,1994,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. 
E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "fpu_system.h" +#include "fpu_emu.h" +#include "status_w.h" +#include "reg_constant.h" +#include "control_w.h" + + +#define MAKE_REG(s,e,l,h) { l, h, \ + ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } + +FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); +#if 0 +FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000); +FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000); +#endif /* 0 */ +static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b); +static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29); +FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2); +FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2); +FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2); +static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84); +static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7); + +/* Extra bits to take pi/2 to more than 128 bits precision. 
*/ +FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, + 0xfc8f8cbb, 0xece675d1); + +/* Only the sign (and tag) is used in internal zeroes */ +FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); + +/* Only the sign and significand (and tag) are used in internal NaNs */ +/* The 80486 never generates one of these +FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000); + */ +/* This is the real indefinite QNaN */ +FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000); + +/* Only the sign (and tag) is used in internal infinities */ +FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); + + +static void fld_const(FPU_REG const *c, int adj, u_char tag) +{ + FPU_REG *st_new_ptr; + + if ( STACK_OVERFLOW ) + { + FPU_stack_overflow(); + return; + } + push(); + reg_copy(c, st_new_ptr); + st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to + borrow or carry. */ + FPU_settag0(tag); + clear_C1(); +} + +/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP + (and not one of RC_RND or RC_UP). + */ +#define DOWN_OR_CHOP(x) (x & RC_DOWN) + +static void fld1(int rc) +{ + fld_const(&CONST_1, 0, TAG_Valid); +} + +static void fldl2t(int rc) +{ + fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid); +} + +static void fldl2e(int rc) +{ + fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); +} + +static void fldpi(int rc) +{ + fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); +} + +static void fldlg2(int rc) +{ + fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); +} + +static void fldln2(int rc) +{ + fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? 
-1 : 0, TAG_Valid); +} + +static void fldz(int rc) +{ + fld_const(&CONST_Z, 0, TAG_Zero); +} + +typedef void (*FUNC_RC)(int); + +static FUNC_RC constants_table[] = { + fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal +}; + +void fconst(void) +{ + (constants_table[FPU_rm])(control_word & CW_RC); +} diff --git a/arch/i386/math-emu/reg_constant.h b/arch/i386/math-emu/reg_constant.h new file mode 100644 index 000000000000..1bffaec3a134 --- /dev/null +++ b/arch/i386/math-emu/reg_constant.h @@ -0,0 +1,25 @@ +/*---------------------------------------------------------------------------+ + | reg_constant.h | + | | + | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@vaxc.cc.monash.edu.au | + | | + +---------------------------------------------------------------------------*/ + +#ifndef _REG_CONSTANT_H_ +#define _REG_CONSTANT_H_ + +#include "fpu_emu.h" + +extern FPU_REG const CONST_1; +extern FPU_REG const CONST_PI; +extern FPU_REG const CONST_PI2; +extern FPU_REG const CONST_PI2extra; +extern FPU_REG const CONST_PI4; +extern FPU_REG const CONST_Z; +extern FPU_REG const CONST_PINF; +extern FPU_REG const CONST_INF; +extern FPU_REG const CONST_MINF; +extern FPU_REG const CONST_QNaN; + +#endif /* _REG_CONSTANT_H_ */ diff --git a/arch/i386/math-emu/reg_convert.c b/arch/i386/math-emu/reg_convert.c new file mode 100644 index 000000000000..45a258752703 --- /dev/null +++ b/arch/i386/math-emu/reg_convert.c @@ -0,0 +1,53 @@ +/*---------------------------------------------------------------------------+ + | reg_convert.c | + | | + | Convert register representation. | + | | + | Copyright (C) 1992,1993,1994,1996,1997 | + | W. 
Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "fpu_emu.h" + + +int FPU_to_exp16(FPU_REG const *a, FPU_REG *x) +{ + int sign = getsign(a); + + *(long long *)&(x->sigl) = *(const long long *)&(a->sigl); + + /* Set up the exponent as a 16 bit quantity. */ + setexponent16(x, exponent(a)); + + if ( exponent16(x) == EXP_UNDER ) + { + /* The number is a de-normal or pseudodenormal. */ + /* We only deal with the significand and exponent. */ + + if (x->sigh & 0x80000000) + { + /* Is a pseudodenormal. */ + /* This is non-80486 behaviour because the number + loses its 'denormal' identity. */ + addexponent(x, 1); + } + else + { + /* Is a denormal. */ + addexponent(x, 1); + FPU_normalize_nuo(x); + } + } + + if ( !(x->sigh & 0x80000000) ) + { + EXCEPTION(EX_INTERNAL | 0x180); + } + + return sign; +} + diff --git a/arch/i386/math-emu/reg_divide.c b/arch/i386/math-emu/reg_divide.c new file mode 100644 index 000000000000..5cee7ff920d9 --- /dev/null +++ b/arch/i386/math-emu/reg_divide.c @@ -0,0 +1,207 @@ +/*---------------------------------------------------------------------------+ + | reg_divide.c | + | | + | Divide one FPU_REG by another and put the result in a destination FPU_REG.| + | | + | Copyright (C) 1996 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@jacobi.maths.monash.edu.au | + | | + | Return value is the tag of the answer, or-ed with FPU_Exception if | + | one was raised, or -1 on internal error. | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | The destination may be any FPU_REG, including one of the source FPU_REGs. 
| + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "reg_constant.h" +#include "fpu_emu.h" +#include "fpu_system.h" + +/* + Divide one register by another and put the result into a third register. + */ +int FPU_div(int flags, int rm, int control_w) +{ + FPU_REG x, y; + FPU_REG const *a, *b, *st0_ptr, *st_ptr; + FPU_REG *dest; + u_char taga, tagb, signa, signb, sign, saved_sign; + int tag, deststnr; + + if ( flags & DEST_RM ) + deststnr = rm; + else + deststnr = 0; + + if ( flags & REV ) + { + b = &st(0); + st0_ptr = b; + tagb = FPU_gettag0(); + if ( flags & LOADED ) + { + a = (FPU_REG *)rm; + taga = flags & 0x0f; + } + else + { + a = &st(rm); + st_ptr = a; + taga = FPU_gettagi(rm); + } + } + else + { + a = &st(0); + st0_ptr = a; + taga = FPU_gettag0(); + if ( flags & LOADED ) + { + b = (FPU_REG *)rm; + tagb = flags & 0x0f; + } + else + { + b = &st(rm); + st_ptr = b; + tagb = FPU_gettagi(rm); + } + } + + signa = getsign(a); + signb = getsign(b); + + sign = signa ^ signb; + + dest = &st(deststnr); + saved_sign = getsign(dest); + + if ( !(taga | tagb) ) + { + /* Both regs Valid, this should be the most common case. 
*/ + reg_copy(a, &x); + reg_copy(b, &y); + setpositive(&x); + setpositive(&y); + tag = FPU_u_div(&x, &y, dest, control_w, sign); + + if ( tag < 0 ) + return tag; + + FPU_settagi(deststnr, tag); + return tag; + } + + if ( taga == TAG_Special ) + taga = FPU_Special(a); + if ( tagb == TAG_Special ) + tagb = FPU_Special(b); + + if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + || ((taga == TW_Denormal) && (tagb == TAG_Valid)) + || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) + { + if ( denormal_operand() < 0 ) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + tag = FPU_u_div(&x, &y, dest, control_w, sign); + if ( tag < 0 ) + return tag; + + FPU_settagi(deststnr, tag); + return tag; + } + else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) ) + { + if ( tagb != TAG_Zero ) + { + /* Want to find Zero/Valid */ + if ( tagb == TW_Denormal ) + { + if ( denormal_operand() < 0 ) + return FPU_Exception; + } + + /* The result is zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + setsign(dest, sign); + return TAG_Zero; + } + /* We have an exception condition, either 0/0 or Valid/Zero. */ + if ( taga == TAG_Zero ) + { + /* 0/0 */ + return arith_invalid(deststnr); + } + /* Valid/Zero */ + return FPU_divide_by_zero(deststnr, sign); + } + /* Must have infinities, NaNs, etc */ + else if ( (taga == TW_NaN) || (tagb == TW_NaN) ) + { + if ( flags & LOADED ) + return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr); + + if ( flags & DEST_RM ) + { + int tag; + tag = FPU_gettag0(); + if ( tag == TAG_Special ) + tag = FPU_Special(st0_ptr); + return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm)); + } + else + { + int tag; + tag = FPU_gettagi(rm); + if ( tag == TAG_Special ) + tag = FPU_Special(&st(rm)); + return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? 
st0_ptr : &st(rm)); + } + } + else if (taga == TW_Infinity) + { + if (tagb == TW_Infinity) + { + /* infinity/infinity */ + return arith_invalid(deststnr); + } + else + { + /* tagb must be Valid or Zero */ + if ( (tagb == TW_Denormal) && (denormal_operand() < 0) ) + return FPU_Exception; + + /* Infinity divided by Zero or Valid does + not raise and exception, but returns Infinity */ + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, sign); + return taga; + } + } + else if (tagb == TW_Infinity) + { + if ( (taga == TW_Denormal) && (denormal_operand() < 0) ) + return FPU_Exception; + + /* The result is zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + setsign(dest, sign); + return TAG_Zero; + } +#ifdef PARANOID + else + { + EXCEPTION(EX_INTERNAL|0x102); + return FPU_Exception; + } +#endif /* PARANOID */ + + return 0; +} diff --git a/arch/i386/math-emu/reg_ld_str.c b/arch/i386/math-emu/reg_ld_str.c new file mode 100644 index 000000000000..f06ed41d191d --- /dev/null +++ b/arch/i386/math-emu/reg_ld_str.c @@ -0,0 +1,1370 @@ +/*---------------------------------------------------------------------------+ + | reg_ld_str.c | + | | + | All of the functions which transfer data between user memory and FPU_REGs.| + | | + | Copyright (C) 1992,1993,1994,1996,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Note: | + | The file contains code which accesses user memory. | + | Emulator static data may change when user memory is accessed, due to | + | other processes using the emulator while swapping is in progress. 
| + +---------------------------------------------------------------------------*/ + +#include "fpu_emu.h" + +#include <asm/uaccess.h> + +#include "fpu_system.h" +#include "exception.h" +#include "reg_constant.h" +#include "control_w.h" +#include "status_w.h" + + +#define DOUBLE_Emax 1023 /* largest valid exponent */ +#define DOUBLE_Ebias 1023 +#define DOUBLE_Emin (-1022) /* smallest valid exponent */ + +#define SINGLE_Emax 127 /* largest valid exponent */ +#define SINGLE_Ebias 127 +#define SINGLE_Emin (-126) /* smallest valid exponent */ + + +static u_char normalize_no_excep(FPU_REG *r, int exp, int sign) +{ + u_char tag; + + setexponent16(r, exp); + + tag = FPU_normalize_nuo(r); + stdexp(r); + if ( sign ) + setnegative(r); + + return tag; +} + + +int FPU_tagof(FPU_REG *ptr) +{ + int exp; + + exp = exponent16(ptr) & 0x7fff; + if ( exp == 0 ) + { + if ( !(ptr->sigh | ptr->sigl) ) + { + return TAG_Zero; + } + /* The number is a de-normal or pseudodenormal. */ + return TAG_Special; + } + + if ( exp == 0x7fff ) + { + /* Is an Infinity, a NaN, or an unsupported data type. */ + return TAG_Special; + } + + if ( !(ptr->sigh & 0x80000000) ) + { + /* Unsupported data type. */ + /* Valid numbers have the ms bit set to 1. */ + /* Unnormal. */ + return TAG_Special; + } + + return TAG_Valid; +} + + +/* Get a long double from user memory */ +int FPU_load_extended(long double __user *s, int stnr) +{ + FPU_REG *sti_ptr = &st(stnr); + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 10); + __copy_from_user(sti_ptr, s, 10); + RE_ENTRANT_CHECK_ON; + + return FPU_tagof(sti_ptr); +} + + +/* Get a double from user memory */ +int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data) +{ + int exp, tag, negative; + unsigned m64, l64; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, dfloat, 8); + FPU_get_user(m64, 1 + (unsigned long __user *) dfloat); + FPU_get_user(l64, (unsigned long __user *) dfloat); + RE_ENTRANT_CHECK_ON; + + negative = (m64 & 0x80000000) ? 
SIGN_Negative : SIGN_Positive; + exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias; + m64 &= 0xfffff; + if ( exp > DOUBLE_Emax + EXTENDED_Ebias ) + { + /* Infinity or NaN */ + if ((m64 == 0) && (l64 == 0)) + { + /* +- infinity */ + loaded_data->sigh = 0x80000000; + loaded_data->sigl = 0x00000000; + exp = EXP_Infinity + EXTENDED_Ebias; + tag = TAG_Special; + } + else + { + /* Must be a signaling or quiet NaN */ + exp = EXP_NaN + EXTENDED_Ebias; + loaded_data->sigh = (m64 << 11) | 0x80000000; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; + tag = TAG_Special; /* The calling function must look for NaNs */ + } + } + else if ( exp < DOUBLE_Emin + EXTENDED_Ebias ) + { + /* Zero or de-normal */ + if ((m64 == 0) && (l64 == 0)) + { + /* Zero */ + reg_copy(&CONST_Z, loaded_data); + exp = 0; + tag = TAG_Zero; + } + else + { + /* De-normal */ + loaded_data->sigh = m64 << 11; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; + + return normalize_no_excep(loaded_data, DOUBLE_Emin, negative) + | (denormal_operand() < 0 ? FPU_Exception : 0); + } + } + else + { + loaded_data->sigh = (m64 << 11) | 0x80000000; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; + + tag = TAG_Valid; + } + + setexponent16(loaded_data, exp | negative); + + return tag; +} + + +/* Get a float from user memory */ +int FPU_load_single(float __user *single, FPU_REG *loaded_data) +{ + unsigned m32; + int exp, tag, negative; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, single, 4); + FPU_get_user(m32, (unsigned long __user *) single); + RE_ENTRANT_CHECK_ON; + + negative = (m32 & 0x80000000) ? 
SIGN_Negative : SIGN_Positive; + + if (!(m32 & 0x7fffffff)) + { + /* Zero */ + reg_copy(&CONST_Z, loaded_data); + addexponent(loaded_data, negative); + return TAG_Zero; + } + exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias; + m32 = (m32 & 0x7fffff) << 8; + if ( exp < SINGLE_Emin + EXTENDED_Ebias ) + { + /* De-normals */ + loaded_data->sigh = m32; + loaded_data->sigl = 0; + + return normalize_no_excep(loaded_data, SINGLE_Emin, negative) + | (denormal_operand() < 0 ? FPU_Exception : 0); + } + else if ( exp > SINGLE_Emax + EXTENDED_Ebias ) + { + /* Infinity or NaN */ + if ( m32 == 0 ) + { + /* +- infinity */ + loaded_data->sigh = 0x80000000; + loaded_data->sigl = 0x00000000; + exp = EXP_Infinity + EXTENDED_Ebias; + tag = TAG_Special; + } + else + { + /* Must be a signaling or quiet NaN */ + exp = EXP_NaN + EXTENDED_Ebias; + loaded_data->sigh = m32 | 0x80000000; + loaded_data->sigl = 0; + tag = TAG_Special; /* The calling function must look for NaNs */ + } + } + else + { + loaded_data->sigh = m32 | 0x80000000; + loaded_data->sigl = 0; + tag = TAG_Valid; + } + + setexponent16(loaded_data, exp | negative); /* Set the sign. 
*/ + + return tag; +} + + +/* Get a long long from user memory */ +int FPU_load_int64(long long __user *_s) +{ + long long s; + int sign; + FPU_REG *st0_ptr = &st(0); + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 8); + copy_from_user(&s,_s,8); + RE_ENTRANT_CHECK_ON; + + if (s == 0) + { + reg_copy(&CONST_Z, st0_ptr); + return TAG_Zero; + } + + if (s > 0) + sign = SIGN_Positive; + else + { + s = -s; + sign = SIGN_Negative; + } + + significand(st0_ptr) = s; + + return normalize_no_excep(st0_ptr, 63, sign); +} + + +/* Get a long from user memory */ +int FPU_load_int32(long __user *_s, FPU_REG *loaded_data) +{ + long s; + int negative; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 4); + FPU_get_user(s, _s); + RE_ENTRANT_CHECK_ON; + + if (s == 0) + { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } + + if (s > 0) + negative = SIGN_Positive; + else + { + s = -s; + negative = SIGN_Negative; + } + + loaded_data->sigh = s; + loaded_data->sigl = 0; + + return normalize_no_excep(loaded_data, 31, negative); +} + + +/* Get a short from user memory */ +int FPU_load_int16(short __user *_s, FPU_REG *loaded_data) +{ + int s, negative; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 2); + /* Cast as short to get the sign extended. 
*/ + FPU_get_user(s, _s); + RE_ENTRANT_CHECK_ON; + + if (s == 0) + { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } + + if (s > 0) + negative = SIGN_Positive; + else + { + s = -s; + negative = SIGN_Negative; + } + + loaded_data->sigh = s << 16; + loaded_data->sigl = 0; + + return normalize_no_excep(loaded_data, 15, negative); +} + + +/* Get a packed bcd array from user memory */ +int FPU_load_bcd(u_char __user *s) +{ + FPU_REG *st0_ptr = &st(0); + int pos; + u_char bcd; + long long l=0; + int sign; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 10); + RE_ENTRANT_CHECK_ON; + for ( pos = 8; pos >= 0; pos--) + { + l *= 10; + RE_ENTRANT_CHECK_OFF; + FPU_get_user(bcd, s+pos); + RE_ENTRANT_CHECK_ON; + l += bcd >> 4; + l *= 10; + l += bcd & 0x0f; + } + + RE_ENTRANT_CHECK_OFF; + FPU_get_user(sign, s+9); + sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive; + RE_ENTRANT_CHECK_ON; + + if ( l == 0 ) + { + reg_copy(&CONST_Z, st0_ptr); + addexponent(st0_ptr, sign); /* Set the sign. */ + return TAG_Zero; + } + else + { + significand(st0_ptr) = l; + return normalize_no_excep(st0_ptr, 63, sign); + } +} + +/*===========================================================================*/ + +/* Put a long double into user memory */ +int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d) +{ + /* + The only exception raised by an attempt to store to an + extended format is the Invalid Stack exception, i.e. + attempting to store from an empty register. 
+ */ + + if ( st0_tag != TAG_Empty ) + { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + + FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d); + FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4)); + FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8)); + RE_ENTRANT_CHECK_ON; + + return 1; + } + + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if ( control_word & CW_Invalid ) + { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,10); + FPU_put_user(0, (unsigned long __user *) d); + FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d); + FPU_put_user(0xffff, 4 + (short __user *) d); + RE_ENTRANT_CHECK_ON; + return 1; + } + else + return 0; + +} + + +/* Put a double into user memory */ +int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat) +{ + unsigned long l[2]; + unsigned long increment = 0; /* avoid gcc warnings */ + int precision_loss; + int exp; + FPU_REG tmp; + + if ( st0_tag == TAG_Valid ) + { + reg_copy(st0_ptr, &tmp); + exp = exponent(&tmp); + + if ( exp < DOUBLE_Emin ) /* It may be a denormal */ + { + addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */ + + denormal_arg: + + if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) + { +#ifdef PECULIAR_486 + /* Did it round to a non-denormal ? */ + /* This behaviour might be regarded as peculiar, it appears + that the 80486 rounds to the dest precision, then + converts to decide underflow. 
*/ + if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) && + (st0_ptr->sigl & 0x000007ff)) ) +#endif /* PECULIAR_486 */ + { + EXCEPTION(EX_Underflow); + /* This is a special case: see sec 16.2.5.1 of + the 80486 book */ + if ( !(control_word & CW_Underflow) ) + return 0; + } + EXCEPTION(precision_loss); + if ( !(control_word & CW_Precision) ) + return 0; + } + l[0] = tmp.sigl; + l[1] = tmp.sigh; + } + else + { + if ( tmp.sigl & 0x000007ff ) + { + precision_loss = 1; + switch (control_word & CW_RC) + { + case RC_RND: + /* Rounding can get a little messy.. */ + increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */ + ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */ + break; + case RC_DOWN: /* towards -infinity */ + increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff; + break; + case RC_UP: /* towards +infinity */ + increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0; + break; + case RC_CHOP: + increment = 0; + break; + } + + /* Truncate the mantissa */ + tmp.sigl &= 0xfffff800; + + if ( increment ) + { + if ( tmp.sigl >= 0xfffff800 ) + { + /* the sigl part overflows */ + if ( tmp.sigh == 0xffffffff ) + { + /* The sigh part overflows */ + tmp.sigh = 0x80000000; + exp++; + if (exp >= EXP_OVER) + goto overflow; + } + else + { + tmp.sigh ++; + } + tmp.sigl = 0x00000000; + } + else + { + /* We only need to increment sigl */ + tmp.sigl += 0x00000800; + } + } + } + else + precision_loss = 0; + + l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21); + l[1] = ((tmp.sigh >> 11) & 0xfffff); + + if ( exp > DOUBLE_Emax ) + { + overflow: + EXCEPTION(EX_Overflow); + if ( !(control_word & CW_Overflow) ) + return 0; + set_precision_flag_up(); + if ( !(control_word & CW_Precision) ) + return 0; + + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + /* Overflow to infinity */ + l[0] = 0x00000000; /* Set to */ + l[1] = 0x7ff00000; /* + INF */ + } + else + { + if ( precision_loss ) + { + if ( increment ) + set_precision_flag_up(); + else + set_precision_flag_down(); + } + /* 
Add the exponent */ + l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20); + } + } + } + else if (st0_tag == TAG_Zero) + { + /* Number is zero */ + l[0] = 0; + l[1] = 0; + } + else if ( st0_tag == TAG_Special ) + { + st0_tag = FPU_Special(st0_ptr); + if ( st0_tag == TW_Denormal ) + { + /* A denormal will always underflow. */ +#ifndef PECULIAR_486 + /* An 80486 is supposed to be able to generate + a denormal exception here, but... */ + /* Underflow has priority. */ + if ( control_word & CW_Underflow ) + denormal_operand(); +#endif /* PECULIAR_486 */ + reg_copy(st0_ptr, &tmp); + goto denormal_arg; + } + else if (st0_tag == TW_Infinity) + { + l[0] = 0; + l[1] = 0x7ff00000; + } + else if (st0_tag == TW_NaN) + { + /* Is it really a NaN ? */ + if ( (exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000) ) + { + /* See if we can get a valid NaN from the FPU_REG */ + l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21); + l[1] = ((st0_ptr->sigh >> 11) & 0xfffff); + if ( !(st0_ptr->sigh & 0x40000000) ) + { + /* It is a signalling NaN */ + EXCEPTION(EX_Invalid); + if ( !(control_word & CW_Invalid) ) + return 0; + l[1] |= (0x40000000 >> 11); + } + l[1] |= 0x7ff00000; + } + else + { + /* It is an unsupported data type */ + EXCEPTION(EX_Invalid); + if ( !(control_word & CW_Invalid) ) + return 0; + l[0] = 0; + l[1] = 0xfff80000; + } + } + } + else if ( st0_tag == TAG_Empty ) + { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if ( control_word & CW_Invalid ) + { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,dfloat,8); + FPU_put_user(0, (unsigned long __user *) dfloat); + FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat); + RE_ENTRANT_CHECK_ON; + return 1; + } + else + return 0; + } + if ( getsign(st0_ptr) ) + l[1] |= 0x80000000; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,dfloat,8); + FPU_put_user(l[0], (unsigned long __user *)dfloat); + FPU_put_user(l[1], 1 + 
(unsigned long __user *)dfloat); + RE_ENTRANT_CHECK_ON; + + return 1; +} + + +/* Put a float into user memory */ +int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single) +{ + long templ = 0; + unsigned long increment = 0; /* avoid gcc warnings */ + int precision_loss; + int exp; + FPU_REG tmp; + + if ( st0_tag == TAG_Valid ) + { + + reg_copy(st0_ptr, &tmp); + exp = exponent(&tmp); + + if ( exp < SINGLE_Emin ) + { + addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */ + + denormal_arg: + + if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) + { +#ifdef PECULIAR_486 + /* Did it round to a non-denormal ? */ + /* This behaviour might be regarded as peculiar, it appears + that the 80486 rounds to the dest precision, then + converts to decide underflow. */ + if ( !((tmp.sigl == 0x00800000) && + ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) ) +#endif /* PECULIAR_486 */ + { + EXCEPTION(EX_Underflow); + /* This is a special case: see sec 16.2.5.1 of + the 80486 book */ + if ( !(control_word & CW_Underflow) ) + return 0; + } + EXCEPTION(precision_loss); + if ( !(control_word & CW_Precision) ) + return 0; + } + templ = tmp.sigl; + } + else + { + if ( tmp.sigl | (tmp.sigh & 0x000000ff) ) + { + unsigned long sigh = tmp.sigh; + unsigned long sigl = tmp.sigl; + + precision_loss = 1; + switch (control_word & CW_RC) + { + case RC_RND: + increment = ((sigh & 0xff) > 0x80) /* more than half */ + || (((sigh & 0xff) == 0x80) && sigl) /* more than half */ + || ((sigh & 0x180) == 0x180); /* round to even */ + break; + case RC_DOWN: /* towards -infinity */ + increment = signpositive(&tmp) + ? 0 : (sigl | (sigh & 0xff)); + break; + case RC_UP: /* towards +infinity */ + increment = signpositive(&tmp) + ? 
(sigl | (sigh & 0xff)) : 0; + break; + case RC_CHOP: + increment = 0; + break; + } + + /* Truncate part of the mantissa */ + tmp.sigl = 0; + + if (increment) + { + if ( sigh >= 0xffffff00 ) + { + /* The sigh part overflows */ + tmp.sigh = 0x80000000; + exp++; + if ( exp >= EXP_OVER ) + goto overflow; + } + else + { + tmp.sigh &= 0xffffff00; + tmp.sigh += 0x100; + } + } + else + { + tmp.sigh &= 0xffffff00; /* Finish the truncation */ + } + } + else + precision_loss = 0; + + templ = (tmp.sigh >> 8) & 0x007fffff; + + if ( exp > SINGLE_Emax ) + { + overflow: + EXCEPTION(EX_Overflow); + if ( !(control_word & CW_Overflow) ) + return 0; + set_precision_flag_up(); + if ( !(control_word & CW_Precision) ) + return 0; + + /* This is a special case: see sec 16.2.5.1 of the 80486 book. */ + /* Masked response is overflow to infinity. */ + templ = 0x7f800000; + } + else + { + if ( precision_loss ) + { + if ( increment ) + set_precision_flag_up(); + else + set_precision_flag_down(); + } + /* Add the exponent */ + templ |= ((exp+SINGLE_Ebias) & 0xff) << 23; + } + } + } + else if (st0_tag == TAG_Zero) + { + templ = 0; + } + else if ( st0_tag == TAG_Special ) + { + st0_tag = FPU_Special(st0_ptr); + if (st0_tag == TW_Denormal) + { + reg_copy(st0_ptr, &tmp); + + /* A denormal will always underflow. */ +#ifndef PECULIAR_486 + /* An 80486 is supposed to be able to generate + a denormal exception here, but... */ + /* Underflow has priority. */ + if ( control_word & CW_Underflow ) + denormal_operand(); +#endif /* PECULIAR_486 */ + goto denormal_arg; + } + else if (st0_tag == TW_Infinity) + { + templ = 0x7f800000; + } + else if (st0_tag == TW_NaN) + { + /* Is it really a NaN ? 
*/ + if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) ) + { + /* See if we can get a valid NaN from the FPU_REG */ + templ = st0_ptr->sigh >> 8; + if ( !(st0_ptr->sigh & 0x40000000) ) + { + /* It is a signalling NaN */ + EXCEPTION(EX_Invalid); + if ( !(control_word & CW_Invalid) ) + return 0; + templ |= (0x40000000 >> 8); + } + templ |= 0x7f800000; + } + else + { + /* It is an unsupported data type */ + EXCEPTION(EX_Invalid); + if ( !(control_word & CW_Invalid) ) + return 0; + templ = 0xffc00000; + } + } +#ifdef PARANOID + else + { + EXCEPTION(EX_INTERNAL|0x164); + return 0; + } +#endif + } + else if ( st0_tag == TAG_Empty ) + { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if ( control_word & EX_Invalid ) + { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,single,4); + FPU_put_user(0xffc00000, (unsigned long __user *) single); + RE_ENTRANT_CHECK_ON; + return 1; + } + else + return 0; + } +#ifdef PARANOID + else + { + EXCEPTION(EX_INTERNAL|0x163); + return 0; + } +#endif + if ( getsign(st0_ptr) ) + templ |= 0x80000000; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,single,4); + FPU_put_user(templ,(unsigned long __user *) single); + RE_ENTRANT_CHECK_ON; + + return 1; +} + + +/* Put a long long into user memory */ +int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d) +{ + FPU_REG t; + long long tll; + int precision_loss; + + if ( st0_tag == TAG_Empty ) + { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } + else if ( st0_tag == TAG_Special ) + { + st0_tag = FPU_Special(st0_ptr); + if ( (st0_tag == TW_Infinity) || + (st0_tag == TW_NaN) ) + { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } + } + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + ((long *)&tll)[0] = t.sigl; + ((long *)&tll)[1] = t.sigh; + if ( (precision_loss == 1) || + ((t.sigh & 
0x80000000) && + !((t.sigh == 0x80000000) && (t.sigl == 0) && + signnegative(&t))) ) + { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if ( control_word & EX_Invalid ) + { + /* Produce something like QNaN "indefinite" */ + tll = 0x8000000000000000LL; + } + else + return 0; + } + else + { + if ( precision_loss ) + set_precision_flag(precision_loss); + if ( signnegative(&t) ) + tll = - tll; + } + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,8); + copy_to_user(d, &tll, 8); + RE_ENTRANT_CHECK_ON; + + return 1; +} + + +/* Put a long into user memory */ +int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d) +{ + FPU_REG t; + int precision_loss; + + if ( st0_tag == TAG_Empty ) + { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } + else if ( st0_tag == TAG_Special ) + { + st0_tag = FPU_Special(st0_ptr); + if ( (st0_tag == TW_Infinity) || + (st0_tag == TW_NaN) ) + { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } + } + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + if (t.sigh || + ((t.sigl & 0x80000000) && + !((t.sigl == 0x80000000) && signnegative(&t))) ) + { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if ( control_word & EX_Invalid ) + { + /* Produce something like QNaN "indefinite" */ + t.sigl = 0x80000000; + } + else + return 0; + } + else + { + if ( precision_loss ) + set_precision_flag(precision_loss); + if ( signnegative(&t) ) + t.sigl = -(long)t.sigl; + } + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,4); + FPU_put_user(t.sigl, (unsigned long __user *) d); + RE_ENTRANT_CHECK_ON; + + return 1; +} + + +/* Put a short into user memory */ +int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d) +{ + FPU_REG t; + int precision_loss; + + if ( st0_tag == TAG_Empty ) + { + /* Empty register (stack underflow) 
*/ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } + else if ( st0_tag == TAG_Special ) + { + st0_tag = FPU_Special(st0_ptr); + if ( (st0_tag == TW_Infinity) || + (st0_tag == TW_NaN) ) + { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } + } + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + if (t.sigh || + ((t.sigl & 0xffff8000) && + !((t.sigl == 0x8000) && signnegative(&t))) ) + { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if ( control_word & EX_Invalid ) + { + /* Produce something like QNaN "indefinite" */ + t.sigl = 0x8000; + } + else + return 0; + } + else + { + if ( precision_loss ) + set_precision_flag(precision_loss); + if ( signnegative(&t) ) + t.sigl = -t.sigl; + } + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,2); + FPU_put_user((short)t.sigl, d); + RE_ENTRANT_CHECK_ON; + + return 1; +} + + +/* Put a packed bcd array into user memory */ +int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) +{ + FPU_REG t; + unsigned long long ll; + u_char b; + int i, precision_loss; + u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0; + + if ( st0_tag == TAG_Empty ) + { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } + else if ( st0_tag == TAG_Special ) + { + st0_tag = FPU_Special(st0_ptr); + if ( (st0_tag == TW_Infinity) || + (st0_tag == TW_NaN) ) + { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } + } + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + ll = significand(&t); + + /* Check for overflow, by comparing with 999999999999999999 decimal. 
*/ + if ( (t.sigh > 0x0de0b6b3) || + ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) ) + { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if ( control_word & CW_Invalid ) + { + /* Produce the QNaN "indefinite" */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,10); + for ( i = 0; i < 7; i++) + FPU_put_user(0, d+i); /* These bytes "undefined" */ + FPU_put_user(0xc0, d+7); /* This byte "undefined" */ + FPU_put_user(0xff, d+8); + FPU_put_user(0xff, d+9); + RE_ENTRANT_CHECK_ON; + return 1; + } + else + return 0; + } + else if ( precision_loss ) + { + /* Precision loss doesn't stop the data transfer */ + set_precision_flag(precision_loss); + } + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,10); + RE_ENTRANT_CHECK_ON; + for ( i = 0; i < 9; i++) + { + b = FPU_div_small(&ll, 10); + b |= (FPU_div_small(&ll, 10)) << 4; + RE_ENTRANT_CHECK_OFF; + FPU_put_user(b, d+i); + RE_ENTRANT_CHECK_ON; + } + RE_ENTRANT_CHECK_OFF; + FPU_put_user(sign, d+9); + RE_ENTRANT_CHECK_ON; + + return 1; +} + +/*===========================================================================*/ + +/* r gets mangled such that sig is int, sign: + it is NOT normalized */ +/* The return value (in eax) is zero if the result is exact, + if bits are changed due to rounding, truncation, etc, then + a non-zero value is returned */ +/* Overflow is signalled by a non-zero return value (in eax). + In the case of overflow, the returned significand always has the + largest possible value */ +int FPU_round_to_int(FPU_REG *r, u_char tag) +{ + u_char very_big; + unsigned eax; + + if (tag == TAG_Zero) + { + /* Make sure that zero is returned */ + significand(r) = 0; + return 0; /* o.k. 
*/ + } + + if (exponent(r) > 63) + { + r->sigl = r->sigh = ~0; /* The largest representable number */ + return 1; /* overflow */ + } + + eax = FPU_shrxs(&r->sigl, 63 - exponent(r)); + very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */ +#define half_or_more (eax & 0x80000000) +#define frac_part (eax) +#define more_than_half ((eax & 0x80000001) == 0x80000001) + switch (control_word & CW_RC) + { + case RC_RND: + if ( more_than_half /* nearest */ + || (half_or_more && (r->sigl & 1)) ) /* odd -> even */ + { + if ( very_big ) return 1; /* overflow */ + significand(r) ++; + return PRECISION_LOST_UP; + } + break; + case RC_DOWN: + if (frac_part && getsign(r)) + { + if ( very_big ) return 1; /* overflow */ + significand(r) ++; + return PRECISION_LOST_UP; + } + break; + case RC_UP: + if (frac_part && !getsign(r)) + { + if ( very_big ) return 1; /* overflow */ + significand(r) ++; + return PRECISION_LOST_UP; + } + break; + case RC_CHOP: + break; + } + + return eax ? PRECISION_LOST_DOWN : 0; + +} + +/*===========================================================================*/ + +u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) +{ + unsigned short tag_word = 0; + u_char tag; + int i; + + if ( (addr_modes.default_mode == VM86) || + ((addr_modes.default_mode == PM16) + ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) + { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 0x0e); + FPU_get_user(control_word, (unsigned short __user *) s); + FPU_get_user(partial_status, (unsigned short __user *) (s+2)); + FPU_get_user(tag_word, (unsigned short __user *) (s+4)); + FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6)); + FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8)); + FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a)); + FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c)); + RE_ENTRANT_CHECK_ON; + s += 0x0e; + if ( addr_modes.default_mode == 
VM86 ) + { + instruction_address.offset + += (instruction_address.selector & 0xf000) << 4; + operand_address.offset += (operand_address.selector & 0xf000) << 4; + } + } + else + { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 0x1c); + FPU_get_user(control_word, (unsigned short __user *) s); + FPU_get_user(partial_status, (unsigned short __user *) (s+4)); + FPU_get_user(tag_word, (unsigned short __user *) (s+8)); + FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c)); + FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10)); + FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12)); + FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14)); + FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18)); + RE_ENTRANT_CHECK_ON; + s += 0x1c; + } + +#ifdef PECULIAR_486 + control_word &= ~0xe080; +#endif /* PECULIAR_486 */ + + top = (partial_status >> SW_Top_Shift) & 7; + + if ( partial_status & ~control_word & CW_Exceptions ) + partial_status |= (SW_Summary | SW_Backward); + else + partial_status &= ~(SW_Summary | SW_Backward); + + for ( i = 0; i < 8; i++ ) + { + tag = tag_word & 3; + tag_word >>= 2; + + if ( tag == TAG_Empty ) + /* New tag is empty. Accept it */ + FPU_settag(i, TAG_Empty); + else if ( FPU_gettag(i) == TAG_Empty ) + { + /* Old tag is empty and new tag is not empty. New tag is determined + by old reg contents */ + if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias ) + { + if ( !(fpu_register(i).sigl | fpu_register(i).sigh) ) + FPU_settag(i, TAG_Zero); + else + FPU_settag(i, TAG_Special); + } + else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias ) + { + FPU_settag(i, TAG_Special); + } + else if ( fpu_register(i).sigh & 0x80000000 ) + FPU_settag(i, TAG_Valid); + else + FPU_settag(i, TAG_Special); /* An Un-normal */ + } + /* Else old tag is not empty and new tag is not empty. 
Old tag + remains correct */ + } + + return s; +} + + +void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) +{ + int i, regnr; + u_char __user *s = fldenv(addr_modes, data_address); + int offset = (top & 7) * 10, other = 80 - offset; + + /* Copy all registers in stack order. */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ,s,80); + __copy_from_user(register_base+offset, s, other); + if ( offset ) + __copy_from_user(register_base, s+other, offset); + RE_ENTRANT_CHECK_ON; + + for ( i = 0; i < 8; i++ ) + { + regnr = (i+top) & 7; + if ( FPU_gettag(regnr) != TAG_Empty ) + /* The loaded data over-rides all other cases. */ + FPU_settag(regnr, FPU_tagof(&st(i))); + } + +} + + +u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) +{ + if ( (addr_modes.default_mode == VM86) || + ((addr_modes.default_mode == PM16) + ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) + { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,14); +#ifdef PECULIAR_486 + FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d); +#else + FPU_put_user(control_word, (unsigned short __user *) d); +#endif /* PECULIAR_486 */ + FPU_put_user(status_word(), (unsigned short __user *) (d+2)); + FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4)); + FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6)); + FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a)); + if ( addr_modes.default_mode == VM86 ) + { + FPU_put_user((instruction_address.offset & 0xf0000) >> 4, + (unsigned short __user *) (d+8)); + FPU_put_user((operand_address.offset & 0xf0000) >> 4, + (unsigned short __user *) (d+0x0c)); + } + else + { + FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8)); + FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c)); + } + RE_ENTRANT_CHECK_ON; + d += 0x0e; + } + else + { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 7*4); +#ifdef PECULIAR_486 + control_word 
&= ~0xe080; + /* An 80486 sets nearly all of the reserved bits to 1. */ + control_word |= 0xffff0040; + partial_status = status_word() | 0xffff0000; + fpu_tag_word |= 0xffff0000; + I387.soft.fcs &= ~0xf8000000; + I387.soft.fos |= 0xffff0000; +#endif /* PECULIAR_486 */ + __copy_to_user(d, &control_word, 7*4); + RE_ENTRANT_CHECK_ON; + d += 0x1c; + } + + control_word |= CW_Exceptions; + partial_status &= ~(SW_Summary | SW_Backward); + + return d; +} + + +void fsave(fpu_addr_modes addr_modes, u_char __user *data_address) +{ + u_char __user *d; + int offset = (top & 7) * 10, other = 80 - offset; + + d = fstenv(addr_modes, data_address); + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE,d,80); + + /* Copy all registers in stack order. */ + __copy_to_user(d, register_base+offset, other); + if ( offset ) + __copy_to_user(d+other, register_base, offset); + RE_ENTRANT_CHECK_ON; + + finit(); +} + +/*===========================================================================*/ diff --git a/arch/i386/math-emu/reg_mul.c b/arch/i386/math-emu/reg_mul.c new file mode 100644 index 000000000000..40f50b61bc67 --- /dev/null +++ b/arch/i386/math-emu/reg_mul.c @@ -0,0 +1,132 @@ +/*---------------------------------------------------------------------------+ + | reg_mul.c | + | | + | Multiply one FPU_REG by another, put the result in a destination FPU_REG. | + | | + | Copyright (C) 1992,1993,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | Returns the tag of the result if no exceptions or errors occurred. | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | The destination may be any FPU_REG, including one of the source FPU_REGs. 
|
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_system.h"
+
+
+/*
+  Multiply two registers to give a register result.
+  The sources are st(deststnr) and (b,tagb).
+  The destination is st(deststnr).
+  control_w is handed on to FPU_u_mul().
+
+  The sign of the result is the exclusive-or of the source signs.
+
+  Returns the tag stored for the destination, or the value returned
+  by whichever helper handled an exceptional case (FPU_Exception,
+  real_2op_NaN(), arith_invalid()).  When FPU_u_mul() returns a
+  negative value the sign of the destination is put back to what it
+  was on entry and that value is returned.
+  */
+/* This routine must be called with non-empty source registers */
+int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
+{
+  FPU_REG *a = &st(deststnr);
+  FPU_REG *dest = a;
+  u_char taga = FPU_gettagi(deststnr);
+  u_char saved_sign = getsign(dest);
+  u_char sign = (getsign(a) ^ getsign(b));
+  int tag;
+
+
+  if ( !(taga | tagb) )
+    {
+      /* Both regs Valid, this should be the most common case. */
+
+      tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b));
+      if ( tag < 0 )
+        {
+          setsign(dest, saved_sign);
+          return tag;
+        }
+      FPU_settagi(deststnr, tag);
+      return tag;
+    }
+
+  /* Refine TAG_Special into the specific TW_ class of each operand. */
+  if ( taga == TAG_Special )
+    taga = FPU_Special(a);
+  if ( tagb == TAG_Special )
+    tagb = FPU_Special(b);
+
+  if ( ((taga == TAG_Valid) && (tagb == TW_Denormal))
+            || ((taga == TW_Denormal) && (tagb == TAG_Valid))
+            || ((taga == TW_Denormal) && (tagb == TW_Denormal)) )
+    {
+      FPU_REG x, y;
+      if ( denormal_operand() < 0 )
+        return FPU_Exception;
+
+      /* Multiply converted copies; a and b themselves are left alone
+         until FPU_u_mul() writes dest. */
+      FPU_to_exp16(a, &x);
+      FPU_to_exp16(b, &y);
+      tag = FPU_u_mul(&x, &y, dest, control_w, sign,
+                      exponent16(&x) + exponent16(&y));
+      if ( tag < 0 )
+        {
+          setsign(dest, saved_sign);
+          return tag;
+        }
+      FPU_settagi(deststnr, tag);
+      return tag;
+    }
+  else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) )
+    {
+      if ( ((tagb == TW_Denormal) || (taga == TW_Denormal))
+           && (denormal_operand() < 0) )
+        return FPU_Exception;
+
+      /* Must have either both arguments == zero, or
+         one valid and the other zero.
+         The result is therefore zero. */
+      FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+      /* The 80486 book says that the answer is +0, but a real
+         80486 behaves this way.
+         IEEE-754 apparently says it should be this way. */
+      setsign(dest, sign);
+      return TAG_Zero;
+    }
+      /* Must have infinities, NaNs, etc */
+  else if ( (taga == TW_NaN) || (tagb == TW_NaN) )
+    {
+      return real_2op_NaN(b, tagb, deststnr, &st(0));
+    }
+  else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero))
+            || ((tagb == TW_Infinity) && (taga == TAG_Zero)) )
+    {
+      return arith_invalid(deststnr);  /* Zero*Infinity is invalid */
+    }
+  else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal))
+            && (denormal_operand() < 0) )
+    {
+      return FPU_Exception;
+    }
+  else if (taga == TW_Infinity)
+    {
+      FPU_copy_to_regi(a, TAG_Special, deststnr);
+      setsign(dest, sign);
+      return TAG_Special;
+    }
+  else if (tagb == TW_Infinity)
+    {
+      FPU_copy_to_regi(b, TAG_Special, deststnr);
+      setsign(dest, sign);
+      return TAG_Special;
+    }
+
+#ifdef PARANOID
+  else
+    {
+      EXCEPTION(EX_INTERNAL|0x102);
+      return FPU_Exception;
+    }
+#endif /* PARANOID */
+
+        return 0;   /* reached only if none of the branches above returned */
+}
diff --git a/arch/i386/math-emu/reg_norm.S b/arch/i386/math-emu/reg_norm.S
new file mode 100644
index 000000000000..8b6352efceef
--- /dev/null
+++ b/arch/i386/math-emu/reg_norm.S
@@ -0,0 +1,147 @@
+/*---------------------------------------------------------------------------+
+ |  reg_norm.S                                                               |
+ |                                                                           |
+ | Copyright (C) 1992,1993,1994,1995,1997                                    |
+ |                       W. Metzenthen, 22 Parker St, Ormond, Vic 3163,      |
+ |                       Australia.  E-mail billm@suburbia.net               |
+ |                                                                           |
+ | Normalize the value in a FPU_REG.                                         |
+ |                                                                           |
+ | Call from C as:                                                           |
+ |    int FPU_normalize(FPU_REG *n)                                          |
+ |                                                                           |
+ |    int FPU_normalize_nuo(FPU_REG *n)                                      |
+ |                                                                           |
+ | Return value is the tag of the answer, or-ed with FPU_Exception if        |
+ | one was raised, or -1 on internal error.
|
+ |                                                                           |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+
+/*
+ * FPU_normalize: shift the 64-bit significand at PARAM1 left until
+ * bit 31 of SIGH is set, subtracting the shift count from EXP.
+ *
+ * A zero significand gets exponent 0 and TAG_Zero is returned.
+ * Otherwise the exponent is compared with EXP_OVER and EXP_UNDER:
+ * arith_overflow or arith_underflow is called with the register
+ * pointer when it is out of range, and %eax is not touched again
+ * before returning.  In range, the exponent has EXTENDED_Ebias added
+ * and is masked to 15 bits, and TAG_Valid is returned.
+ */
+.text
+ENTRY(FPU_normalize)
+        pushl   %ebp
+        movl    %esp,%ebp
+        pushl   %ebx
+
+        movl    PARAM1,%ebx
+
+        movl    SIGH(%ebx),%edx
+        movl    SIGL(%ebx),%eax
+
+        orl     %edx,%edx       /* ms bits */
+        js      L_done          /* Already normalized */
+        jnz     L_shift_1       /* Shift left 1 - 31 bits */
+
+        orl     %eax,%eax
+        jz      L_zero          /* The contents are zero */
+
+        /* SIGH is zero and SIGL is not: move SIGL up a whole word first. */
+        movl    %eax,%edx
+        xorl    %eax,%eax
+        subw    $32,EXP(%ebx)   /* This can cause an underflow */
+
+/* We need to shift left by 1 - 31 bits */
+L_shift_1:
+        bsrl    %edx,%ecx       /* get the required shift in %ecx */
+        subl    $31,%ecx
+        negl    %ecx            /* %ecx = 31 - (index of the top set bit) */
+        shld    %cl,%eax,%edx
+        shl     %cl,%eax
+        subw    %cx,EXP(%ebx)   /* This can cause an underflow */
+
+        movl    %edx,SIGH(%ebx)
+        movl    %eax,SIGL(%ebx)
+
+L_done:
+        cmpw    EXP_OVER,EXP(%ebx)
+        jge     L_overflow
+
+        cmpw    EXP_UNDER,EXP(%ebx)
+        jle     L_underflow
+
+L_exit_valid:
+        movl    TAG_Valid,%eax
+
+        /* Convert the exponent to 80x87 form. */
+        addw    EXTENDED_Ebias,EXP(%ebx)
+        andw    $0x7fff,EXP(%ebx)
+
+L_exit:
+        popl    %ebx
+        leave
+        ret
+
+
+L_zero:
+        movw    $0,EXP(%ebx)
+        movl    TAG_Zero,%eax
+        jmp     L_exit
+
+L_underflow:
+        /* Convert the exponent to 80x87 form. */
+        addw    EXTENDED_Ebias,EXP(%ebx)
+        push    %ebx
+        call    arith_underflow
+        pop     %ebx
+        jmp     L_exit
+
+L_overflow:
+        /* Convert the exponent to 80x87 form. */
+        addw    EXTENDED_Ebias,EXP(%ebx)
+        push    %ebx
+        call    arith_overflow
+        pop     %ebx
+        jmp     L_exit
+
+
+
+/* Normalise without reporting underflow or overflow.
+ *
+ * Performs the same left shift and exponent adjustment as
+ * FPU_normalize but never compares the exponent with EXP_OVER or
+ * EXP_UNDER and does not add EXTENDED_Ebias to it.  Returns TAG_Valid,
+ * or TAG_Zero with the exponent set to EXP_UNDER when the significand
+ * is zero.
+ */
+ENTRY(FPU_normalize_nuo)
+        pushl   %ebp
+        movl    %esp,%ebp
+        pushl   %ebx
+
+        movl    PARAM1,%ebx
+
+        movl    SIGH(%ebx),%edx
+        movl    SIGL(%ebx),%eax
+
+        orl     %edx,%edx               /* ms bits */
+        js      L_exit_nuo_valid        /* Already normalized */
+        jnz     L_nuo_shift_1           /* Shift left 1 - 31 bits */
+
+        orl     %eax,%eax
+        jz      L_exit_nuo_zero         /* The contents are zero */
+
+        movl    %eax,%edx
+        xorl    %eax,%eax
+        subw    $32,EXP(%ebx)   /* This can cause an underflow */
+
+/* We need to shift left by 1 - 31 bits */
+L_nuo_shift_1:
+        bsrl    %edx,%ecx       /* get the required shift in %ecx */
+        subl    $31,%ecx
+        negl    %ecx
+        shld    %cl,%eax,%edx
+        shl     %cl,%eax
+        subw    %cx,EXP(%ebx)   /* This can cause an underflow */
+
+        movl    %edx,SIGH(%ebx)
+        movl    %eax,SIGL(%ebx)
+
+L_exit_nuo_valid:
+        movl    TAG_Valid,%eax
+
+        popl    %ebx
+        leave
+        ret
+
+L_exit_nuo_zero:
+        movl    TAG_Zero,%eax
+        movw    EXP_UNDER,EXP(%ebx)
+
+        popl    %ebx
+        leave
+        ret
diff --git a/arch/i386/math-emu/reg_round.S b/arch/i386/math-emu/reg_round.S
new file mode 100644
index 000000000000..d1d4e48b4f67
--- /dev/null
+++ b/arch/i386/math-emu/reg_round.S
@@ -0,0 +1,708 @@
+        .file "reg_round.S"
+/*---------------------------------------------------------------------------+
+ |  reg_round.S                                                              |
+ |                                                                           |
+ | Rounding/truncation/etc for FPU basic arithmetic functions.               |
+ |                                                                           |
+ | Copyright (C) 1993,1995,1997                                              |
+ |                       W. Metzenthen, 22 Parker St, Ormond, Vic 3163,      |
+ |                       Australia.  E-mail billm@suburbia.net               |
+ |                                                                           |
+ | This code has four possible entry points.                                 |
+ | The following must be entered by a jmp instruction:                       |
+ |   fpu_reg_round, fpu_reg_round_sqrt, and fpu_Arith_exit.                  |
+ |                                                                           |
+ | The FPU_round entry point is intended to be used by C code.
| + | From C, call as: | + | int FPU_round(FPU_REG *arg, unsigned int extent, unsigned int control_w) | + | | + | Return value is the tag of the answer, or-ed with FPU_Exception if | + | one was raised, or -1 on internal error. | + | | + | For correct "up" and "down" rounding, the argument must have the correct | + | sign. | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Four entry points. | + | | + | Needed by both the fpu_reg_round and fpu_reg_round_sqrt entry points: | + | %eax:%ebx 64 bit significand | + | %edx 32 bit extension of the significand | + | %edi pointer to an FPU_REG for the result to be stored | + | stack calling function must have set up a C stack frame and | + | pushed %esi, %edi, and %ebx | + | | + | Needed just for the fpu_reg_round_sqrt entry point: | + | %cx A control word in the same format as the FPU control word. | + | Otherwise, PARAM4 must give such a value. | + | | + | | + | The significand and its extension are assumed to be exact in the | + | following sense: | + | If the significand by itself is the exact result then the significand | + | extension (%edx) must contain 0, otherwise the significand extension | + | must be non-zero. | + | If the significand extension is non-zero then the significand is | + | smaller than the magnitude of the correct exact result by an amount | + | greater than zero and less than one ls bit of the significand. | + | The significand extension is only required to have three possible | + | non-zero values: | + | less than 0x80000000 <=> the significand is less than 1/2 an ls | + | bit smaller than the magnitude of the | + | true exact result. | + | exactly 0x80000000 <=> the significand is exactly 1/2 an ls bit | + | smaller than the magnitude of the true | + | exact result. 
| + | greater than 0x80000000 <=> the significand is more than 1/2 an ls | + | bit smaller than the magnitude of the | + | true exact result. | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | The code in this module has become quite complex, but it should handle | + | all of the FPU flags which are set at this stage of the basic arithmetic | + | computations. | + | There are a few rare cases where the results are not set identically to | + | a real FPU. These require a bit more thought because at this stage the | + | results of the code here appear to be more consistent... | + | This may be changed in a future version. | + +---------------------------------------------------------------------------*/ + + +#include "fpu_emu.h" +#include "exception.h" +#include "control_w.h" + +/* Flags for FPU_bits_lost */ +#define LOST_DOWN $1 +#define LOST_UP $2 + +/* Flags for FPU_denormal */ +#define DENORMAL $1 +#define UNMASKED_UNDERFLOW $2 + + +#ifndef NON_REENTRANT_FPU +/* Make the code re-entrant by putting + local storage on the stack: */ +#define FPU_bits_lost (%esp) +#define FPU_denormal 1(%esp) + +#else +/* Not re-entrant, so we can gain speed by putting + local storage in a static area: */ +.data + .align 4,0 +FPU_bits_lost: + .byte 0 +FPU_denormal: + .byte 0 +#endif /* NON_REENTRANT_FPU */ + + +.text +.globl fpu_reg_round +.globl fpu_Arith_exit + +/* Entry point when called from C */ +ENTRY(FPU_round) + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ebx + + movl PARAM1,%edi + movl SIGH(%edi),%eax + movl SIGL(%edi),%ebx + movl PARAM2,%edx + +fpu_reg_round: /* Normal entry point */ + movl PARAM4,%ecx + +#ifndef NON_REENTRANT_FPU + pushl %ebx /* adjust the stack pointer */ +#endif /* NON_REENTRANT_FPU */ + +#ifdef PARANOID +/* Cannot use this here yet */ +/* orl %eax,%eax */ +/* jns L_entry_bugged */ +#endif /* PARANOID */ + + cmpw 
EXP_UNDER,EXP(%edi) + jle L_Make_denorm /* The number is a de-normal */ + + movb $0,FPU_denormal /* 0 -> not a de-normal */ + +Denorm_done: + movb $0,FPU_bits_lost /* No bits yet lost in rounding */ + + movl %ecx,%esi + andl CW_PC,%ecx + cmpl PR_64_BITS,%ecx + je LRound_To_64 + + cmpl PR_53_BITS,%ecx + je LRound_To_53 + + cmpl PR_24_BITS,%ecx + je LRound_To_24 + +#ifdef PECULIAR_486 +/* With the precision control bits set to 01 "(reserved)", a real 80486 + behaves as if the precision control bits were set to 11 "64 bits" */ + cmpl PR_RESERVED_BITS,%ecx + je LRound_To_64 +#ifdef PARANOID + jmp L_bugged_denorm_486 +#endif /* PARANOID */ +#else +#ifdef PARANOID + jmp L_bugged_denorm /* There is no bug, just a bad control word */ +#endif /* PARANOID */ +#endif /* PECULIAR_486 */ + + +/* Round etc to 24 bit precision */ +LRound_To_24: + movl %esi,%ecx + andl CW_RC,%ecx + cmpl RC_RND,%ecx + je LRound_nearest_24 + + cmpl RC_CHOP,%ecx + je LCheck_truncate_24 + + cmpl RC_UP,%ecx /* Towards +infinity */ + je LUp_24 + + cmpl RC_DOWN,%ecx /* Towards -infinity */ + je LDown_24 + +#ifdef PARANOID + jmp L_bugged_round24 +#endif /* PARANOID */ + +LUp_24: + cmpb SIGN_POS,PARAM5 + jne LCheck_truncate_24 /* If negative then up==truncate */ + + jmp LCheck_24_round_up + +LDown_24: + cmpb SIGN_POS,PARAM5 + je LCheck_truncate_24 /* If positive then down==truncate */ + +LCheck_24_round_up: + movl %eax,%ecx + andl $0x000000ff,%ecx + orl %ebx,%ecx + orl %edx,%ecx + jnz LDo_24_round_up + jmp L_Re_normalise + +LRound_nearest_24: + /* Do rounding of the 24th bit if needed (nearest or even) */ + movl %eax,%ecx + andl $0x000000ff,%ecx + cmpl $0x00000080,%ecx + jc LCheck_truncate_24 /* less than half, no increment needed */ + + jne LGreater_Half_24 /* greater than half, increment needed */ + + /* Possibly half, we need to check the ls bits */ + orl %ebx,%ebx + jnz LGreater_Half_24 /* greater than half, increment needed */ + + orl %edx,%edx + jnz LGreater_Half_24 /* greater than half, increment 
needed */ + + /* Exactly half, increment only if 24th bit is 1 (round to even) */ + testl $0x00000100,%eax + jz LDo_truncate_24 + +LGreater_Half_24: /* Rounding: increment at the 24th bit */ +LDo_24_round_up: + andl $0xffffff00,%eax /* Truncate to 24 bits */ + xorl %ebx,%ebx + movb LOST_UP,FPU_bits_lost + addl $0x00000100,%eax + jmp LCheck_Round_Overflow + +LCheck_truncate_24: + movl %eax,%ecx + andl $0x000000ff,%ecx + orl %ebx,%ecx + orl %edx,%ecx + jz L_Re_normalise /* No truncation needed */ + +LDo_truncate_24: + andl $0xffffff00,%eax /* Truncate to 24 bits */ + xorl %ebx,%ebx + movb LOST_DOWN,FPU_bits_lost + jmp L_Re_normalise + + +/* Round etc to 53 bit precision */ +LRound_To_53: + movl %esi,%ecx + andl CW_RC,%ecx + cmpl RC_RND,%ecx + je LRound_nearest_53 + + cmpl RC_CHOP,%ecx + je LCheck_truncate_53 + + cmpl RC_UP,%ecx /* Towards +infinity */ + je LUp_53 + + cmpl RC_DOWN,%ecx /* Towards -infinity */ + je LDown_53 + +#ifdef PARANOID + jmp L_bugged_round53 +#endif /* PARANOID */ + +LUp_53: + cmpb SIGN_POS,PARAM5 + jne LCheck_truncate_53 /* If negative then up==truncate */ + + jmp LCheck_53_round_up + +LDown_53: + cmpb SIGN_POS,PARAM5 + je LCheck_truncate_53 /* If positive then down==truncate */ + +LCheck_53_round_up: + movl %ebx,%ecx + andl $0x000007ff,%ecx + orl %edx,%ecx + jnz LDo_53_round_up + jmp L_Re_normalise + +LRound_nearest_53: + /* Do rounding of the 53rd bit if needed (nearest or even) */ + movl %ebx,%ecx + andl $0x000007ff,%ecx + cmpl $0x00000400,%ecx + jc LCheck_truncate_53 /* less than half, no increment needed */ + + jnz LGreater_Half_53 /* greater than half, increment needed */ + + /* Possibly half, we need to check the ls bits */ + orl %edx,%edx + jnz LGreater_Half_53 /* greater than half, increment needed */ + + /* Exactly half, increment only if 53rd bit is 1 (round to even) */ + testl $0x00000800,%ebx + jz LTruncate_53 + +LGreater_Half_53: /* Rounding: increment at the 53rd bit */ +LDo_53_round_up: + movb LOST_UP,FPU_bits_lost + andl 
$0xfffff800,%ebx /* Truncate to 53 bits */ + addl $0x00000800,%ebx + adcl $0,%eax + jmp LCheck_Round_Overflow + +LCheck_truncate_53: + movl %ebx,%ecx + andl $0x000007ff,%ecx + orl %edx,%ecx + jz L_Re_normalise + +LTruncate_53: + movb LOST_DOWN,FPU_bits_lost + andl $0xfffff800,%ebx /* Truncate to 53 bits */ + jmp L_Re_normalise + + +/* Round etc to 64 bit precision */ +LRound_To_64: + movl %esi,%ecx + andl CW_RC,%ecx + cmpl RC_RND,%ecx + je LRound_nearest_64 + + cmpl RC_CHOP,%ecx + je LCheck_truncate_64 + + cmpl RC_UP,%ecx /* Towards +infinity */ + je LUp_64 + + cmpl RC_DOWN,%ecx /* Towards -infinity */ + je LDown_64 + +#ifdef PARANOID + jmp L_bugged_round64 +#endif /* PARANOID */ + +LUp_64: + cmpb SIGN_POS,PARAM5 + jne LCheck_truncate_64 /* If negative then up==truncate */ + + orl %edx,%edx + jnz LDo_64_round_up + jmp L_Re_normalise + +LDown_64: + cmpb SIGN_POS,PARAM5 + je LCheck_truncate_64 /* If positive then down==truncate */ + + orl %edx,%edx + jnz LDo_64_round_up + jmp L_Re_normalise + +LRound_nearest_64: + cmpl $0x80000000,%edx + jc LCheck_truncate_64 + + jne LDo_64_round_up + + /* Now test for round-to-even */ + testb $1,%bl + jz LCheck_truncate_64 + +LDo_64_round_up: + movb LOST_UP,FPU_bits_lost + addl $1,%ebx + adcl $0,%eax + +LCheck_Round_Overflow: + jnc L_Re_normalise + + /* Overflow, adjust the result (significand to 1.0) */ + rcrl $1,%eax + rcrl $1,%ebx + incw EXP(%edi) + jmp L_Re_normalise + +LCheck_truncate_64: + orl %edx,%edx + jz L_Re_normalise + +LTruncate_64: + movb LOST_DOWN,FPU_bits_lost + +L_Re_normalise: + testb $0xff,FPU_denormal + jnz Normalise_result + +L_Normalised: + movl TAG_Valid,%edx + +L_deNormalised: + cmpb LOST_UP,FPU_bits_lost + je L_precision_lost_up + + cmpb LOST_DOWN,FPU_bits_lost + je L_precision_lost_down + +L_no_precision_loss: + /* store the result */ + +L_Store_significand: + movl %eax,SIGH(%edi) + movl %ebx,SIGL(%edi) + + cmpw EXP_OVER,EXP(%edi) + jge L_overflow + + movl %edx,%eax + + /* Convert the exponent to 80x87 
form. */ + addw EXTENDED_Ebias,EXP(%edi) + andw $0x7fff,EXP(%edi) + +fpu_reg_round_signed_special_exit: + + cmpb SIGN_POS,PARAM5 + je fpu_reg_round_special_exit + + orw $0x8000,EXP(%edi) /* Negative sign for the result. */ + +fpu_reg_round_special_exit: + +#ifndef NON_REENTRANT_FPU + popl %ebx /* adjust the stack pointer */ +#endif /* NON_REENTRANT_FPU */ + +fpu_Arith_exit: + popl %ebx + popl %edi + popl %esi + leave + ret + + +/* + * Set the FPU status flags to represent precision loss due to + * round-up. + */ +L_precision_lost_up: + push %edx + push %eax + call set_precision_flag_up + popl %eax + popl %edx + jmp L_no_precision_loss + +/* + * Set the FPU status flags to represent precision loss due to + * truncation. + */ +L_precision_lost_down: + push %edx + push %eax + call set_precision_flag_down + popl %eax + popl %edx + jmp L_no_precision_loss + + +/* + * The number is a denormal (which might get rounded up to a normal) + * Shift the number right the required number of bits, which will + * have to be undone later... + */ +L_Make_denorm: + /* The action to be taken depends upon whether the underflow + exception is masked */ + testb CW_Underflow,%cl /* Underflow mask. */ + jz Unmasked_underflow /* Do not make a denormal. */ + + movb DENORMAL,FPU_denormal + + pushl %ecx /* Save */ + movw EXP_UNDER+1,%cx + subw EXP(%edi),%cx + + cmpw $64,%cx /* shrd only works for 0..31 bits */ + jnc Denorm_shift_more_than_63 + + cmpw $32,%cx /* shrd only works for 0..31 bits */ + jnc Denorm_shift_more_than_32 + +/* + * We got here without jumps by assuming that the most common requirement + * is for a small de-normalising shift. 
+ * Shift by [1..31] bits + */ + addw %cx,EXP(%edi) + orl %edx,%edx /* extension */ + setne %ch /* Save whether %edx is non-zero */ + xorl %edx,%edx + shrd %cl,%ebx,%edx + shrd %cl,%eax,%ebx + shr %cl,%eax + orb %ch,%dl + popl %ecx + jmp Denorm_done + +/* Shift by [32..63] bits */ +Denorm_shift_more_than_32: + addw %cx,EXP(%edi) + subb $32,%cl + orl %edx,%edx + setne %ch + orb %ch,%bl + xorl %edx,%edx + shrd %cl,%ebx,%edx + shrd %cl,%eax,%ebx + shr %cl,%eax + orl %edx,%edx /* test these 32 bits */ + setne %cl + orb %ch,%bl + orb %cl,%bl + movl %ebx,%edx + movl %eax,%ebx + xorl %eax,%eax + popl %ecx + jmp Denorm_done + +/* Shift by [64..) bits */ +Denorm_shift_more_than_63: + cmpw $64,%cx + jne Denorm_shift_more_than_64 + +/* Exactly 64 bit shift */ + addw %cx,EXP(%edi) + xorl %ecx,%ecx + orl %edx,%edx + setne %cl + orl %ebx,%ebx + setne %ch + orb %ch,%cl + orb %cl,%al + movl %eax,%edx + xorl %eax,%eax + xorl %ebx,%ebx + popl %ecx + jmp Denorm_done + +Denorm_shift_more_than_64: + movw EXP_UNDER+1,EXP(%edi) +/* This is easy, %eax must be non-zero, so.. */ + movl $1,%edx + xorl %eax,%eax + xorl %ebx,%ebx + popl %ecx + jmp Denorm_done + + +Unmasked_underflow: + movb UNMASKED_UNDERFLOW,FPU_denormal + jmp Denorm_done + + +/* Undo the de-normalisation. */ +Normalise_result: + cmpb UNMASKED_UNDERFLOW,FPU_denormal + je Signal_underflow + +/* The number must be a denormal if we got here. */ +#ifdef PARANOID + /* But check it... just in case. */ + cmpw EXP_UNDER+1,EXP(%edi) + jne L_norm_bugged +#endif /* PARANOID */ + +#ifdef PECULIAR_486 + /* + * This implements a special feature of 80486 behaviour. + * Underflow will be signalled even if the number is + * not a denormal after rounding. + * This difference occurs only for masked underflow, and not + * in the unmasked case. + * Actual 80486 behaviour differs from this in some circumstances. 
+ */ + orl %eax,%eax /* ms bits */ + js LPseudoDenormal /* Will be masked underflow */ +#else + orl %eax,%eax /* ms bits */ + js L_Normalised /* No longer a denormal */ +#endif /* PECULIAR_486 */ + + jnz LDenormal_adj_exponent + + orl %ebx,%ebx + jz L_underflow_to_zero /* The contents are zero */ + +LDenormal_adj_exponent: + decw EXP(%edi) + +LPseudoDenormal: + testb $0xff,FPU_bits_lost /* bits lost == underflow */ + movl TAG_Special,%edx + jz L_deNormalised + + /* There must be a masked underflow */ + push %eax + pushl EX_Underflow + call EXCEPTION + popl %eax + popl %eax + movl TAG_Special,%edx + jmp L_deNormalised + + +/* + * The operations resulted in a number too small to represent. + * Masked response. + */ +L_underflow_to_zero: + push %eax + call set_precision_flag_down + popl %eax + + push %eax + pushl EX_Underflow + call EXCEPTION + popl %eax + popl %eax + +/* Reduce the exponent to EXP_UNDER */ + movw EXP_UNDER,EXP(%edi) + movl TAG_Zero,%edx + jmp L_Store_significand + + +/* The operations resulted in a number too large to represent. */ +L_overflow: + addw EXTENDED_Ebias,EXP(%edi) /* Set for unmasked response. */ + push %edi + call arith_overflow + pop %edi + jmp fpu_reg_round_signed_special_exit + + +Signal_underflow: + /* The number may have been changed to a non-denormal */ + /* by the rounding operations. 
*/ + cmpw EXP_UNDER,EXP(%edi) + jle Do_unmasked_underflow + + jmp L_Normalised + +Do_unmasked_underflow: + /* Increase the exponent by the magic number */ + addw $(3*(1<<13)),EXP(%edi) + push %eax + pushl EX_Underflow + call EXCEPTION + popl %eax + popl %eax + jmp L_Normalised + + +#ifdef PARANOID +#ifdef PECULIAR_486 +L_bugged_denorm_486: + pushl EX_INTERNAL|0x236 + call EXCEPTION + popl %ebx + jmp L_exception_exit +#else +L_bugged_denorm: + pushl EX_INTERNAL|0x230 + call EXCEPTION + popl %ebx + jmp L_exception_exit +#endif /* PECULIAR_486 */ + +L_bugged_round24: + pushl EX_INTERNAL|0x231 + call EXCEPTION + popl %ebx + jmp L_exception_exit + +L_bugged_round53: + pushl EX_INTERNAL|0x232 + call EXCEPTION + popl %ebx + jmp L_exception_exit + +L_bugged_round64: + pushl EX_INTERNAL|0x233 + call EXCEPTION + popl %ebx + jmp L_exception_exit + +L_norm_bugged: + pushl EX_INTERNAL|0x234 + call EXCEPTION + popl %ebx + jmp L_exception_exit + +L_entry_bugged: + pushl EX_INTERNAL|0x235 + call EXCEPTION + popl %ebx +L_exception_exit: + mov $-1,%eax + jmp fpu_reg_round_special_exit +#endif /* PARANOID */ diff --git a/arch/i386/math-emu/reg_u_add.S b/arch/i386/math-emu/reg_u_add.S new file mode 100644 index 000000000000..47c4c2434d85 --- /dev/null +++ b/arch/i386/math-emu/reg_u_add.S @@ -0,0 +1,167 @@ + .file "reg_u_add.S" +/*---------------------------------------------------------------------------+ + | reg_u_add.S | + | | + | Add two valid (TAG_Valid) FPU_REG numbers, of the same sign, and put the | + | result in a destination FPU_REG. | + | | + | Copyright (C) 1992,1993,1995,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | Call from C as: | + | int FPU_u_add(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, | + | int control_w) | + | Return value is the tag of the answer, or-ed with FPU_Exception if | + | one was raised, or -1 on internal error. 
| + | | + +---------------------------------------------------------------------------*/ + +/* + | Kernel addition routine FPU_u_add(reg *arg1, reg *arg2, reg *answ). + | Takes two valid reg f.p. numbers (TAG_Valid), which are + | treated as unsigned numbers, + | and returns their sum as a TAG_Valid or TAG_Special f.p. number. + | The returned number is normalized. + | Basic checks are performed if PARANOID is defined. + */ + +#include "exception.h" +#include "fpu_emu.h" +#include "control_w.h" + +.text +ENTRY(FPU_u_add) + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ebx + + movl PARAM1,%esi /* source 1 */ + movl PARAM2,%edi /* source 2 */ + + movl PARAM6,%ecx + movl %ecx,%edx + subl PARAM7,%ecx /* exp1 - exp2 */ + jge L_arg1_larger + + /* num1 is smaller */ + movl SIGL(%esi),%ebx + movl SIGH(%esi),%eax + + movl %edi,%esi + movl PARAM7,%edx + negw %cx /* exp1 - exp2 was negative; make the shift count positive */ + jmp L_accum_loaded + +L_arg1_larger: + /* num1 has larger or equal exponent */ + movl SIGL(%edi),%ebx + movl SIGH(%edi),%eax + +L_accum_loaded: + movl PARAM3,%edi /* destination */ + movw %dx,EXP(%edi) /* Copy exponent to destination */ + + xorl %edx,%edx /* clear the extension */ + +#ifdef PARANOID + testl $0x80000000,%eax + je L_bugged + + testl $0x80000000,SIGH(%esi) + je L_bugged +#endif /* PARANOID */ + +/* The number to be shifted is in %eax:%ebx:%edx */ + cmpw $32,%cx /* shrd only works for 0..31 bits */ + jnc L_more_than_31 + +/* less than 32 bits */ + shrd %cl,%ebx,%edx + shrd %cl,%eax,%ebx + shr %cl,%eax + jmp L_shift_done + +L_more_than_31: + cmpw $64,%cx + jnc L_more_than_63 + + subb $32,%cl + jz L_exactly_32 + + shrd %cl,%eax,%edx + shr %cl,%eax + orl %ebx,%ebx + jz L_more_31_no_low /* none of the lowest bits is set */ + + orl $1,%edx /* record the fact in the extension */ + +L_more_31_no_low: + movl %eax,%ebx + xorl %eax,%eax + jmp L_shift_done + +L_exactly_32: + movl %ebx,%edx + movl %eax,%ebx + xorl %eax,%eax + jmp L_shift_done + +L_more_than_63: + cmpw $65,%cx + jnc L_more_than_64 
+ + movl %eax,%edx + orl %ebx,%ebx + jz L_more_63_no_low + + orl $1,%edx + jmp L_more_63_no_low + +L_more_than_64: + movl $1,%edx /* The shifted number always has at least one '1' bit */ + +L_more_63_no_low: + xorl %ebx,%ebx + xorl %eax,%eax + +L_shift_done: + /* Now do the addition */ + addl SIGL(%esi),%ebx + adcl SIGH(%esi),%eax + jnc L_round_the_result + + /* Overflow, adjust the result */ + rcrl $1,%eax + rcrl $1,%ebx + rcrl $1,%edx + jnc L_no_bit_lost + + orl $1,%edx + +L_no_bit_lost: + incw EXP(%edi) + +L_round_the_result: + jmp fpu_reg_round /* Round the result */ + + + +#ifdef PARANOID +/* If we ever get here then we have problems! */ +L_bugged: + pushl EX_INTERNAL|0x201 + call EXCEPTION + pop %ebx + movl $-1,%eax + jmp L_exit + +L_exit: + popl %ebx + popl %edi + popl %esi + leave + ret +#endif /* PARANOID */ diff --git a/arch/i386/math-emu/reg_u_div.S b/arch/i386/math-emu/reg_u_div.S new file mode 100644 index 000000000000..cc00654b6f9a --- /dev/null +++ b/arch/i386/math-emu/reg_u_div.S @@ -0,0 +1,471 @@ + .file "reg_u_div.S" +/*---------------------------------------------------------------------------+ + | reg_u_div.S | + | | + | Divide one FPU_REG by another and put the result in a destination FPU_REG.| + | | + | Copyright (C) 1992,1993,1995,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Call from C as: | + | int FPU_u_div(FPU_REG *a, FPU_REG *b, FPU_REG *dest, | + | unsigned int control_word, char *sign) | + | | + | Does not compute the destination exponent, but does adjust it. | + | | + | Return value is the tag of the answer, or-ed with FPU_Exception if | + | one was raised, or -1 on internal error.
| + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "fpu_emu.h" +#include "control_w.h" + + +/* #define dSIGL(x) (x) */ +/* #define dSIGH(x) 4(x) */ + + +#ifndef NON_REENTRANT_FPU +/* + Local storage on the stack: + Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0 + Overflow flag: ovfl_flag + */ +#define FPU_accum_3 -4(%ebp) +#define FPU_accum_2 -8(%ebp) +#define FPU_accum_1 -12(%ebp) +#define FPU_accum_0 -16(%ebp) +#define FPU_result_1 -20(%ebp) +#define FPU_result_2 -24(%ebp) +#define FPU_ovfl_flag -28(%ebp) + +#else +.data +/* + Local storage in a static area: + Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0 + Overflow flag: ovfl_flag + */ + .align 4,0 +FPU_accum_3: + .long 0 +FPU_accum_2: + .long 0 +FPU_accum_1: + .long 0 +FPU_accum_0: + .long 0 +FPU_result_1: + .long 0 +FPU_result_2: + .long 0 +FPU_ovfl_flag: + .byte 0 +#endif /* NON_REENTRANT_FPU */ + +#define REGA PARAM1 +#define REGB PARAM2 +#define DEST PARAM3 + +.text +ENTRY(FPU_u_div) + pushl %ebp + movl %esp,%ebp +#ifndef NON_REENTRANT_FPU + subl $28,%esp +#endif /* NON_REENTRANT_FPU */ + + pushl %esi + pushl %edi + pushl %ebx + + movl REGA,%esi + movl REGB,%ebx + movl DEST,%edi + + movswl EXP(%esi),%edx + movswl EXP(%ebx),%eax + subl %eax,%edx + addl EXP_BIAS,%edx + + /* A denormal and a large number can cause an exponent underflow */ + cmpl EXP_WAY_UNDER,%edx + jg xExp_not_underflow + + /* Set to a really low value to allow correct handling */ + movl EXP_WAY_UNDER,%edx + +xExp_not_underflow: + + movw %dx,EXP(%edi) + +#ifdef PARANOID +/* testl $0x80000000, SIGH(%esi) // Dividend */ +/* je L_bugged */ + testl $0x80000000, SIGH(%ebx) /* Divisor */ + je L_bugged +#endif /* PARANOID */ + +/* Check if the divisor can be treated as having just 32 bits */ + cmpl $0,SIGL(%ebx) + jnz L_Full_Division /* Can't do a quick divide */ + +/* We should be able to zip through the division here */ + movl SIGH(%ebx),%ecx /* The divisor */ + 
movl SIGH(%esi),%edx /* Dividend */ + movl SIGL(%esi),%eax /* Dividend */ + + cmpl %ecx,%edx + setaeb FPU_ovfl_flag /* Keep a record */ + jb L_no_adjust + + subl %ecx,%edx /* Prevent the overflow */ + +L_no_adjust: + /* Divide the 64 bit number by the 32 bit denominator */ + divl %ecx + movl %eax,FPU_result_2 + + /* Work on the remainder of the first division */ + xorl %eax,%eax + divl %ecx + movl %eax,FPU_result_1 + + /* Work on the remainder of the 64 bit division */ + xorl %eax,%eax + divl %ecx + + testb $255,FPU_ovfl_flag /* was the num > denom ? */ + je L_no_overflow + + /* Do the shifting here */ + /* increase the exponent */ + incw EXP(%edi) + + /* shift the mantissa right one bit */ + stc /* To set the ms bit */ + rcrl FPU_result_2 + rcrl FPU_result_1 + rcrl %eax + +L_no_overflow: + jmp LRound_precision /* Do the rounding as required */ + + +/*---------------------------------------------------------------------------+ + | Divide: Return arg1/arg2 to arg3. | + | | + | This routine does not use the exponents of arg1 and arg2, but does | + | adjust the exponent of arg3. | + | | + | The maximum returned value is (ignoring exponents) | + | .ffffffff ffffffff | + | ------------------ = 1.ffffffff fffffffe | + | .80000000 00000000 | + | and the minimum is | + | .80000000 00000000 | + | ------------------ = .80000000 00000001 (rounded) | + | .ffffffff ffffffff | + | | + +---------------------------------------------------------------------------*/ + + +L_Full_Division: + /* Save extended dividend in local register */ + movl SIGL(%esi),%eax + movl %eax,FPU_accum_2 + movl SIGH(%esi),%eax + movl %eax,FPU_accum_3 + xorl %eax,%eax + movl %eax,FPU_accum_1 /* zero the extension */ + movl %eax,FPU_accum_0 /* zero the extension */ + + movl SIGL(%esi),%eax /* Get the current num */ + movl SIGH(%esi),%edx + +/*----------------------------------------------------------------------*/ +/* Initialization done. + Do the first 32 bits.
*/ + + movb $0,FPU_ovfl_flag + cmpl SIGH(%ebx),%edx /* Test for imminent overflow */ + jb LLess_than_1 + ja LGreater_than_1 + + cmpl SIGL(%ebx),%eax + jb LLess_than_1 + +LGreater_than_1: +/* The dividend is greater or equal, would cause overflow */ + setaeb FPU_ovfl_flag /* Keep a record */ + + subl SIGL(%ebx),%eax + sbbl SIGH(%ebx),%edx /* Prevent the overflow */ + movl %eax,FPU_accum_2 + movl %edx,FPU_accum_3 + +LLess_than_1: +/* At this point, we have a dividend < divisor, with a record of + adjustment in FPU_ovfl_flag */ + + /* We will divide by a number which is too large */ + movl SIGH(%ebx),%ecx + addl $1,%ecx + jnc LFirst_div_not_1 + + /* here we need to divide by 100000000h, + i.e., no division at all.. */ + mov %edx,%eax + jmp LFirst_div_done + +LFirst_div_not_1: + divl %ecx /* Divide the numerator by the augmented + denom ms dw */ + +LFirst_div_done: + movl %eax,FPU_result_2 /* Put the result in the answer */ + + mull SIGH(%ebx) /* mul by the ms dw of the denom */ + + subl %eax,FPU_accum_2 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_3 + + movl FPU_result_2,%eax /* Get the result back */ + mull SIGL(%ebx) /* now mul the ls dw of the denom */ + + subl %eax,FPU_accum_1 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_2 + sbbl $0,FPU_accum_3 + je LDo_2nd_32_bits /* Must check for non-zero result here */ + +#ifdef PARANOID + jb L_bugged_1 +#endif /* PARANOID */ + + /* need to subtract the denom once more */ + incl FPU_result_2 /* Correct the answer */ + + movl SIGL(%ebx),%eax + movl SIGH(%ebx),%edx + subl %eax,FPU_accum_1 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_2 + +#ifdef PARANOID + sbbl $0,FPU_accum_3 + jne L_bugged_1 /* Must check for non-zero result here */ +#endif /* PARANOID */ + +/*----------------------------------------------------------------------*/ +/* Half of the main problem is done, there is just a reduced numerator + to handle now.
+ Work with the second 32 bits, FPU_accum_0 not used from now on */ +LDo_2nd_32_bits: + movl FPU_accum_2,%edx /* get the reduced num */ + movl FPU_accum_1,%eax + + /* need to check for possible subsequent overflow */ + cmpl SIGH(%ebx),%edx + jb LDo_2nd_div + ja LPrevent_2nd_overflow + + cmpl SIGL(%ebx),%eax + jb LDo_2nd_div + +LPrevent_2nd_overflow: +/* The numerator is greater or equal, would cause overflow */ + /* prevent overflow */ + subl SIGL(%ebx),%eax + sbbl SIGH(%ebx),%edx + movl %edx,FPU_accum_2 + movl %eax,FPU_accum_1 + + incl FPU_result_2 /* Reflect the subtraction in the answer */ + +#ifdef PARANOID + je L_bugged_2 /* Can't bump the result to 1.0 */ +#endif /* PARANOID */ + +LDo_2nd_div: + cmpl $0,%ecx /* augmented denom msw */ + jnz LSecond_div_not_1 + + /* %ecx == 0, we are dividing by 1.0 */ + mov %edx,%eax + jmp LSecond_div_done + +LSecond_div_not_1: + divl %ecx /* Divide the numerator by the denom ms dw */ + +LSecond_div_done: + movl %eax,FPU_result_1 /* Put the result in the answer */ + + mull SIGH(%ebx) /* mul by the ms dw of the denom */ + + subl %eax,FPU_accum_1 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_2 + +#ifdef PARANOID + jc L_bugged_2 +#endif /* PARANOID */ + + movl FPU_result_1,%eax /* Get the result back */ + mull SIGL(%ebx) /* now mul the ls dw of the denom */ + + subl %eax,FPU_accum_0 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */ + sbbl $0,FPU_accum_2 + +#ifdef PARANOID + jc L_bugged_2 +#endif /* PARANOID */ + + jz LDo_3rd_32_bits + +#ifdef PARANOID + cmpl $1,FPU_accum_2 + jne L_bugged_2 +#endif /* PARANOID */ + + /* need to subtract the denom once more */ + movl SIGL(%ebx),%eax + movl SIGH(%ebx),%edx + subl %eax,FPU_accum_0 /* Subtract from the num local reg */ + sbbl %edx,FPU_accum_1 + sbbl $0,FPU_accum_2 + +#ifdef PARANOID + jc L_bugged_2 + jne L_bugged_2 +#endif /* PARANOID */ + + addl $1,FPU_result_1 /* Correct the answer */ + adcl $0,FPU_result_2 + 
+#ifdef PARANOID + jc L_bugged_2 /* Must check for non-zero result here */ +#endif /* PARANOID */ + +/*----------------------------------------------------------------------*/ +/* The division is essentially finished here, we just need to perform + tidying operations. + Deal with the 3rd 32 bits */ +LDo_3rd_32_bits: + movl FPU_accum_1,%edx /* get the reduced num */ + movl FPU_accum_0,%eax + + /* need to check for possible subsequent overflow */ + cmpl SIGH(%ebx),%edx /* denom */ + jb LRound_prep + ja LPrevent_3rd_overflow + + cmpl SIGL(%ebx),%eax /* denom */ + jb LRound_prep + +LPrevent_3rd_overflow: + /* prevent overflow */ + subl SIGL(%ebx),%eax + sbbl SIGH(%ebx),%edx + movl %edx,FPU_accum_1 + movl %eax,FPU_accum_0 + + addl $1,FPU_result_1 /* Reflect the subtraction in the answer */ + adcl $0,FPU_result_2 + jne LRound_prep + jnc LRound_prep + + /* This is a tricky spot, there is an overflow of the answer */ + movb $255,FPU_ovfl_flag /* Overflow -> 1.000 */ + +LRound_prep: +/* + * Prepare for rounding. + * To test for rounding, we just need to compare 2*accum with the + * denom. + */ + movl FPU_accum_0,%ecx + movl FPU_accum_1,%edx + movl %ecx,%eax + orl %edx,%eax + jz LRound_ovfl /* The accumulator contains zero. */ + + /* Multiply by 2 */ + clc + rcll $1,%ecx + rcll $1,%edx + jc LRound_large /* No need to compare, denom smaller */ + + subl SIGL(%ebx),%ecx + sbbl SIGH(%ebx),%edx + jnc LRound_not_small + + movl $0x70000000,%eax /* Denom was larger */ + jmp LRound_ovfl + +LRound_not_small: + jnz LRound_large + + movl $0x80000000,%eax /* Remainder was exactly 1/2 denom */ + jmp LRound_ovfl + +LRound_large: + movl $0xff000000,%eax /* Denom was smaller */ + +LRound_ovfl: +/* We are now ready to deal with rounding, but first we must get + the bits properly aligned */ + testb $255,FPU_ovfl_flag /* was the num > denom ?
*/ + je LRound_precision + + incw EXP(%edi) + + /* shift the mantissa right one bit */ + stc /* Will set the ms bit */ + rcrl FPU_result_2 + rcrl FPU_result_1 + rcrl %eax + +/* Round the result as required */ +LRound_precision: + decw EXP(%edi) /* binary point between 1st & 2nd bits */ + + movl %eax,%edx + movl FPU_result_1,%ebx + movl FPU_result_2,%eax + jmp fpu_reg_round + + +#ifdef PARANOID +/* The logic is wrong if we got here */ +L_bugged: + pushl EX_INTERNAL|0x202 + call EXCEPTION + pop %ebx + jmp L_exit + +L_bugged_1: + pushl EX_INTERNAL|0x203 + call EXCEPTION + pop %ebx + jmp L_exit + +L_bugged_2: + pushl EX_INTERNAL|0x204 + call EXCEPTION + pop %ebx + jmp L_exit + +L_exit: + movl $-1,%eax + popl %ebx + popl %edi + popl %esi + + leave + ret +#endif /* PARANOID */ diff --git a/arch/i386/math-emu/reg_u_mul.S b/arch/i386/math-emu/reg_u_mul.S new file mode 100644 index 000000000000..973f12af97df --- /dev/null +++ b/arch/i386/math-emu/reg_u_mul.S @@ -0,0 +1,148 @@ + .file "reg_u_mul.S" +/*---------------------------------------------------------------------------+ + | reg_u_mul.S | + | | + | Core multiplication routine | + | | + | Copyright (C) 1992,1993,1995,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | Basic multiplication routine. | + | Does not check the resulting exponent for overflow/underflow | + | | + | FPU_u_mul(FPU_REG *a, FPU_REG *b, FPU_REG *c, unsigned int cw); | + | | + | Internal working is at approx 128 bits. | + | Result is rounded to nearest 53 or 64 bits, using "nearest or even".
| + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "fpu_emu.h" +#include "control_w.h" + + + +#ifndef NON_REENTRANT_FPU +/* Local storage on the stack: */ +#define FPU_accum_0 -4(%ebp) /* ms word */ +#define FPU_accum_1 -8(%ebp) + +#else +/* Local storage in a static area: */ +.data + .align 4,0 +FPU_accum_0: + .long 0 +FPU_accum_1: + .long 0 +#endif /* NON_REENTRANT_FPU */ + + +.text +ENTRY(FPU_u_mul) + pushl %ebp + movl %esp,%ebp +#ifndef NON_REENTRANT_FPU + subl $8,%esp +#endif /* NON_REENTRANT_FPU */ + + pushl %esi + pushl %edi + pushl %ebx + + movl PARAM1,%esi + movl PARAM2,%edi + +#ifdef PARANOID + testl $0x80000000,SIGH(%esi) + jz L_bugged + testl $0x80000000,SIGH(%edi) + jz L_bugged +#endif /* PARANOID */ + + xorl %ecx,%ecx + xorl %ebx,%ebx + + movl SIGL(%esi),%eax + mull SIGL(%edi) + movl %eax,FPU_accum_0 + movl %edx,FPU_accum_1 + + movl SIGL(%esi),%eax + mull SIGH(%edi) + addl %eax,FPU_accum_1 + adcl %edx,%ebx +/* adcl $0,%ecx // overflow here is not possible */ + + movl SIGH(%esi),%eax + mull SIGL(%edi) + addl %eax,FPU_accum_1 + adcl %edx,%ebx + adcl $0,%ecx + + movl SIGH(%esi),%eax + mull SIGH(%edi) + addl %eax,%ebx + adcl %edx,%ecx + + /* Get the sum of the exponents.
*/ + movl PARAM6,%eax + subl EXP_BIAS-1,%eax + + /* Two denormals can cause an exponent underflow */ + cmpl EXP_WAY_UNDER,%eax + jg Exp_not_underflow + + /* Set to a really low value to allow correct handling */ + movl EXP_WAY_UNDER,%eax + +Exp_not_underflow: + +/* Have now finished with the sources */ + movl PARAM3,%edi /* Point to the destination */ + movw %ax,EXP(%edi) + +/* Now make sure that the result is normalized */ + testl $0x80000000,%ecx + jnz LResult_Normalised + + /* Normalize by shifting left one bit */ + shll $1,FPU_accum_0 + rcll $1,FPU_accum_1 + rcll $1,%ebx + rcll $1,%ecx + decw EXP(%edi) + +LResult_Normalised: + movl FPU_accum_0,%eax + movl FPU_accum_1,%edx + orl %eax,%eax + jz L_extent_zero + + orl $1,%edx + +L_extent_zero: + movl %ecx,%eax + jmp fpu_reg_round + + +#ifdef PARANOID +L_bugged: + pushl EX_INTERNAL|0x205 + call EXCEPTION + pop %ebx + jmp L_exit + +L_exit: + popl %ebx + popl %edi + popl %esi + leave + ret +#endif /* PARANOID */ + diff --git a/arch/i386/math-emu/reg_u_sub.S b/arch/i386/math-emu/reg_u_sub.S new file mode 100644 index 000000000000..1b6c24801d22 --- /dev/null +++ b/arch/i386/math-emu/reg_u_sub.S @@ -0,0 +1,272 @@ + .file "reg_u_sub.S" +/*---------------------------------------------------------------------------+ + | reg_u_sub.S | + | | + | Core floating point subtraction routine. | + | | + | Copyright (C) 1992,1993,1995,1997 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@suburbia.net | + | | + | Call from C as: | + | int FPU_u_sub(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, | + | int control_w) | + | Return value is the tag of the answer, or-ed with FPU_Exception if | + | one was raised, or -1 on internal error. | + | | + +---------------------------------------------------------------------------*/ + +/* + | Kernel subtraction routine FPU_u_sub(reg *arg1, reg *arg2, reg *answ). + | Takes two valid reg f.p.
numbers (TAG_Valid), which are + | treated as unsigned numbers, + | and returns their difference as a TAG_Valid or TAG_Zero f.p. + | number. + | The first number (arg1) must be the larger. + | The returned number is normalized. + | Basic checks are performed if PARANOID is defined. + */ + +#include "exception.h" +#include "fpu_emu.h" +#include "control_w.h" + +.text +ENTRY(FPU_u_sub) + pushl %ebp + movl %esp,%ebp + pushl %esi + pushl %edi + pushl %ebx + + movl PARAM1,%esi /* source 1 */ + movl PARAM2,%edi /* source 2 */ + + movl PARAM6,%ecx + subl PARAM7,%ecx /* exp1 - exp2 */ + +#ifdef PARANOID + /* source 2 is always smaller than source 1 */ + js L_bugged_1 + + testl $0x80000000,SIGH(%edi) /* The args are assumed to be normalized */ + je L_bugged_2 + + testl $0x80000000,SIGH(%esi) + je L_bugged_2 +#endif /* PARANOID */ + +/*--------------------------------------+ + | Form a register holding the | + | smaller number | + +--------------------------------------*/ + movl SIGH(%edi),%eax /* register ms word */ + movl SIGL(%edi),%ebx /* register ls word */ + + movl PARAM3,%edi /* destination */ + movl PARAM6,%edx + movw %dx,EXP(%edi) /* Copy exponent to destination */ + + xorl %edx,%edx /* register extension */ + +/*--------------------------------------+ + | Shift the temporary register | + | right the required number of | + | places.
| + +--------------------------------------*/ + + cmpw $32,%cx /* shrd only works for 0..31 bits */ + jnc L_more_than_31 + +/* less than 32 bits */ + shrd %cl,%ebx,%edx + shrd %cl,%eax,%ebx + shr %cl,%eax + jmp L_shift_done + +L_more_than_31: + cmpw $64,%cx + jnc L_more_than_63 + + subb $32,%cl + jz L_exactly_32 + + shrd %cl,%eax,%edx + shr %cl,%eax + orl %ebx,%ebx + jz L_more_31_no_low /* none of the lowest bits is set */ + + orl $1,%edx /* record the fact in the extension */ + +L_more_31_no_low: + movl %eax,%ebx + xorl %eax,%eax + jmp L_shift_done + +L_exactly_32: + movl %ebx,%edx + movl %eax,%ebx + xorl %eax,%eax + jmp L_shift_done + +L_more_than_63: + cmpw $65,%cx + jnc L_more_than_64 + + /* Shift right by 64 bits */ + movl %eax,%edx + orl %ebx,%ebx + jz L_more_63_no_low + + orl $1,%edx + jmp L_more_63_no_low + +L_more_than_64: + jne L_more_than_65 + + /* Shift right by 65 bits */ + /* Carry is clear if we get here */ + movl %eax,%edx + rcrl %edx + jnc L_shift_65_nc + + orl $1,%edx + jmp L_more_63_no_low + +L_shift_65_nc: + orl %ebx,%ebx + jz L_more_63_no_low + + orl $1,%edx + jmp L_more_63_no_low + +L_more_than_65: + movl $1,%edx /* The shifted number always has at least one '1' bit */ + +L_more_63_no_low: + xorl %ebx,%ebx + xorl %eax,%eax + +L_shift_done: +L_subtr: +/*------------------------------+ + | Do the subtraction | + +------------------------------*/ + xorl %ecx,%ecx + subl %edx,%ecx + movl %ecx,%edx + movl SIGL(%esi),%ecx + sbbl %ebx,%ecx + movl %ecx,%ebx + movl SIGH(%esi),%ecx + sbbl %eax,%ecx + movl %ecx,%eax + +#ifdef PARANOID + /* We can never get a borrow */ + jc L_bugged +#endif /* PARANOID */ + +/*--------------------------------------+ + | Normalize the result | + +--------------------------------------*/ + testl $0x80000000,%eax + jnz L_round /* no shifting needed */ + + orl %eax,%eax + jnz L_shift_1 /* shift left 1 - 31 bits */ + + orl %ebx,%ebx + jnz L_shift_32 /* shift left 32 - 63 bits */ + +/* + * A rare case, the only one which is non-zero if we 
got here + * is: 1000000 .... 0000 + * -0111111 .... 1111 1 + * -------------------- + * 0000000 .... 0000 1 + */ + + cmpl $0x80000000,%edx + jnz L_must_be_zero + + /* Shift left 64 bits */ + subw $64,EXP(%edi) + xchg %edx,%eax + jmp fpu_reg_round + +L_must_be_zero: +#ifdef PARANOID + orl %edx,%edx + jnz L_bugged_3 +#endif /* PARANOID */ + + /* The result is zero */ + movw $0,EXP(%edi) /* exponent */ + movl $0,SIGL(%edi) + movl $0,SIGH(%edi) + movl TAG_Zero,%eax + jmp L_exit + +L_shift_32: + movl %ebx,%eax + movl %edx,%ebx + movl $0,%edx + subw $32,EXP(%edi) /* Can get underflow here */ + +/* We need to shift left by 1 - 31 bits */ +L_shift_1: + bsrl %eax,%ecx /* get the required shift in %ecx */ + subl $31,%ecx + negl %ecx + shld %cl,%ebx,%eax + shld %cl,%edx,%ebx + shl %cl,%edx + subw %cx,EXP(%edi) /* Can get underflow here */ + +L_round: + jmp fpu_reg_round /* Round the result */ + + +#ifdef PARANOID +L_bugged_1: + pushl EX_INTERNAL|0x206 + call EXCEPTION + pop %ebx + jmp L_error_exit + +L_bugged_2: + pushl EX_INTERNAL|0x209 + call EXCEPTION + pop %ebx + jmp L_error_exit + +L_bugged_3: + pushl EX_INTERNAL|0x210 + call EXCEPTION + pop %ebx + jmp L_error_exit + +L_bugged_4: + pushl EX_INTERNAL|0x211 + call EXCEPTION + pop %ebx + jmp L_error_exit + +L_bugged: + pushl EX_INTERNAL|0x212 + call EXCEPTION + pop %ebx + jmp L_error_exit + +L_error_exit: + movl $-1,%eax + +#endif /* PARANOID */ + +L_exit: + popl %ebx + popl %edi + popl %esi + leave + ret diff --git a/arch/i386/math-emu/round_Xsig.S b/arch/i386/math-emu/round_Xsig.S new file mode 100644 index 000000000000..bbe0e87718e4 --- /dev/null +++ b/arch/i386/math-emu/round_Xsig.S @@ -0,0 +1,141 @@ +/*---------------------------------------------------------------------------+ + | round_Xsig.S | + | | + | Copyright (C) 1992,1993,1994,1995 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@jacobi.maths.monash.edu.au | + | | + | Normalize and round a 12 byte quantity.
| + | Call from C as: | + | int round_Xsig(Xsig *n) | + | | + | Normalize a 12 byte quantity. | + | Call from C as: | + | int norm_Xsig(Xsig *n) | + | | + | Each function returns the size of the shift (nr of bits). | + | | + +---------------------------------------------------------------------------*/ + .file "round_Xsig.S" + +#include "fpu_emu.h" + + +.text +ENTRY(round_Xsig) + pushl %ebp + movl %esp,%ebp + pushl %ebx /* Reserve some space */ + pushl %ebx /* Save %ebx; restored by the popl at exit */ + pushl %esi + + movl PARAM1,%esi + + movl 8(%esi),%edx + movl 4(%esi),%ebx + movl (%esi),%eax + + movl $0,-4(%ebp) + + orl %edx,%edx /* ms bits */ + js L_round /* Already normalized */ + jnz L_shift_1 /* Shift left 1 - 31 bits */ + + movl %ebx,%edx + movl %eax,%ebx + xorl %eax,%eax + movl $-32,-4(%ebp) + +/* We need to shift left by 1 - 31 bits */ +L_shift_1: + bsrl %edx,%ecx /* get the required shift in %ecx; NOTE(review): also reached by fall-through after the 32 bit shift above, where %edx is not re-tested and may be zero - confirm callers never pass such a value */ + subl $31,%ecx + negl %ecx + subl %ecx,-4(%ebp) + shld %cl,%ebx,%edx + shld %cl,%eax,%ebx + shl %cl,%eax + +L_round: + testl $0x80000000,%eax + jz L_exit + + addl $1,%ebx + adcl $0,%edx + jnz L_exit + + movl $0x80000000,%edx + incl -4(%ebp) + +L_exit: + movl %edx,8(%esi) + movl %ebx,4(%esi) + movl %eax,(%esi) + + movl -4(%ebp),%eax + + popl %esi + popl %ebx + leave + ret + + + + +ENTRY(norm_Xsig) + pushl %ebp + movl %esp,%ebp + pushl %ebx /* Reserve some space */ + pushl %ebx /* Save %ebx; restored by the popl at exit */ + pushl %esi + + movl PARAM1,%esi + + movl 8(%esi),%edx + movl 4(%esi),%ebx + movl (%esi),%eax + + movl $0,-4(%ebp) + + orl %edx,%edx /* ms bits */ + js L_n_exit /* Already normalized */ + jnz L_n_shift_1 /* Shift left 1 - 31 bits */ + + movl %ebx,%edx + movl %eax,%ebx + xorl %eax,%eax + movl $-32,-4(%ebp) + + orl %edx,%edx /* ms bits */ + js L_n_exit /* Normalized now */ + jnz L_n_shift_1 /* Shift left 1 - 31 bits */ + + movl %ebx,%edx + movl %eax,%ebx + xorl %eax,%eax + addl $-32,-4(%ebp) + jmp L_n_exit /* Might not be normalized, + but shift no more.
*/ + +/* We need to shift left by 1 - 31 bits */ +L_n_shift_1: + bsrl %edx,%ecx /* get the required shift in %ecx */ + subl $31,%ecx + negl %ecx + subl %ecx,-4(%ebp) + shld %cl,%ebx,%edx + shld %cl,%eax,%ebx + shl %cl,%eax + +L_n_exit: + movl %edx,8(%esi) + movl %ebx,4(%esi) + movl %eax,(%esi) + + movl -4(%ebp),%eax + + popl %esi + popl %ebx + leave + ret + diff --git a/arch/i386/math-emu/shr_Xsig.S b/arch/i386/math-emu/shr_Xsig.S new file mode 100644 index 000000000000..31cdd118e918 --- /dev/null +++ b/arch/i386/math-emu/shr_Xsig.S @@ -0,0 +1,87 @@ + .file "shr_Xsig.S" +/*---------------------------------------------------------------------------+ + | shr_Xsig.S | + | | + | 12 byte right shift function | + | | + | Copyright (C) 1992,1994,1995 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@jacobi.maths.monash.edu.au | + | | + | Call from C as: | + | void shr_Xsig(Xsig *arg, unsigned nr) | + | | + | Extended shift right function. | + | Fastest for small shifts. | + | Shifts the 12 byte quantity pointed to by the first arg (arg) | + | right by the number of bits specified by the second arg (nr).
| + | | + +---------------------------------------------------------------------------*/ + +#include "fpu_emu.h" + +.text +ENTRY(shr_Xsig) + push %ebp + movl %esp,%ebp + pushl %esi + movl PARAM2,%ecx + movl PARAM1,%esi + cmpl $32,%ecx /* shrd only works for 0..31 bits */ + jnc L_more_than_31 + +/* less than 32 bits */ + pushl %ebx + movl (%esi),%eax /* lsl */ + movl 4(%esi),%ebx /* midl */ + movl 8(%esi),%edx /* msl */ + shrd %cl,%ebx,%eax + shrd %cl,%edx,%ebx + shr %cl,%edx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edx,8(%esi) + popl %ebx + popl %esi + leave + ret + +L_more_than_31: + cmpl $64,%ecx + jnc L_more_than_63 + + subb $32,%cl /* whole-word move below covers 32 bits; shift by the rest */ + movl 4(%esi),%eax /* midl */ + movl 8(%esi),%edx /* msl */ + shrd %cl,%edx,%eax + shr %cl,%edx + movl %eax,(%esi) + movl %edx,4(%esi) + movl $0,8(%esi) + popl %esi + leave + ret + +L_more_than_63: + cmpl $96,%ecx + jnc L_more_than_95 + + subb $64,%cl /* two whole words are dropped; shift by the rest */ + movl 8(%esi),%eax /* msl */ + shr %cl,%eax + xorl %edx,%edx + movl %eax,(%esi) + movl %edx,4(%esi) + movl %edx,8(%esi) + popl %esi + leave + ret + +L_more_than_95: + xorl %eax,%eax + movl %eax,(%esi) + movl %eax,4(%esi) + movl %eax,8(%esi) + popl %esi + leave + ret diff --git a/arch/i386/math-emu/status_w.h b/arch/i386/math-emu/status_w.h new file mode 100644 index 000000000000..78d7b7689dd6 --- /dev/null +++ b/arch/i386/math-emu/status_w.h @@ -0,0 +1,65 @@ +/*---------------------------------------------------------------------------+ + | status_w.h | + | | + | Copyright (C) 1992,1993 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia.
E-mail billm@vaxc.cc.monash.edu.au | + | | + +---------------------------------------------------------------------------*/ + +#ifndef _STATUS_H_ +#define _STATUS_H_ + +#include "fpu_emu.h" /* for definition of PECULIAR_486 */ + +#ifdef __ASSEMBLY__ /* in assembly, constants get a '$' prefix (immediate operand) */ +#define Const__(x) $##x +#else +#define Const__(x) x +#endif + +#define SW_Backward Const__(0x8000) /* backward compatibility */ +#define SW_C3 Const__(0x4000) /* condition bit 3 */ +#define SW_Top Const__(0x3800) /* top of stack */ +#define SW_Top_Shift Const__(11) /* shift for top of stack bits */ +#define SW_C2 Const__(0x0400) /* condition bit 2 */ +#define SW_C1 Const__(0x0200) /* condition bit 1 */ +#define SW_C0 Const__(0x0100) /* condition bit 0 */ +#define SW_Summary Const__(0x0080) /* exception summary */ +#define SW_Stack_Fault Const__(0x0040) /* stack fault */ +#define SW_Precision Const__(0x0020) /* loss of precision */ +#define SW_Underflow Const__(0x0010) /* underflow */ +#define SW_Overflow Const__(0x0008) /* overflow */ +#define SW_Zero_Div Const__(0x0004) /* divide by zero */ +#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */ +#define SW_Invalid Const__(0x0001) /* invalid operation */ + +#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ + +#ifndef __ASSEMBLY__ + +#define COMP_A_gt_B 1 +#define COMP_A_eq_B 2 +#define COMP_A_lt_B 3 +#define COMP_No_Comp 4 +#define COMP_Denormal 0x20 +#define COMP_NaN 0x40 +#define COMP_SNaN 0x80 + /* status_word(): partial_status with its SW_Top field replaced by `top` */ +#define status_word() \ + ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) +#define setcc(cc) ({ \ + partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); \ + partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); }) + +#ifdef PECULIAR_486 + /* Default, this conveys no information, but an 80486 does it. */ + /* Clear the SW_C1 bit, "other bits undefined".
*/ +# define clear_C1() { partial_status &= ~SW_C1; } +# else +# define clear_C1() +#endif /* PECULIAR_486 */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _STATUS_H_ */ diff --git a/arch/i386/math-emu/version.h b/arch/i386/math-emu/version.h new file mode 100644 index 000000000000..a0d73a1d2b67 --- /dev/null +++ b/arch/i386/math-emu/version.h @@ -0,0 +1,12 @@ +/*---------------------------------------------------------------------------+ + | version.h | + | | + | | + | Copyright (C) 1992,1993,1994,1996,1997,1999 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia | + | E-mail billm@melbpc.org.au | + | | + | | + +---------------------------------------------------------------------------*/ + +#define FPU_VERSION "wm-FPU-emu version 2.01" diff --git a/arch/i386/math-emu/wm_shrx.S b/arch/i386/math-emu/wm_shrx.S new file mode 100644 index 000000000000..518428317985 --- /dev/null +++ b/arch/i386/math-emu/wm_shrx.S @@ -0,0 +1,204 @@ + .file "wm_shrx.S" +/*---------------------------------------------------------------------------+ + | wm_shrx.S | + | | + | 64 bit right shift functions | + | | + | Copyright (C) 1992,1995 | + | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, | + | Australia. E-mail billm@jacobi.maths.monash.edu.au | + | | + | Call from C as: | + | unsigned FPU_shrx(void *arg1, unsigned arg2) | + | and | + | unsigned FPU_shrxs(void *arg1, unsigned arg2) | + | | + +---------------------------------------------------------------------------*/ + +#include "fpu_emu.h" + +.text +/*---------------------------------------------------------------------------+ + | unsigned FPU_shrx(void *arg1, unsigned arg2) | + | | + | Extended shift right function. | + | Fastest for small shifts. | + | Shifts the 64 bit quantity pointed to by the first arg (arg1) | + | right by the number of bits specified by the second arg (arg2).
| + | Forms a 96 bit quantity from the 64 bit arg and eax: | + | [ 64 bit arg ][ eax ] | + | shift right ---------> | + | The eax register is initialized to 0 before the shifting. | + | Results returned in the 64 bit arg and eax. | + +---------------------------------------------------------------------------*/ + +ENTRY(FPU_shrx) + push %ebp + movl %esp,%ebp + pushl %esi + movl PARAM2,%ecx + movl PARAM1,%esi + cmpl $32,%ecx /* shrd only works for 0..31 bits */ + jnc L_more_than_31 + +/* less than 32 bits */ + pushl %ebx + movl (%esi),%ebx /* lsl */ + movl 4(%esi),%edx /* msl */ + xorl %eax,%eax /* extension */ + shrd %cl,%ebx,%eax + shrd %cl,%edx,%ebx + shr %cl,%edx + movl %ebx,(%esi) + movl %edx,4(%esi) + popl %ebx + popl %esi + leave + ret + +L_more_than_31: + cmpl $64,%ecx + jnc L_more_than_63 + + subb $32,%cl /* the lsl itself becomes the extension; shift by the rest */ + movl (%esi),%eax /* lsl */ + movl 4(%esi),%edx /* msl */ + shrd %cl,%edx,%eax + shr %cl,%edx + movl %edx,(%esi) + movl $0,4(%esi) + popl %esi + leave + ret + +L_more_than_63: + cmpl $96,%ecx + jnc L_more_than_95 + + subb $64,%cl /* only part of the msl survives, in eax; shift by the rest */ + movl 4(%esi),%eax /* msl */ + shr %cl,%eax + xorl %edx,%edx + movl %edx,(%esi) + movl %edx,4(%esi) + popl %esi + leave + ret + +L_more_than_95: + xorl %eax,%eax + movl %eax,(%esi) + movl %eax,4(%esi) + popl %esi + leave + ret + + +/*---------------------------------------------------------------------------+ + | unsigned FPU_shrxs(void *arg1, unsigned arg2) | + | | + | Extended shift right function (optimized for small floating point | + | integers). | + | Shifts the 64 bit quantity pointed to by the first arg (arg1) | + | right by the number of bits specified by the second arg (arg2). | + | Forms a 96 bit quantity from the 64 bit arg and eax: | + | [ 64 bit arg ][ eax ] | + | shift right ---------> | + | The eax register is initialized to 0 before the shifting.
/*
 * unsigned FPU_shrxs(void *arg1, unsigned arg2)
 *
 * Shift the 64 bit quantity at arg1 right by arg2 bits, in place, like
 * FPU_shrx, but additionally report whether any low-order bits were lost.
 *   PARAM1 (arg1): pointer to the value; ls word at 0(arg1), ms word at 4(arg1)
 *   PARAM2 (arg2): shift count
 * Returns the extension word in %eax with its low byte (%al) replaced by a
 * flag: 0x01 if any of the examined shifted-out bits is set, otherwise 0x00.
 *
 * The [32..63] case is laid out as the fall-through path, on the assumption
 * stated below that it is the most common request.
 */
ENTRY(FPU_shrxs)
	push	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%ebx
	movl	PARAM2,%ecx
	movl	PARAM1,%esi
	cmpl	$64,%ecx	/* counts of 64 and above are handled separately */
	jnc	Ls_more_than_63

	cmpl	$32,%ecx	/* shrd only works for 0..31 bits */
	jc	Ls_less_than_32

/* We got here without jumps by assuming that the most common requirement
   is for small integers */
/* Shift by [32..63] bits */
	subb	$32,%cl
	movl	(%esi),%eax	/* lsl */
	movl	4(%esi),%edx	/* msl */
	xorl	%ebx,%ebx
	shrd	%cl,%eax,%ebx	/* %ebx collects the bits falling below the extension */
	shrd	%cl,%edx,%eax	/* %eax becomes the extension word */
	shr	%cl,%edx
	orl	%ebx,%ebx		/* test these 32 bits */
	setne	%bl
	test	$0x7fffffff,%eax	/* and 31 bits here */
	setne	%bh
	orw	%bx,%bx			/* Any of the 63 bit set ? */
	setne	%al
	movl	%edx,(%esi)
	movl	$0,4(%esi)
	popl	%ebx
	popl	%esi
	leave
	ret

/* Shift by [0..31] bits */
Ls_less_than_32:
	movl	(%esi),%ebx	/* lsl */
	movl	4(%esi),%edx	/* msl */
	xorl	%eax,%eax	/* extension */
	shrd	%cl,%ebx,%eax
	shrd	%cl,%edx,%ebx
	shr	%cl,%edx
	test	$0x7fffffff,%eax	/* only need to look at eax here */
	setne	%al
	movl	%ebx,(%esi)
	movl	%edx,4(%esi)
	popl	%ebx
	popl	%esi
	leave
	ret

/* Shift by [64..95] bits */
Ls_more_than_63:
	cmpl	$96,%ecx
	jnc	Ls_more_than_95

	subb	$64,%cl
	movl	(%esi),%ebx	/* lsl */
	movl	4(%esi),%eax	/* msl */
	xorl	%edx,%edx	/* extension */
	shrd	%cl,%ebx,%edx
	shrd	%cl,%eax,%ebx
	shr	%cl,%eax	/* what is left of msl is the returned extension */
	orl	%ebx,%edx	/* any bit set in the 64 bits below the extension? */
	setne	%bl
	test	$0x7fffffff,%eax	/* plus the low 31 bits of the extension */
	setne	%bh
	orw	%bx,%bx
	setne	%al
	xorl	%edx,%edx
	movl	%edx,(%esi)	/* set to zero */
	movl	%edx,4(%esi)	/* set to zero */
	popl	%ebx
	popl	%esi
	leave
	ret

Ls_more_than_95:
/* Shift by [96..inf) bits */
	xorl	%eax,%eax
	movl	(%esi),%ebx
	orl	4(%esi),%ebx	/* flag is set iff the original value was non-zero */
	setne	%al
	xorl	%ebx,%ebx
	movl	%ebx,(%esi)
	movl	%ebx,4(%esi)
	popl	%ebx
	popl	%esi
	leave
	ret
E-mail billm@suburbia.net | + | | + | Call from C as: | + | int wm_sqrt(FPU_REG *n, unsigned int control_word) | + | | + +---------------------------------------------------------------------------*/ + +/*---------------------------------------------------------------------------+ + | wm_sqrt(FPU_REG *n, unsigned int control_word) | + | returns the square root of n in n. | + | | + | Use Newton's method to compute the square root of a number, which must | + | be in the range [1.0 .. 4.0), to 64 bits accuracy. | + | Does not check the sign or tag of the argument. | + | Sets the exponent, but not the sign or tag of the result. | + | | + | The guess is kept in %esi:%edi | + +---------------------------------------------------------------------------*/ + +#include "exception.h" +#include "fpu_emu.h" + + +#ifndef NON_REENTRANT_FPU +/* Local storage on the stack: */ +#define FPU_accum_3 -4(%ebp) /* ms word */ +#define FPU_accum_2 -8(%ebp) +#define FPU_accum_1 -12(%ebp) +#define FPU_accum_0 -16(%ebp) + +/* + * The de-normalised argument: + * sq_2 sq_1 sq_0 + * b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0 + * ^ binary point here + */ +#define FPU_fsqrt_arg_2 -20(%ebp) /* ms word */ +#define FPU_fsqrt_arg_1 -24(%ebp) +#define FPU_fsqrt_arg_0 -28(%ebp) /* ls word, at most the ms bit is set */ + +#else +/* Local storage in a static area: */ +.data + .align 4,0 +FPU_accum_3: + .long 0 /* ms word */ +FPU_accum_2: + .long 0 +FPU_accum_1: + .long 0 +FPU_accum_0: + .long 0 + +/* The de-normalised argument: + sq_2 sq_1 sq_0 + b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 
/*
 * int wm_sqrt(FPU_REG *n, unsigned int control_word)
 *
 * Replace the significand of *n with its square root.
 *
 * Outline:
 *   1. Copy the significand into FPU_fsqrt_arg_2:1:0, shifted right one bit
 *      when the exponent equals EXP_BIAS.
 *   2. Form a linear first guess, then refine it with three 32 bit
 *      iterations of  g := (g + n/g)/2.
 *   3. Refine twice more using  g := g + (n - g^2) / (2 * g)  at increasing
 *      precision; the guess lives in %esi:%edi with extra bits in %eax.
 *   4. If the extra bits are too close to 0 or to one half to be trusted
 *      for rounding, square the estimate and compare with n to decide.
 *   5. Tail-jump to fpu_reg_round with %eax = ms word, %ebx = ls word,
 *      %edx = rounding extension and %edi = PARAM1.
 *
 * When NON_REENTRANT_FPU is not defined, 28 bytes of stack below %ebp hold
 * the FPU_accum_* and FPU_fsqrt_arg_* temporaries.
 */
.text
ENTRY(wm_sqrt)
	pushl	%ebp
	movl	%esp,%ebp
#ifndef NON_REENTRANT_FPU
	subl	$28,%esp
#endif /* NON_REENTRANT_FPU */
	pushl	%esi
	pushl	%edi
	pushl	%ebx

	movl	PARAM1,%esi

	movl	SIGH(%esi),%eax
	movl	SIGL(%esi),%ecx
	xorl	%edx,%edx

/* We use a rough linear estimate for the first guess.. */

	cmpw	EXP_BIAS,EXP(%esi)
	jnz	sqrt_arg_ge_2

	shrl	$1,%eax			/* arg is in the range  [1.0 .. 2.0) */
	rcrl	$1,%ecx
	rcrl	$1,%edx

sqrt_arg_ge_2:
/* From here on, n is never accessed directly again until it is
   replaced by the answer. */

	movl	%eax,FPU_fsqrt_arg_2		/* ms word of n */
	movl	%ecx,FPU_fsqrt_arg_1
	movl	%edx,FPU_fsqrt_arg_0

/* Make a linear first estimate */
	shrl	$1,%eax
	addl	$0x40000000,%eax
	movl	$0xaaaaaaaa,%ecx
	mull	%ecx
	shll	%edx			/* max result was 7fff... */
	testl	$0x80000000,%edx	/* but min was 3fff... */
	jnz	sqrt_prelim_no_adjust

	movl	$0x80000000,%edx	/* round up */

sqrt_prelim_no_adjust:
	movl	%edx,%esi	/* Our first guess */

/* We have now computed (approx)   (2 + x) / 3, which forms the basis
   for a few iterations of Newton's method */

	movl	FPU_fsqrt_arg_2,%ecx	/* ms word */

/*
 * From our initial estimate, three iterations are enough to get us
 * to 30 bits or so. This will then allow two iterations at better
 * precision to complete the process.
 */

/* Compute  (g + n/g)/2  at each iteration (g is the guess). */
	shrl	%ecx		/* Doing this first will prevent a divide */
				/* overflow later. */

	movl	%ecx,%edx	/* msw of the arg / 2 */
	divl	%esi		/* current estimate */
	shrl	%esi		/* divide by 2 */
	addl	%eax,%esi	/* the new estimate */

	movl	%ecx,%edx
	divl	%esi
	shrl	%esi
	addl	%eax,%esi

	movl	%ecx,%edx
	divl	%esi
	shrl	%esi
	addl	%eax,%esi

/*
 * Now that an estimate accurate to about 30 bits has been obtained (in %esi),
 * we improve it to 60 bits or so.
 *
 * The strategy from now on is to compute new estimates from
 *      guess := guess + (n - guess^2) / (2 * guess)
 */

/* First, find the square of the guess */
	movl	%esi,%eax
	mull	%esi
/* guess^2 now in %edx:%eax */

	movl	FPU_fsqrt_arg_1,%ecx
	subl	%ecx,%eax
	movl	FPU_fsqrt_arg_2,%ecx	/* ms word of normalized n */
	sbbl	%ecx,%edx
	jnc	sqrt_stage_2_positive

/* Subtraction gives a negative result,
   negate the result before division. */
	notl	%edx
	notl	%eax
	addl	$1,%eax
	adcl	$0,%edx

	divl	%esi
	movl	%eax,%ecx

	movl	%edx,%eax
	divl	%esi
	jmp	sqrt_stage_2_finish

sqrt_stage_2_positive:
	divl	%esi
	movl	%eax,%ecx

	movl	%edx,%eax
	divl	%esi

	/* guess^2 exceeded n, so the correction must be subtracted: negate it */
	notl	%ecx
	notl	%eax
	addl	$1,%eax
	adcl	$0,%ecx

sqrt_stage_2_finish:
	sarl	$1,%ecx		/* divide by 2 */
	rcrl	$1,%eax

	/* Form the new estimate in %esi:%edi */
	movl	%eax,%edi
	addl	%ecx,%esi

	jnz	sqrt_stage_2_done	/* result should be [1..2) */

#ifdef PARANOID
/* It should be possible to get here only if the arg is ffff....ffff */
	cmp	$0xffffffff,FPU_fsqrt_arg_1
	jnz	sqrt_stage_2_error
#endif /* PARANOID */

/* The best rounded result. */
	xorl	%eax,%eax
	decl	%eax
	movl	%eax,%edi
	movl	%eax,%esi
	movl	$0x7fffffff,%eax
	jmp	sqrt_round_result

#ifdef PARANOID
sqrt_stage_2_error:
	pushl	EX_INTERNAL|0x213
	call	EXCEPTION
#endif /* PARANOID */

sqrt_stage_2_done:

/* Now the square root has been computed to better than 60 bits. */

/* Find the square of the guess. */
	movl	%edi,%eax		/* ls word of guess */
	mull	%edi
	movl	%edx,FPU_accum_1

	movl	%esi,%eax
	mull	%esi
	movl	%edx,FPU_accum_3
	movl	%eax,FPU_accum_2

	movl	%edi,%eax
	mull	%esi
	addl	%eax,FPU_accum_1
	adcl	%edx,FPU_accum_2
	adcl	$0,FPU_accum_3

/* The cross product is needed twice; %edx:%eax still hold it, so the
   multiply below is not repeated. */
/*	movl	%esi,%eax */
/*	mull	%edi */
	addl	%eax,FPU_accum_1
	adcl	%edx,FPU_accum_2
	adcl	$0,FPU_accum_3

/* guess^2 now in FPU_accum_3:FPU_accum_2:FPU_accum_1 */

	movl	FPU_fsqrt_arg_0,%eax		/* get normalized n */
	subl	%eax,FPU_accum_1
	movl	FPU_fsqrt_arg_1,%eax
	sbbl	%eax,FPU_accum_2
	movl	FPU_fsqrt_arg_2,%eax		/* ms word of normalized n */
	sbbl	%eax,FPU_accum_3
	jnc	sqrt_stage_3_positive

/* Subtraction gives a negative result,
   negate the result before division */
	notl	FPU_accum_1
	notl	FPU_accum_2
	notl	FPU_accum_3
	addl	$1,FPU_accum_1
	adcl	$0,FPU_accum_2

#ifdef PARANOID
	adcl	$0,FPU_accum_3	/* This must be zero */
	jz	sqrt_stage_3_no_error

sqrt_stage_3_error:
	pushl	EX_INTERNAL|0x207
	call	EXCEPTION

sqrt_stage_3_no_error:
#endif /* PARANOID */

	movl	FPU_accum_2,%edx
	movl	FPU_accum_1,%eax
	divl	%esi
	movl	%eax,%ecx

	movl	%edx,%eax
	divl	%esi

	sarl	$1,%ecx		/* divide by 2 */
	rcrl	$1,%eax

	/* prepare to round the result */

	addl	%ecx,%edi
	adcl	$0,%esi

	jmp	sqrt_stage_3_finished

sqrt_stage_3_positive:
	movl	FPU_accum_2,%edx
	movl	FPU_accum_1,%eax
	divl	%esi
	movl	%eax,%ecx

	movl	%edx,%eax
	divl	%esi

	sarl	$1,%ecx		/* divide by 2 */
	rcrl	$1,%eax

	/* prepare to round the result */

	notl	%eax		/* Negate the correction term */
	notl	%ecx
	addl	$1,%eax
	adcl	$0,%ecx		/* carry here ==> correction == 0 */
	adcl	$0xffffffff,%esi

	addl	%ecx,%edi
	adcl	$0,%esi

sqrt_stage_3_finished:

/*
 * The result in %esi:%edi:%eax should be good to about 90 bits here,
 * and the rounding information here does not have sufficient accuracy
 * in a few rare cases.
 */
	cmpl	$0xffffffe0,%eax
	ja	sqrt_near_exact_x

	cmpl	$0x00000020,%eax
	jb	sqrt_near_exact

	cmpl	$0x7fffffe0,%eax
	jb	sqrt_round_result

	cmpl	$0x80000020,%eax
	jb	sqrt_get_more_precision

sqrt_round_result:
/* Set up for rounding operations */
	movl	%eax,%edx
	movl	%esi,%eax
	movl	%edi,%ebx
	movl	PARAM1,%edi
	movw	EXP_BIAS,EXP(%edi)	/* Result is in  [1.0 .. 2.0) */
	jmp	fpu_reg_round


sqrt_near_exact_x:
/* First, the estimate must be rounded up. */
	addl	$1,%edi
	adcl	$0,%esi

sqrt_near_exact:
/*
 * This is an easy case because x^1/2 is monotonic.
 * We need just find the square of our estimate, compare it
 * with the argument, and deduce whether our estimate is
 * above, below, or exact. We use the fact that the estimate
 * is known to be accurate to about 90 bits.
 */
	movl	%edi,%eax		/* ls word of guess */
	mull	%edi
	movl	%edx,%ebx		/* 2nd ls word of square */
	movl	%eax,%ecx		/* ls word of square */

	movl	%edi,%eax
	mull	%esi
	addl	%eax,%ebx
	addl	%eax,%ebx

#ifdef PARANOID
	cmp	$0xffffffb0,%ebx
	jb	sqrt_near_exact_ok

	cmp	$0x00000050,%ebx
	ja	sqrt_near_exact_ok

	pushl	EX_INTERNAL|0x214
	call	EXCEPTION

sqrt_near_exact_ok:
#endif /* PARANOID */

	or	%ebx,%ebx
	js	sqrt_near_exact_small

	jnz	sqrt_near_exact_large

/* NOTE(review): %edx here is the high half of the last mull, whereas the
   parallel test in sqrt_get_more_precision examines %ecx (the ls word of
   the square) -- confirm which register is intended. */
	or	%ebx,%edx
	jnz	sqrt_near_exact_large

/* Our estimate is exactly the right answer */
	xorl	%eax,%eax
	jmp	sqrt_round_result

sqrt_near_exact_small:
/* Our estimate is too small */
	movl	$0x000000ff,%eax
	jmp	sqrt_round_result

sqrt_near_exact_large:
/* Our estimate is too large, we need to decrement it */
	subl	$1,%edi
	sbbl	$0,%esi
	movl	$0xffffff00,%eax
	jmp	sqrt_round_result


sqrt_get_more_precision:
/* This case is almost the same as the above, except we start
   with an extra bit of precision in the estimate. */
	stc			/* The extra bit. */
	rcll	$1,%edi		/* Shift the estimate left one bit */
	rcll	$1,%esi

	movl	%edi,%eax		/* ls word of guess */
	mull	%edi
	movl	%edx,%ebx		/* 2nd ls word of square */
	movl	%eax,%ecx		/* ls word of square */

	movl	%edi,%eax
	mull	%esi
	addl	%eax,%ebx
	addl	%eax,%ebx

/* Put our estimate back to its original value */
	stc			/* The ms bit. */
	rcrl	$1,%esi		/* Shift the estimate right one bit */
	rcrl	$1,%edi

#ifdef PARANOID
	cmp	$0xffffff60,%ebx
	jb	sqrt_more_prec_ok

	cmp	$0x000000a0,%ebx
	ja	sqrt_more_prec_ok

	pushl	EX_INTERNAL|0x215
	call	EXCEPTION

sqrt_more_prec_ok:
#endif /* PARANOID */

	or	%ebx,%ebx
	js	sqrt_more_prec_small

	jnz	sqrt_more_prec_large

	or	%ebx,%ecx
	jnz	sqrt_more_prec_large

/* Our estimate is exactly the right answer */
	movl	$0x80000000,%eax
	jmp	sqrt_round_result

sqrt_more_prec_small:
/* Our estimate is too small */
	movl	$0x800000ff,%eax
	jmp	sqrt_round_result

sqrt_more_prec_large:
/* Our estimate is too large */
	movl	$0x7fffff00,%eax
	jmp	sqrt_round_result
+# + +obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o + +obj-$(CONFIG_DISCONTIGMEM) += discontig.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_HIGHMEM) += highmem.o +obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c new file mode 100644 index 000000000000..523b30634e0a --- /dev/null +++ b/arch/i386/mm/boot_ioremap.c @@ -0,0 +1,97 @@ +/* + * arch/i386/mm/boot_ioremap.c + * + * Re-map functions for early boot-time before paging_init() when the + * boot-time pagetables are still in use + * + * Written by Dave Hansen <haveblue@us.ibm.com> + */ + + +/* + * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE + * keeps that from happening. If anyone has a better way, I'm listening. + * + * boot_pte_t is defined only if this all works correctly + */ + +#include <linux/config.h> +#undef CONFIG_X86_PAE +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <linux/init.h> +#include <linux/stddef.h> + +/* + * I'm cheating here. It is known that the two boot PTE pages are + * allocated next to each other. I'm pretending that they're just + * one big array. 
+ */ + +#define BOOT_PTE_PTRS (PTRS_PER_PTE*2) +#define boot_pte_index(address) \ + (((address) >> PAGE_SHIFT) & (BOOT_PTE_PTRS - 1)) + +static inline boot_pte_t* boot_vaddr_to_pte(void *address) +{ + boot_pte_t* boot_pg = (boot_pte_t*)pg0; + return &boot_pg[boot_pte_index((unsigned long)address)]; +} + +/* + * This is only for a caller who is clever enough to page-align + * phys_addr and virtual_source, and who also has a preference + * about which virtual address from which to steal ptes + */ +static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, + void* virtual_source) +{ + boot_pte_t* pte; + int i; + char *vaddr = virtual_source; + + pte = boot_vaddr_to_pte(virtual_source); + for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) { + set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL)); + __flush_tlb_one(&vaddr[i*PAGE_SIZE]); + } +} + +/* the virtual space we're going to remap comes from this array */ +#define BOOT_IOREMAP_PAGES 4 +#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE) +static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE] + __attribute__ ((aligned (PAGE_SIZE))); + +/* + * This only applies to things which need to ioremap before paging_init() + * bt_ioremap() and plain ioremap() are both useless at this point. + * + * When used, we're still using the boot-time pagetables, which only + * have 2 PTE pages mapping the first 8MB + * + * There is no unmap. The boot-time PTE pages aren't used after boot. + * If you really want the space back, just remap it yourself. 
+ * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE) + */ +__init void* boot_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long last_addr, offset; + unsigned int nrpages; + + last_addr = phys_addr + size - 1; + + /* page align the requested address */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + nrpages = size >> PAGE_SHIFT; + if (nrpages > BOOT_IOREMAP_PAGES) + return NULL; + + __boot_ioremap(phys_addr, nrpages, boot_ioremap_space); + + return &boot_ioremap_space[offset]; +} diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c new file mode 100644 index 000000000000..1726b4096b10 --- /dev/null +++ b/arch/i386/mm/discontig.c @@ -0,0 +1,383 @@ +/* + * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation + * August 2002: added remote node KVA remap - Martin J. Bligh + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/bootmem.h> +#include <linux/mmzone.h> +#include <linux/highmem.h> +#include <linux/initrd.h> +#include <linux/nodemask.h> +#include <asm/e820.h> +#include <asm/setup.h> +#include <asm/mmzone.h> +#include <bios_ebda.h> + +struct pglist_data *node_data[MAX_NUMNODES]; +bootmem_data_t node0_bdata; + +/* + * numa interface - we expect the numa architecture specific code to have + * populated the following initialisation. + * + * 1) node_online_map - the map of all nodes configured (online) in the system + * 2) physnode_map - the mapping between a pfn and owning node + * 3) node_start_pfn - the starting page frame number for a node + * 4) node_end_pfn - the ending page frame number for a node + */ + +/* + * physnode_map keeps track of the physical memory layout of a generic + * numa node on a 256MB break (each element of the array will + * represent 256MB of memory and will be marked by the node id). So, + * if the first gig is on node 0, and the second gig is on node 1 + * physnode_map will contain: + * + * physnode_map[0-3] = 0; + * physnode_map[4-7] = 1; + * physnode_map[8- ] = -1; + */ +s8 physnode_map[MAX_ELEMENTS] = { [0 ... 
(MAX_ELEMENTS - 1)] = -1}; + +void memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", + nid, start, end); + printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); + printk(KERN_DEBUG " "); + for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { + physnode_map[pfn / PAGES_PER_ELEMENT] = nid; + printk("%ld ", pfn); + } + printk("\n"); +} + +unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long nr_pages = end_pfn - start_pfn; + + if (!nr_pages) + return 0; + + return (nr_pages + 1) * sizeof(struct page); +} + +unsigned long node_start_pfn[MAX_NUMNODES]; +unsigned long node_end_pfn[MAX_NUMNODES]; + +extern unsigned long find_max_low_pfn(void); +extern void find_max_pfn(void); +extern void one_highpage_init(struct page *, int, int); + +extern struct e820map e820; +extern unsigned long init_pg_tables_end; +extern unsigned long highend_pfn, highstart_pfn; +extern unsigned long max_low_pfn; +extern unsigned long totalram_pages; +extern unsigned long totalhigh_pages; + +#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) + +unsigned long node_remap_start_pfn[MAX_NUMNODES]; +unsigned long node_remap_size[MAX_NUMNODES]; +unsigned long node_remap_offset[MAX_NUMNODES]; +void *node_remap_start_vaddr[MAX_NUMNODES]; +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +/* + * FLAT - support for basic PC memory model with discontig enabled, essentially + * a single node with all available processors in it with a flat + * memory map. + */ +int __init get_memcfg_numa_flat(void) +{ + printk("NUMA - single node, flat memory mode\n"); + + /* Run the memory configuration and find the top of memory. */ + find_max_pfn(); + node_start_pfn[0] = 0; + node_end_pfn[0] = max_pfn; + memory_present(0, 0, max_pfn); + + /* Indicate there is one node available. 
*/ + nodes_clear(node_online_map); + node_set_online(0); + return 1; +} + +/* + * Find the highest page frame number we have available for the node + */ +static void __init find_max_pfn_node(int nid) +{ + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + /* + * if a user has given mem=XXXX, then we need to make sure + * that the node _starts_ before that, too, not just ends + */ + if (node_start_pfn[nid] > max_pfn) + node_start_pfn[nid] = max_pfn; + if (node_start_pfn[nid] > node_end_pfn[nid]) + BUG(); +} + +/* + * Allocate memory for the pg_data_t for this node via a crude pre-bootmem + * method. For node zero take this from the bottom of memory, for + * subsequent nodes place them at node_remap_start_vaddr which contains + * node local data in physically node local memory. See setup_memory() + * for details. + */ +static void __init allocate_pgdat(int nid) +{ + if (nid && node_has_online_mem(nid)) + NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; + else { + NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + min_low_pfn += PFN_UP(sizeof(pg_data_t)); + } +} + +void __init remap_numa_kva(void) +{ + void *vaddr; + unsigned long pfn; + int node; + + for_each_online_node(node) { + if (node == 0) + continue; + for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { + vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); + set_pmd_pfn((ulong) vaddr, + node_remap_start_pfn[node] + pfn, + PAGE_KERNEL_LARGE); + } + } +} + +static unsigned long calculate_numa_remap_pages(void) +{ + int nid; + unsigned long size, reserve_pages = 0; + + for_each_online_node(nid) { + if (nid == 0) + continue; + if (!node_remap_size[nid]) + continue; + + /* + * The acpi/srat node info can show hot-add memroy zones + * where memory could be added but not currently present. + */ + if (node_start_pfn[nid] > max_pfn) + continue; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. 
*/ + size = node_remap_size[nid] + sizeof(pg_data_t); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + printk("Reserving %ld pages of KVA for lmem_map of node %d\n", + size, nid); + node_remap_size[nid] = size; + reserve_pages += size; + node_remap_offset[nid] = reserve_pages; + printk("Shrinking node %d from %ld pages to %ld pages\n", + nid, node_end_pfn[nid], node_end_pfn[nid] - size); + node_end_pfn[nid] -= size; + node_remap_start_pfn[nid] = node_end_pfn[nid]; + } + printk("Reserving total of %ld pages for numa KVA remap\n", + reserve_pages); + return reserve_pages; +} + +extern void setup_bootmem_allocator(void); +unsigned long __init setup_memory(void) +{ + int nid; + unsigned long system_start_pfn, system_max_low_pfn; + unsigned long reserve_pages; + + /* + * When mapping a NUMA machine we allocate the node_mem_map arrays + * from node local memory. They are then mapped directly into KVA + * between zone normal and vmalloc space. Calculate the size of + * this space and use it to adjust the boundry between ZONE_NORMAL + * and ZONE_HIGHMEM. 
+ */ + find_max_pfn(); + get_memcfg_numa(); + + reserve_pages = calculate_numa_remap_pages(); + + /* partially used pages are not usable - thus round upwards */ + system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); + + system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages; + printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n", + reserve_pages, max_low_pfn + reserve_pages); + printk("max_pfn = %ld\n", max_pfn); +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > system_max_low_pfn) + highstart_pfn = system_max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(system_max_low_pfn)); + printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", + min_low_pfn, max_low_pfn, highstart_pfn); + + printk("Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + for_each_online_node(nid) { + node_remap_start_vaddr[nid] = pfn_to_kaddr( + (highstart_pfn + reserve_pages) - node_remap_offset[nid]); + allocate_pgdat(nid); + printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) pfn_to_kaddr(highstart_pfn + reserve_pages + - node_remap_offset[nid] + node_remap_size[nid])); + } + printk("High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(highstart_pfn)); + vmalloc_earlyreserve = reserve_pages * PAGE_SIZE; + for_each_online_node(nid) + find_max_pfn_node(nid); + + memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); + NODE_DATA(0)->bdata = &node0_bdata; + setup_bootmem_allocator(); + return max_low_pfn; +} + +void __init zone_sizes_init(void) +{ + int nid; + + /* + * Insert nodes into pgdat_list backward so they appear in order. + * Clobber node 0's links and NULL out pgdat_list before starting. 
+ */ + pgdat_list = NULL; + for (nid = MAX_NUMNODES - 1; nid >= 0; nid--) { + if (!node_online(nid)) + continue; + NODE_DATA(nid)->pgdat_next = pgdat_list; + pgdat_list = NODE_DATA(nid); + } + + for_each_online_node(nid) { + unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; + unsigned long *zholes_size; + unsigned int max_dma; + + unsigned long low = max_low_pfn; + unsigned long start = node_start_pfn[nid]; + unsigned long high = node_end_pfn[nid]; + + max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + + if (node_has_online_mem(nid)){ + if (start > low) { +#ifdef CONFIG_HIGHMEM + BUG_ON(start > high); + zones_size[ZONE_HIGHMEM] = high - start; +#endif + } else { + if (low < max_dma) + zones_size[ZONE_DMA] = low; + else { + BUG_ON(max_dma > low); + BUG_ON(low > high); + zones_size[ZONE_DMA] = max_dma; + zones_size[ZONE_NORMAL] = low - max_dma; +#ifdef CONFIG_HIGHMEM + zones_size[ZONE_HIGHMEM] = high - low; +#endif + } + } + } + + zholes_size = get_zholes_size(nid); + /* + * We let the lmem_map for node 0 be allocated from the + * normal bootmem allocator, but other nodes come from the + * remapped KVA area - mbligh + */ + if (!nid) + free_area_init_node(nid, NODE_DATA(nid), + zones_size, start, zholes_size); + else { + unsigned long lmem_map; + lmem_map = (unsigned long)node_remap_start_vaddr[nid]; + lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1; + lmem_map &= PAGE_MASK; + NODE_DATA(nid)->node_mem_map = (struct page *)lmem_map; + free_area_init_node(nid, NODE_DATA(nid), zones_size, + start, zholes_size); + } + } + return; +} + +void __init set_highmem_pages_init(int bad_ppro) +{ +#ifdef CONFIG_HIGHMEM + struct zone *zone; + + for_each_zone(zone) { + unsigned long node_pfn, node_high_size, zone_start_pfn; + struct page * zone_mem_map; + + if (!is_highmem(zone)) + continue; + + printk("Initializing %s for node %d\n", zone->name, + zone->zone_pgdat->node_id); + + node_high_size = zone->spanned_pages; + zone_mem_map = zone->zone_mem_map; + zone_start_pfn 
/*
 * Try to recover from a fault using the exception tables.
 *
 * Looks up regs->eip with search_exception_tables(); when an entry exists,
 * regs->eip is redirected to that entry's fixup address.
 *
 * Returns 1 if a fixup was applied, 0 if no entry matched.
 */
int fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;

#ifdef CONFIG_PNPBIOS
	/*
	 * Compare the faulting code selector, with its low four bits masked
	 * off, against the selector for GDT_ENTRY_PNPBIOS_BASE.
	 * NOTE(review): presumably this identifies a fault taken while
	 * executing PnP BIOS code -- confirm against the GDT layout.
	 */
	if (unlikely((regs->xcs & ~15) == (GDT_ENTRY_PNPBIOS_BASE << 3)))
	{
		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
		extern u32 pnp_bios_is_utter_crap;
		pnp_bios_is_utter_crap = 1;
		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
		/*
		 * Load the saved stack pointer and jump to the saved recovery
		 * address; control does not come back here.
		 */
		__asm__ volatile(
			"movl %0, %%esp\n\t"
			"jmp *%1\n\t"
			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
		panic("do_trap: can't hit this");
	}
#endif

	fixup = search_exception_tables(regs->eip);
	if (fixup) {
		/* Resume execution at the registered fixup handler. */
		regs->eip = fixup->fixup;
		return 1;
	}

	return 0;
}
+#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/kdebug.h> + +extern void die(const char *,struct pt_regs *,long); + +/* + * Unlock any spinlocks which will prevent us from getting the + * message out + */ +void bust_spinlocks(int yes) +{ + int loglevel_save = console_loglevel; + + if (yes) { + oops_in_progress = 1; + return; + } +#ifdef CONFIG_VT + unblank_screen(); +#endif + oops_in_progress = 0; + /* + * OK, the message is on the console. Now we call printk() + * without oops_in_progress set so that printk will give klogd + * a poke. Hold onto your hats... + */ + console_loglevel = 15; /* NMI oopser may have shut the console up */ + printk(" "); + console_loglevel = loglevel_save; +} + +/* + * Return EIP plus the CS segment base. The segment limit is also + * adjusted, clamped to the kernel/user address space (whichever is + * appropriate), and returned in *eip_limit. + * + * The segment is checked, because it might have been changed by another + * task between the original faulting instruction and here. + * + * If CS is no longer a valid code segment, or if EIP is beyond the + * limit, or if it is a kernel address when CS is not a kernel segment, + * then the returned value will be greater than *eip_limit. + * + * This is slow, but is very rarely executed. + */ +static inline unsigned long get_segment_eip(struct pt_regs *regs, + unsigned long *eip_limit) +{ + unsigned long eip = regs->eip; + unsigned seg = regs->xcs & 0xffff; + u32 seg_ar, seg_limit, base, *desc; + + /* The standard kernel/user address space limit. */ + *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + + /* Unlikely, but must come before segment checks. */ + if (unlikely((regs->eflags & VM_MASK) != 0)) + return eip + (seg << 4); + + /* By far the most common cases. 
*/ + if (likely(seg == __USER_CS || seg == __KERNEL_CS)) + return eip; + + /* Check the segment exists, is within the current LDT/GDT size, + that kernel/user (ring 0..3) has the appropriate privilege, + that it's a code segment, and get the limit. */ + __asm__ ("larl %3,%0; lsll %3,%1" + : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); + if ((~seg_ar & 0x9800) || eip > seg_limit) { + *eip_limit = 0; + return 1; /* So that returned eip > *eip_limit. */ + } + + /* Get the GDT/LDT descriptor base. + When you look for races in this code remember that + LDT and other horrors are only used in user space. */ + if (seg & (1<<2)) { + /* Must lock the LDT while reading it. */ + down(¤t->mm->context.sem); + desc = current->mm->context.ldt; + desc = (void *)desc + (seg & ~7); + } else { + /* Must disable preemption while reading the GDT. */ + desc = (u32 *)&per_cpu(cpu_gdt_table, get_cpu()); + desc = (void *)desc + (seg & ~7); + } + + /* Decode the code segment base from the descriptor */ + base = get_desc_base((unsigned long *)desc); + + if (seg & (1<<2)) { + up(¤t->mm->context.sem); + } else + put_cpu(); + + /* Adjust EIP and segment limit, and clamp at the kernel limit. + It's legitimate for segments to wrap at 0xffffffff. */ + seg_limit += base; + if (seg_limit < *eip_limit && seg_limit >= base) + *eip_limit = seg_limit; + return eip + base; +} + +/* + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. 
+ */ +static int __is_prefetch(struct pt_regs *regs, unsigned long addr) +{ + unsigned long limit; + unsigned long instr = get_segment_eip (regs, &limit); + int scan_more = 1; + int prefetch = 0; + int i; + + for (i = 0; scan_more && i < 15; i++) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (instr > limit) + break; + if (__get_user(opcode, (unsigned char *) instr)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ + scan_more = ((instr_lo & 7) == 0x6); + break; + + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + if (instr > limit) + break; + if (__get_user(opcode, (unsigned char *) instr)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 6)) { + /* Catch an obscure case of prefetch inside an NX page. */ + if (nx_enabled && (error_code & 16)) + return 0; + return __is_prefetch(regs, addr); + } + return 0; +} + +fastcall void do_invalid_op(struct pt_regs *, unsigned long); + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. 
+ * + * error_code: + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + */ +fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct * vma; + unsigned long address; + unsigned long page; + int write; + siginfo_t info; + + /* get the address */ + __asm__("movl %%cr2,%0":"=r" (address)); + + if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + SIGSEGV) == NOTIFY_STOP) + return; + /* It's safe to allow irq's after cr2 has been saved */ + if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) + local_irq_enable(); + + tsk = current; + + info.si_code = SEGV_MAPERR; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 1) == 0. + */ + if (unlikely(address >= TASK_SIZE)) { + if (!(error_code & 5)) + goto vmalloc_fault; + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + mm = tsk->mm; + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault.. + */ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; + + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. 
Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & 4) == 0 && + !search_exception_tables(regs->eip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & 4) { + /* + * accessing the stack below %esp is always a bug. + * The "+ 32" is there due to some instructions (like + * pusha) doing post-decrement on the stack and that + * doesn't show up until later.. + */ + if (address + 32 < regs->esp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. 
+ */ +good_area: + info.si_code = SEGV_ACCERR; + write = 0; + switch (error_code & 3) { + default: /* 3: write, present */ +#ifdef TEST_VERIFY_AREA + if (regs->cs == KERNEL_CS) + printk("WP fault at %08lx\n", regs->eip); +#endif + /* fall through */ + case 2: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case 1: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto bad_area; + } + + survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + switch (handle_mm_fault(mm, vma, address, write)) { + case VM_FAULT_MINOR: + tsk->min_flt++; + break; + case VM_FAULT_MAJOR: + tsk->maj_flt++; + break; + case VM_FAULT_SIGBUS: + goto do_sigbus; + case VM_FAULT_OOM: + goto out_of_memory; + default: + BUG(); + } + + /* + * Did it hit the DOS screen memory VA from vm86 mode? + */ + if (regs->eflags & VM_MASK) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & 4) { + /* + * Valid to do another page fault here because this one came + * from user space. 
+ */ + if (is_prefetch(regs, address, error_code)) + return; + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + info.si_signo = SIGSEGV; + info.si_errno = 0; + /* info.si_code has been set above */ + info.si_addr = (void __user *)address; + force_sig_info(SIGSEGV, &info, tsk); + return; + } + +#ifdef CONFIG_X86_F00F_BUG + /* + * Pentium F0 0F C7 C8 bug workaround. + */ + if (boot_cpu_data.f00f_bug) { + unsigned long nr; + + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return; + } + } +#endif + +no_context: + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + */ + if (is_prefetch(regs, address, error_code)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + + bust_spinlocks(1); + +#ifdef CONFIG_X86_PAE + if (error_code & 16) { + pte_t *pte = lookup_address(address); + + if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) + printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid); + } +#endif + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at virtual address %08lx\n",address); + printk(KERN_ALERT " printing eip:\n"); + printk("%08lx\n", regs->eip); + asm("movl %%cr3,%0":"=r" (page)); + page = ((unsigned long *) __va(page))[address >> 22]; + printk(KERN_ALERT "*pde = %08lx\n", page); + /* + * We must not directly access the pte in the highpte + * case, the page table might be allocated in highmem. 
+ * And lets rather not kmap-atomic the pte, just in case + * it's allocated already. + */ +#ifndef CONFIG_HIGHPTE + if (page & 1) { + page &= PAGE_MASK; + address &= 0x003ff000; + page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; + printk(KERN_ALERT "*pte = %08lx\n", page); + } +#endif + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (tsk->pid == 1) { + yield(); + down_read(&mm->mmap_sem); + goto survive; + } + printk("VM: killing process %s\n", tsk->comm); + if (error_code & 4) + do_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & 4)) + goto no_context; + + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address, error_code)) + return; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, tsk); + return; + +vmalloc_fault: + { + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "tsk" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + int index = pgd_index(address); + unsigned long pgd_paddr; + pgd_t *pgd, *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + pte_t *pte_k; + + asm("movl %%cr3,%0":"=r" (pgd_paddr)); + pgd = index + (pgd_t *)__va(pgd_paddr); + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + goto no_context; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. 
+ */ + + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + goto no_context; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + goto no_context; + set_pmd(pmd, *pmd_k); + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + goto no_context; + return; + } +} diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c new file mode 100644 index 000000000000..fc4c4cad4e98 --- /dev/null +++ b/arch/i386/mm/highmem.c @@ -0,0 +1,89 @@ +#include <linux/highmem.h> + +void *kmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_high(page); +} + +void kunmap(struct page *page) +{ + if (in_interrupt()) + BUG(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap it is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. 
+ */ +void *kmap_atomic(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + inc_preempt_count(); + if (!PageHighMem(page)) + return page_address(page); + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#ifdef CONFIG_DEBUG_HIGHMEM + if (!pte_none(*(kmap_pte-idx))) + BUG(); +#endif + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + __flush_tlb_one(vaddr); + + return (void*) vaddr; +} + +void kunmap_atomic(void *kvaddr, enum km_type type) +{ +#ifdef CONFIG_DEBUG_HIGHMEM + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + if (vaddr < FIXADDR_START) { // FIXME + dec_preempt_count(); + preempt_check_resched(); + return; + } + + if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) + BUG(); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + __flush_tlb_one(vaddr); +#endif + + dec_preempt_count(); + preempt_check_resched(); +} + +struct page *kmap_atomic_to_page(void *ptr) +{ + unsigned long idx, vaddr = (unsigned long)ptr; + pte_t *pte; + + if (vaddr < FIXADDR_START) + return virt_to_page(ptr); + + idx = virt_to_fix(vaddr); + pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + return pte_page(*pte); +} + diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c new file mode 100644 index 000000000000..a8c45143088b --- /dev/null +++ b/arch/i386/mm/hugetlbpage.c @@ -0,0 +1,431 @@ +/* + * IA-32 Huge TLB Page Support for Kernel. 
+ * + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/config.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/sysctl.h> +#include <asm/mman.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + pmd = pmd_alloc(mm, pud, addr); + return (pte_t *) pmd; +} + +static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, addr); + pud = pud_offset(pgd, addr); + pmd = pmd_offset(pud, addr); + return (pte_t *) pmd; +} + +static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) +{ + pte_t entry; + + add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + if (write_access) { + entry = + pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } else + entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); + entry = pte_mkyoung(entry); + mk_pte_huge(entry); + set_pte(page_table, entry); +} + +/* + * This function checks for proper alignment of input addr and len parameters. 
+ */ +int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +{ + if (len & ~HPAGE_MASK) + return -EINVAL; + if (addr & ~HPAGE_MASK) + return -EINVAL; + return 0; +} + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + + while (addr < end) { + dst_pte = huge_pte_alloc(dst, addr); + if (!dst_pte) + goto nomem; + src_pte = huge_pte_offset(src, addr); + entry = *src_pte; + ptepage = pte_page(entry); + get_page(ptepage); + set_pte(dst_pte, entry); + add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + addr += HPAGE_SIZE; + } + return 0; + +nomem: + return -ENOMEM; +} + +int +follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, int *length, int i) +{ + unsigned long vpfn, vaddr = *position; + int remainder = *length; + + WARN_ON(!is_vm_hugetlb_page(vma)); + + vpfn = vaddr/PAGE_SIZE; + while (vaddr < vma->vm_end && remainder) { + + if (pages) { + pte_t *pte; + struct page *page; + + pte = huge_pte_offset(mm, vaddr); + + /* hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageCompound(page)); + + get_page(page); + pages[i] = page; + } + + if (vmas) + vmas[i] = vma; + + vaddr += PAGE_SIZE; + ++vpfn; + --remainder; + ++i; + } + + *length = remainder; + *position = vaddr; + + return i; +} + +#if 0 /* This is just for testing */ +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + unsigned long start = address; + int length = 1; + int nr; + struct page *page; + struct vm_area_struct *vma; + + vma = find_vma(mm, addr); + if (!vma || !is_vm_hugetlb_page(vma)) + return ERR_PTR(-EINVAL); + + pte = huge_pte_offset(mm, address); + + /* 
hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageCompound(page)); + + return page; +} + +int pmd_huge(pmd_t pmd) +{ + return 0; +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + return NULL; +} + +#else + +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + return ERR_PTR(-EINVAL); +} + +int pmd_huge(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PSE); +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pmd); + if (page) + page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); + return page; +} +#endif + +void unmap_hugepage_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t pte, *ptep; + struct page *page; + + BUG_ON(start & (HPAGE_SIZE - 1)); + BUG_ON(end & (HPAGE_SIZE - 1)); + + for (address = start; address < end; address += HPAGE_SIZE) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_none(pte)) + continue; + page = pte_page(pte); + put_page(page); + } + add_mm_counter(mm ,rss, -((end - start) >> PAGE_SHIFT)); + flush_tlb_range(vma, start, end); +} + +int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +{ + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret = 0; + + BUG_ON(vma->vm_start & ~HPAGE_MASK); + BUG_ON(vma->vm_end & ~HPAGE_MASK); + + spin_lock(&mm->page_table_lock); + for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + unsigned long idx; + pte_t *pte = huge_pte_alloc(mm, addr); + struct page *page; + + if (!pte) { + ret = -ENOMEM; + goto out; + } + + if (!pte_none(*pte)) { + pmd_t *pmd = (pmd_t *) pte; + + 
page = pmd_page(*pmd); + pmd_clear(pmd); + mm->nr_ptes--; + dec_page_state(nr_page_table_pages); + page_cache_release(page); + } + + idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + page = find_get_page(mapping, idx); + if (!page) { + /* charge the fs quota first */ + if (hugetlb_get_quota(mapping)) { + ret = -ENOMEM; + goto out; + } + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + ret = -ENOMEM; + goto out; + } + ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); + if (! ret) { + unlock_page(page); + } else { + hugetlb_put_quota(mapping); + free_huge_page(page); + goto out; + } + } + set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + } +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/* x86_64 also uses this file */ + +#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + + start_addr = mm->free_area_cache; + +full_search: + addr = ALIGN(start_addr, HPAGE_SIZE); + + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. 
+ */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = TASK_UNMAPPED_BASE; + goto full_search; + } + return -ENOMEM; + } + if (!vma || addr + len <= vma->vm_start) { + mm->free_area_cache = addr + len; + return addr; + } + addr = ALIGN(vma->vm_end, HPAGE_SIZE); + } +} + +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr0, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev_vma; + unsigned long base = mm->mmap_base, addr = addr0; + int first_time = 1; + + /* don't allow allocations above current base */ + if (mm->free_area_cache > base) + mm->free_area_cache = base; + +try_again: + /* make sure it can fit in the remaining address space */ + if (mm->free_area_cache < len) + goto fail; + + /* either no address requested or cant fit in requested address hole */ + addr = (mm->free_area_cache - len) & HPAGE_MASK; + do { + /* + * Lookup failure means no vma is above this address, + * i.e. return with success: + */ + if (!(vma = find_vma_prev(mm, addr, &prev_vma))) + return addr; + + /* + * new region fits between prev_vma->vm_end and + * vma->vm_start, use it: + */ + if (addr + len <= vma->vm_start && + (!prev_vma || (addr >= prev_vma->vm_end))) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + else + /* pull free_area_cache down to the first hole */ + if (mm->free_area_cache == vma->vm_end) + mm->free_area_cache = vma->vm_start; + + /* try just below the current vma->vm_start */ + addr = (vma->vm_start - len) & HPAGE_MASK; + } while (len <= vma->vm_start); + +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + */ + if (first_time) { + mm->free_area_cache = base; + first_time = 0; + goto try_again; + } + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. 
This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = hugetlb_get_unmapped_area_bottomup(file, addr0, + len, pgoff, flags); + + /* + * Restore the topdown base: + */ + mm->free_area_cache = base; + + return addr; +} + +unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (len & ~HPAGE_MASK) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + addr = ALIGN(addr, HPAGE_SIZE); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + if (mm->get_unmapped_area == arch_get_unmapped_area) + return hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + return hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); +} + +#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ + diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c new file mode 100644 index 000000000000..7a7ea3737265 --- /dev/null +++ b/arch/i386/mm/init.c @@ -0,0 +1,696 @@ +/* + * linux/arch/i386/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/proc_fs.h> +#include <linux/efi.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include 
<asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> + +unsigned int __VMALLOC_RESERVE = 128 << 20; + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +unsigned long highstart_pfn, highend_pfn; + +static int noinline do_test_wp_bit(void); + +/* + * Creates a middle page table and puts a pointer to it in the + * given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t * __init one_md_table_init(pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + if (pmd_table != pmd_offset(pud, 0)) + BUG(); +#else + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); +#endif + + return pmd_table; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry. + */ +static pte_t * __init one_page_table_init(pmd_t *pmd) +{ + if (pmd_none(*pmd)) { + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + if (page_table != pte_offset_kernel(pmd, 0)) + BUG(); + + return page_table; + } + + return pte_offset_kernel(pmd, 0); +} + +/* + * This function initializes a certain range of kernel virtual memory + * with new bootmem page tables, everywhere page tables are missing in + * the given range. + */ + +/* + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without + * checking the pgd every time. 
+ */ +static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { + if (pgd_none(*pgd)) + one_md_table_init(pgd); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { + if (pmd_none(*pmd)) + one_page_table_init(pmd); + + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +} + +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET. + */ +static void __init kernel_physical_mapping_init(pgd_t *pgd_base) +{ + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int pgd_idx, pmd_idx, pte_ofs; + + pgd_idx = pgd_index(PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + pfn = 0; + + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + if (pfn >= max_low_pfn) + continue; + for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { + unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; + + /* Map with big pages if possible, otherwise create normal page tables. 
*/ + if (cpu_has_pse) { + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(address) || is_kernel_text(address2)) + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + else + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + pfn += PTRS_PER_PTE; + } else { + pte = one_page_table_init(pmd); + + for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { + if (is_kernel_text(address)) + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + else + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + } + } + } + } +} + +static inline int page_kills_ppro(unsigned long pagenr) +{ + if (pagenr >= 0x70000 && pagenr <= 0x7003F) + return 1; + return 0; +} + +extern int is_available_memory(efi_memory_desc_t *); + +static inline int page_is_ram(unsigned long pagenr) +{ + int i; + unsigned long addr, end; + + if (efi_enabled) { + efi_memory_desc_t *md; + + for (i = 0; i < memmap.nr_map; i++) { + md = &memmap.map[i]; + if (!is_available_memory(md)) + continue; + addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; + } + + for (i = 0; i < e820.nr_map; i++) { + + if (e820.map[i].type != E820_RAM) /* not usable memory */ + continue; + /* + * !!!FIXME!!! Some BIOSen report areas as RAM that + * are not. Notably the 640->1Mb area. We need a sanity + * check here. 
+ */ + addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +#ifdef CONFIG_HIGHMEM +pte_t *kmap_pte; +pgprot_t kmap_prot; + +#define kmap_get_fixmap_pte(vaddr) \ + pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) + +static void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* cache the first kmap pte */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; +} + +static void __init permanent_kmaps_init(pgd_t *pgd_base) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long vaddr; + + vaddr = PKMAP_BASE; + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; +} + +void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +{ + if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { + ClearPageReserved(page); + set_bit(PG_highmem, &page->flags); + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; + } else + SetPageReserved(page); +} + +#ifndef CONFIG_DISCONTIGMEM +static void __init set_highmem_pages_init(int bad_ppro) +{ + int pfn; + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) + one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + totalram_pages += totalhigh_pages; +} +#else +extern void set_highmem_pages_init(int); +#endif /* !CONFIG_DISCONTIGMEM */ + +#else +#define kmap_init() do { } while (0) +#define permanent_kmaps_init(pgd_base) do { } while (0) +#define set_highmem_pages_init(bad_ppro) do { } while (0) +#endif /* CONFIG_HIGHMEM */ + +unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; + +#ifndef 
CONFIG_DISCONTIGMEM +#define remap_numa_kva() do {} while (0) +#else +extern void __init remap_numa_kva(void); +#endif + +static void __init pagetable_init (void) +{ + unsigned long vaddr; + pgd_t *pgd_base = swapper_pg_dir; + +#ifdef CONFIG_X86_PAE + int i; + /* Init entries of the first-level page table to the zero page */ + for (i = 0; i < PTRS_PER_PGD; i++) + set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) { + set_in_cr4(X86_CR4_PSE); + } + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __PAGE_KERNEL |= _PAGE_GLOBAL; + __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; + } + + kernel_physical_mapping_init(pgd_base); + remap_numa_kva(); + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + page_table_range_init(vaddr, 0, pgd_base); + + permanent_kmaps_init(pgd_base); + +#ifdef CONFIG_X86_PAE + /* + * Add low memory identity-mappings - SMP needs it when + * starting up on an AP from real-mode. In the non-PAE + * case we already have these mappings through head.S. + * All user-space mappings are explicitly cleared after + * SMP startup. + */ + pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; +#endif +} + +#if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND) +/* + * Swap suspend & friends need this for resume because things like the intel-agp + * driver might have split up a kernel 4MB mapping. + */ +char __nosavedata swsusp_pg_dir[PAGE_SIZE] + __attribute__ ((aligned (PAGE_SIZE))); + +static inline void save_pg_dir(void) +{ + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); +} +#else +static inline void save_pg_dir(void) +{ +} +#endif + +void zap_low_mappings (void) +{ + int i; + + save_pg_dir(); + + /* + * Zap initial low-memory mappings. 
+ * + * Note that "pgd_clear()" doesn't do it for + * us, because pgd_clear() is a no-op on i386. + */ + for (i = 0; i < USER_PTRS_PER_PGD; i++) +#ifdef CONFIG_X86_PAE + set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); +#else + set_pgd(swapper_pg_dir+i, __pgd(0)); +#endif + flush_tlb_all(); +} + +static int disable_nx __initdata = 0; +u64 __supported_pte_mask = ~_PAGE_NX; + +/* + * noexec = on|off + * + * Control non executable mappings. + * + * on Enable + * off Disable + */ +void __init noexec_setup(const char *str) +{ + if (!strncmp(str, "on",2) && cpu_has_nx) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str,"off",3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } +} + +int nx_enabled = 0; +#ifdef CONFIG_X86_PAE + +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} + +/* + * Enables/disables executability of a given kernel page and + * returns the previous setting. + */ +int __init set_kernel_exec(unsigned long vaddr, int enable) +{ + pte_t *pte; + int ret = 1; + + if (!nx_enabled) + goto out; + + pte = lookup_address(vaddr); + BUG_ON(!pte); + + if (!pte_exec_kernel(*pte)) + ret = 0; + + if (enable) + pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); + else + pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); + __flush_tlb_all(); +out: + return ret; +} + +#endif + +/* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. + * + * This routine also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel.
+ */ +void __init paging_init(void) +{ +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk("NX (Execute Disable) protection: active\n"); +#endif + + pagetable_init(); + + load_cr3(swapper_pg_dir); + +#ifdef CONFIG_X86_PAE + /* + * We will bail out later - printk doesn't work right now so + * the user would just see a hanging kernel. + */ + if (cpu_has_pae) + set_in_cr4(X86_CR4_PAE); +#endif + __flush_tlb_all(); + + kmap_init(); +} + +/* + * Test if the WP bit works in supervisor mode. It isn't supported on 386's + * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This + * used to involve black magic jumps to work around some nasty CPU bugs, + * but fortunately the switch to using exceptions got rid of all that. + */ + +static void __init test_wp_bit(void) +{ + printk("Checking if this processor honours the WP bit even in supervisor mode... "); + + /* Any page-aligned address will do, the test is non-destructive */ + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + boot_cpu_data.wp_works_ok = do_test_wp_bit(); + clear_fixmap(FIX_WP_TEST); + + if (!boot_cpu_data.wp_works_ok) { + printk("No.\n"); +#ifdef CONFIG_X86_WP_WORKS_OK + panic("This kernel doesn't support CPU's with broken WP. 
Recompile it for a 386!"); +#endif + } else { + printk("Ok.\n"); + } +} + +static void __init set_max_mapnr_init(void) +{ +#ifdef CONFIG_HIGHMEM + num_physpages = highend_pfn; +#else + num_physpages = max_low_pfn; +#endif +#ifndef CONFIG_DISCONTIGMEM + max_mapnr = num_physpages; +#endif +} + +static struct kcore_list kcore_mem, kcore_vmalloc; + +void __init mem_init(void) +{ + extern int ppro_with_ram_bug(void); + int codesize, reservedpages, datasize, initsize; + int tmp; + int bad_ppro; + +#ifndef CONFIG_DISCONTIGMEM + if (!mem_map) + BUG(); +#endif + + bad_ppro = ppro_with_ram_bug(); + +#ifdef CONFIG_HIGHMEM + /* check that fixmap and pkmap do not overlap */ + if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { + printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); + printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); + BUG(); + } +#endif + + set_max_mapnr_init(); + +#ifdef CONFIG_HIGHMEM + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + + /* this will put all low memory onto the freelists */ + totalram_pages += free_all_bootmem(); + + reservedpages = 0; + for (tmp = 0; tmp < max_low_pfn; tmp++) + /* + * Only count reserved RAM pages + */ + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + reservedpages++; + + set_highmem_pages_init(bad_ppro); + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END-VMALLOC_START); + + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", + (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + 
num_physpages << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10, + (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) + ); + +#ifdef CONFIG_X86_PAE + if (!cpu_has_pae) + panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); +#endif + if (boot_cpu_data.wp_works_ok < 0) + test_wp_bit(); + + /* + * Subtle. SMP is doing its boot stuff late (because it has to + * fork idle threads) - but it also needs low mappings for the + * protected-mode entry to work. We zap these entries only after + * the WP-bit has been tested. + */ +#ifndef CONFIG_SMP + zap_low_mappings(); +#endif +} + +kmem_cache_t *pgd_cache; +kmem_cache_t *pmd_cache; + +void __init pgtable_cache_init(void) +{ + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), + 0, + pmd_ctor, + NULL); + if (!pmd_cache) + panic("pgtable_cache_init(): cannot create pmd cache"); + } + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + PTRS_PER_PGD*sizeof(pgd_t), + 0, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) + panic("pgtable_cache_init(): Cannot create pgd cache"); +} + +/* + * This function cannot be __init, since exceptions don't work in that + * section. Put this after the callers, so that it cannot be inlined.
+ */ +static int noinline do_test_wp_bit(void) +{ + char tmp_reg; + int flag; + + __asm__ __volatile__( + " movb %0,%1 \n" + "1: movb %1,%0 \n" + " xorl %2,%2 \n" + "2: \n" + ".section __ex_table,\"a\"\n" + " .align 4 \n" + " .long 1b,2b \n" + ".previous \n" + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), + "=q" (tmp_reg), + "=r" (flag) + :"2" (1) + :"memory"); + + return flag; +} + +void free_initmem(void) +{ + unsigned long addr; + + addr = (unsigned long)(&__init_begin); + for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + set_page_count(virt_to_page(addr), 1); + memset((void *)addr, 0xcc, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } + printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + if (start < end) + printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); + for (; start < end; start += PAGE_SIZE) { + ClearPageReserved(virt_to_page(start)); + set_page_count(virt_to_page(start), 1); + free_page(start); + totalram_pages++; + } +} +#endif diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c new file mode 100644 index 000000000000..db06f7399913 --- /dev/null +++ b/arch/i386/mm/ioremap.c @@ -0,0 +1,320 @@ +/* + * arch/i386/mm/ioremap.c + * + * Re-map IO memory to kernel address space so that we can access it. 
+ * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <asm/io.h> +#include <asm/fixmap.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> + +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 + +static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, unsigned long phys_addr, unsigned long flags) +{ + pte_t *pte; + unsigned long pfn; + + pfn = phys_addr >> PAGE_SHIFT; + pte = pte_alloc_kernel(&init_mm, pmd, addr); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | + _PAGE_DIRTY | _PAGE_ACCESSED | flags))); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + return 0; +} + +static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, unsigned long phys_addr, unsigned long flags) +{ + pmd_t *pmd; + unsigned long next; + + phys_addr -= addr; + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, flags)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, unsigned long phys_addr, unsigned long flags) +{ + pud_t *pud; + unsigned long next; + + phys_addr -= addr; + pud = pud_alloc(&init_mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, flags)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +static int ioremap_page_range(unsigned long addr, + unsigned long end, unsigned long phys_addr, unsigned long flags) +{ + pgd_t *pgd; + unsigned long 
next; + int err; + + BUG_ON(addr >= end); + flush_cache_all(); + phys_addr -= addr; + pgd = pgd_offset_k(addr); + spin_lock(&init_mm.page_table_lock); + do { + next = pgd_addr_end(addr, end); + err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags); + if (err) + break; + } while (pgd++, addr = next, addr != end); + spin_unlock(&init_mm.page_table_lock); + flush_tlb_all(); + return err; +} + +/* + * Generic mapping function (not visible outside): + */ + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) +{ + void __iomem * addr; + struct vm_struct * area; + unsigned long offset, last_addr; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return (void __iomem *) phys_to_virt(phys_addr); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + if (phys_addr <= virt_to_phys(high_memory - 1)) { + char *t_addr, *t_end; + struct page *page; + + t_addr = __va(phys_addr); + t_end = t_addr + (size - 1); + + for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) + if(!PageReserved(page)) + return NULL; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. 
+ */ + area = get_vm_area(size, VM_IOREMAP | (flags << 20)); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = (void __iomem *) area->addr; + if (ioremap_page_range((unsigned long) addr, + (unsigned long) addr + size, phys_addr, flags)) { + vunmap((void __force *) addr); + return NULL; + } + return (void __iomem *) (offset + (char __iomem *)addr); +} + + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ + +void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) +{ + unsigned long last_addr; + void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); + if (!p) + return p; + + /* Guaranteed to be > phys_addr, as per __ioremap() */ + last_addr = phys_addr + size - 1; + + if (last_addr < virt_to_phys(high_memory) - 1) { + struct page *ppage = virt_to_page(__va(phys_addr)); + unsigned long npages; + + phys_addr &= PAGE_MASK; + + /* This might overflow and become zero.. */ + last_addr = PAGE_ALIGN(last_addr); + + /* .. but that's ok, because modulo-2**n arithmetic will make + * the page-aligned "last - first" come out right. 
+ */ + npages = (last_addr - phys_addr) >> PAGE_SHIFT; + + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { + iounmap(p); + p = NULL; + } + global_flush_tlb(); + } + + return p; +} + +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p; + if ((void __force *) addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if (addr >= phys_to_virt(ISA_START_ADDRESS) && + addr < phys_to_virt(ISA_END_ADDRESS)) + return; + + p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); + if (!p) { + printk("__iounmap: bad address %p\n", addr); + return; + } + + if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { + /* p->size includes the guard page, but cpa doesn't like that */ + change_page_attr(virt_to_page(__va(p->phys_addr)), + p->size >> PAGE_SHIFT, + PAGE_KERNEL); + global_flush_tlb(); + } + kfree(p); +} + +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long offset, last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return phys_to_virt(phys_addr); + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) + return NULL; + + /* + * Ok, go for it.. 
+ */ + idx = FIX_BTMAP_BEGIN; + while (nrpages > 0) { + set_fixmap(idx, phys_addr); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); +} + +void __init bt_iounmap(void *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) + return; + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN; + while (nrpages > 0) { + clear_fixmap(idx); + --idx; + --nrpages; + } +} diff --git a/arch/i386/mm/mmap.c b/arch/i386/mm/mmap.c new file mode 100644 index 000000000000..e4730a1a43dd --- /dev/null +++ b/arch/i386/mm/mmap.c @@ -0,0 +1,76 @@ +/* + * linux/arch/i386/mm/mmap.c + * + * flexible mmap layout support + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Started by Ingo Molnar <mingo@elte.hu> + */ + +#include <linux/personality.h> +#include <linux/mm.h> +#include <linux/random.h> + +/* + * Top of mmap area (just below the process stack). + * + * Leave an at least ~128 MB hole. 
+ */ +#define MIN_GAP (128*1024*1024) +#define MAX_GAP (TASK_SIZE/6*5) + +static inline unsigned long mmap_base(struct mm_struct *mm) +{ + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; + unsigned long random_factor = 0; + + if (current->flags & PF_RANDOMIZE) + random_factor = get_random_int() % (1024*1024); + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(TASK_SIZE - gap - random_factor); +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + /* + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ + if (sysctl_legacy_va_layout || + (current->personality & ADDR_COMPAT_LAYOUT) || + current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(mm); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +} diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c new file mode 100644 index 000000000000..cb3da6baa704 --- /dev/null +++ b/arch/i386/mm/pageattr.c @@ -0,0 +1,221 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. 
+ */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> + +static DEFINE_SPINLOCK(cpa_lock); +static struct list_head df_list = LIST_HEAD_INIT(df_list); + + +pte_t *lookup_address(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + if (pgd_none(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + if (pmd_large(*pmd)) + return (pte_t *)pmd; + return pte_offset_kernel(pmd, address); +} + +static struct page *split_large_page(unsigned long address, pgprot_t prot) +{ + int i; + unsigned long addr; + struct page *base; + pte_t *pbase; + + spin_unlock_irq(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + spin_lock_irq(&cpa_lock); + if (!base) + return NULL; + + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + addr == address ? prot : PAGE_KERNEL); + } + return base; +} + +static void flush_kernel_map(void *dummy) +{ + /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ + if (boot_cpu_data.x86_model >= 4) + asm volatile("wbinvd":::"memory"); + /* Flush all to work around Errata in early athlons regarding + * large page flushing. 
+ */ + __flush_tlb_all(); +} + +static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + struct page *page; + unsigned long flags; + + set_pte_atomic(kpte, pte); /* change init_mm */ + if (PTRS_PER_PMD > 1) + return; + + spin_lock_irqsave(&pgd_lock, flags); + for (page = pgd_list; page; page = (struct page *)page->index) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + set_pte_atomic((pte_t *)pmd, pte); + } + spin_unlock_irqrestore(&pgd_lock, flags); +} + +/* + * No more special protections in this 2/4MB area - revert to a + * large page again. + */ +static inline void revert_page(struct page *kpte_page, unsigned long address) +{ + pte_t *linear = (pte_t *) + pmd_offset(pud_offset(pgd_offset_k(address), address), address); + set_pmd_pte(linear, address, + pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); +} + +static int +__change_page_attr(struct page *page, pgprot_t prot) +{ + pte_t *kpte; + unsigned long address; + struct page *kpte_page; + + BUG_ON(PageHighMem(page)); + address = (unsigned long)page_address(page); + + kpte = lookup_address(address); + if (!kpte) + return -EINVAL; + kpte_page = virt_to_page(kpte); + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { + if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + set_pte_atomic(kpte, mk_pte(page, prot)); + } else { + struct page *split = split_large_page(address, prot); + if (!split) + return -ENOMEM; + set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL)); + kpte_page = split; + } + get_page(kpte_page); + } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); + __put_page(kpte_page); + } else + BUG(); + + /* + * If the pte was reserved, it means it was created at boot + * time (not via split_large_page) and in turn we must not + * replace it with a largepage. 
+ */ + if (!PageReserved(kpte_page)) { + /* memleak and potential failed 2M page regeneration */ + BUG_ON(!page_count(kpte_page)); + + if (cpu_has_pse && (page_count(kpte_page) == 1)) { + list_add(&kpte_page->lru, &df_list); + revert_page(kpte_page, address); + } + } + return 0; +} + +static inline void flush_map(void) +{ + on_each_cpu(flush_kernel_map, NULL, 1, 1); +} + +/* + * Change the page attributes of a page in the linear mapping. + * + * This should be used when a page is mapped with a different caching policy + * than write-back somewhere - some CPUs do not like it when mappings with + * different caching policies exist. This changes the page attributes of the + * in-kernel linear mapping too. + * + * The caller needs to ensure that there are no conflicting mappings elsewhere. + * This function only deals with the kernel linear map. + * + * Caller must call global_flush_tlb() after this. + */ +int change_page_attr(struct page *page, int numpages, pgprot_t prot) +{ + int err = 0; + int i; + unsigned long flags; + + spin_lock_irqsave(&cpa_lock, flags); + for (i = 0; i < numpages; i++, page++) { + err = __change_page_attr(page, prot); + if (err) + break; + } + spin_unlock_irqrestore(&cpa_lock, flags); + return err; +} + +void global_flush_tlb(void) +{ + LIST_HEAD(l); + struct page *pg, *next; + + BUG_ON(irqs_disabled()); + + spin_lock_irq(&cpa_lock); + list_splice_init(&df_list, &l); + spin_unlock_irq(&cpa_lock); + flush_map(); + list_for_each_entry_safe(pg, next, &l, lru) + __free_page(pg); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + /* the return value is ignored - the calls cannot fail, + * large pages are disabled at boot time. + */ + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + /* we should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu.
+ */ + __flush_tlb_all(); +} +#endif + +EXPORT_SYMBOL(change_page_attr); +EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c new file mode 100644 index 000000000000..0742d54f8bb0 --- /dev/null +++ b/arch/i386/mm/pgtable.c @@ -0,0 +1,260 @@ +/* + * linux/arch/i386/mm/pgtable.c + */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> + +#include <asm/system.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +void show_mem(void) +{ + int total = 0, reserved = 0; + int shared = 0, cached = 0; + int highmem = 0; + struct page *page; + pg_data_t *pgdat; + unsigned long i; + + printk("Mem-info:\n"); + show_free_areas(); + printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + for_each_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + page = pgdat->node_mem_map + i; + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + } + printk("%d pages of RAM\n", total); + printk("%d pages of HIGHMEM\n",highmem); + printk("%d reserved pages\n",reserved); + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. 
+ */ +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <pfn,flags> stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte(pfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. + */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk ("set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk ("set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + printk ("set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. 
+ * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + return pte; +} + +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +{ + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * The locking scheme was chosen on the basis of manfred's + * recommendations and having no core impact whatsoever. 
+ * -- wli + */ +DEFINE_SPINLOCK(pgd_lock); +struct page *pgd_list; + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + page->index = (unsigned long)pgd_list; + if (pgd_list) + pgd_list->private = (unsigned long)&page->index; + pgd_list = page; + page->private = (unsigned long)&pgd_list; +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *next, **pprev, *page = virt_to_page(pgd); + next = (struct page *)page->index; + pprev = (struct page **)page->private; + *pprev = next; + if (next) + next->private = (unsigned long)pprev; +} + +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + if (PTRS_PER_PMD == 1) + spin_lock_irqsave(&pgd_lock, flags); + + memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) + return; + + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +} + +/* never called when PTRS_PER_PMD > 1 */ +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; /* can be called from interrupt context */ + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + } + return pgd; + +out_oom: + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pgd_cache, pgd); + return NULL; +} + +void pgd_free(pgd_t *pgd) +{ + int i; + + /* in the PAE case user pgd entries are overwritten before usage */ + if (PTRS_PER_PMD > 1) + for (i = 0; i < 
USER_PTRS_PER_PGD; ++i) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + /* in the non-PAE case, clear_page_range() clears user pgd entries */ + kmem_cache_free(pgd_cache, pgd); +} diff --git a/arch/i386/oprofile/Kconfig b/arch/i386/oprofile/Kconfig new file mode 100644 index 000000000000..5ade19801b97 --- /dev/null +++ b/arch/i386/oprofile/Kconfig @@ -0,0 +1,23 @@ + +menu "Profiling support" + depends on EXPERIMENTAL + +config PROFILING + bool "Profiling support (EXPERIMENTAL)" + help + Say Y here to enable the extended profiling support mechanisms used + by profilers such as OProfile. + + +config OPROFILE + tristate "OProfile system profiling (EXPERIMENTAL)" + depends on PROFILING + help + OProfile is a profiling system capable of profiling the + whole system, include the kernel, kernel modules, libraries, + and applications. + + If unsure, say N. + +endmenu + diff --git a/arch/i386/oprofile/Makefile b/arch/i386/oprofile/Makefile new file mode 100644 index 000000000000..30f3eb366667 --- /dev/null +++ b/arch/i386/oprofile/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_OPROFILE) += oprofile.o + +DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ + oprof.o cpu_buffer.o buffer_sync.o \ + event_buffer.o oprofile_files.o \ + oprofilefs.o oprofile_stats.o \ + timer_int.o ) + +oprofile-y := $(DRIVER_OBJS) init.o backtrace.o +oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ + op_model_ppro.o op_model_p4.o +oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/arch/i386/oprofile/backtrace.c b/arch/i386/oprofile/backtrace.c new file mode 100644 index 000000000000..52d72e074f7f --- /dev/null +++ b/arch/i386/oprofile/backtrace.c @@ -0,0 +1,111 @@ +/** + * @file backtrace.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author David Smith + */ + +#include <linux/oprofile.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <asm/ptrace.h> + +struct frame_head { + 
struct frame_head * ebp; + unsigned long ret; +} __attribute__((packed)); + +static struct frame_head * +dump_backtrace(struct frame_head * head) +{ + oprofile_add_trace(head->ret); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= head->ebp) + return NULL; + + return head->ebp; +} + +/* check that the page(s) containing the frame head are present */ +static int pages_present(struct frame_head * head) +{ + struct mm_struct * mm = current->mm; + + /* FIXME: only necessary once per page */ + if (!check_user_page_readable(mm, (unsigned long)head)) + return 0; + + return check_user_page_readable(mm, (unsigned long)(head + 1)); +} + +/* + * | | /\ Higher addresses + * | | + * --------------- stack base (address of current_thread_info) + * | thread info | + * . . + * | stack | + * --------------- saved regs->ebp value if valid (frame_head address) + * . . + * --------------- struct pt_regs stored on stack (struct pt_regs *) + * | | + * . . 
+ * | | + * --------------- %esp + * | | + * | | \/ Lower addresses + * + * Thus, &pt_regs <-> stack base restricts the valid(ish) ebp values + */ +#ifdef CONFIG_FRAME_POINTER +static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs) +{ + unsigned long headaddr = (unsigned long)head; + unsigned long stack = (unsigned long)regs; + unsigned long stack_base = (stack & ~(THREAD_SIZE - 1)) + THREAD_SIZE; + + return headaddr > stack && headaddr < stack_base; +} +#else +/* without fp, it's just junk */ +static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs) +{ + return 0; +} +#endif + + +void +x86_backtrace(struct pt_regs * const regs, unsigned int depth) +{ + struct frame_head *head; + +#ifdef CONFIG_X86_64 + head = (struct frame_head *)regs->rbp; +#else + head = (struct frame_head *)regs->ebp; +#endif + + if (!user_mode(regs)) { + while (depth-- && valid_kernel_stack(head, regs)) + head = dump_backtrace(head); + return; + } + +#ifdef CONFIG_SMP + if (!spin_trylock(&current->mm->page_table_lock)) + return; +#endif + + while (depth-- && head && pages_present(head)) + head = dump_backtrace(head); + +#ifdef CONFIG_SMP + spin_unlock(&current->mm->page_table_lock); +#endif +} diff --git a/arch/i386/oprofile/init.c b/arch/i386/oprofile/init.c new file mode 100644 index 000000000000..c90332de582b --- /dev/null +++ b/arch/i386/oprofile/init.c @@ -0,0 +1,48 @@ +/** + * @file init.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + */ + +#include <linux/oprofile.h> +#include <linux/init.h> +#include <linux/errno.h> + +/* We support CPUs that have performance counters like the Pentium Pro + * with the NMI mode driver. 
+ */ + +extern int nmi_init(struct oprofile_operations * ops); +extern int nmi_timer_init(struct oprofile_operations * ops); +extern void nmi_exit(void); +extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); + + +int __init oprofile_arch_init(struct oprofile_operations * ops) +{ + int ret; + + ret = -ENODEV; + +#ifdef CONFIG_X86_LOCAL_APIC + ret = nmi_init(ops); +#endif +#ifdef CONFIG_X86_IO_APIC + if (ret < 0) + ret = nmi_timer_init(ops); +#endif + ops->backtrace = x86_backtrace; + + return ret; +} + + +void oprofile_arch_exit(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + nmi_exit(); +#endif +} diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c new file mode 100644 index 000000000000..3492d961d3f1 --- /dev/null +++ b/arch/i386/oprofile/nmi_int.c @@ -0,0 +1,427 @@ +/** + * @file nmi_int.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/oprofile.h> +#include <linux/sysdev.h> +#include <linux/slab.h> +#include <asm/nmi.h> +#include <asm/msr.h> +#include <asm/apic.h> + +#include "op_counter.h" +#include "op_x86_model.h" + +static struct op_x86_model_spec const * model; +static struct op_msrs cpu_msrs[NR_CPUS]; +static unsigned long saved_lvtpc[NR_CPUS]; + +static int nmi_start(void); +static void nmi_stop(void); + +/* 0 == registered but off, 1 == registered and on */ +static int nmi_enabled = 0; + +#ifdef CONFIG_PM + +static int nmi_suspend(struct sys_device *dev, u32 state) +{ + if (nmi_enabled == 1) + nmi_stop(); + return 0; +} + + +static int nmi_resume(struct sys_device *dev) +{ + if (nmi_enabled == 1) + nmi_start(); + return 0; +} + + +static struct sysdev_class oprofile_sysclass = { + set_kset_name("oprofile"), + .resume = nmi_resume, + .suspend = nmi_suspend, +}; + + +static struct sys_device device_oprofile = { + .id = 0, + .cls = 
&oprofile_sysclass, +}; + + +static int __init init_driverfs(void) +{ + int error; + if (!(error = sysdev_class_register(&oprofile_sysclass))) + error = sysdev_register(&device_oprofile); + return error; +} + + +static void exit_driverfs(void) +{ + sysdev_unregister(&device_oprofile); + sysdev_class_unregister(&oprofile_sysclass); +} + +#else +#define init_driverfs() do { } while (0) +#define exit_driverfs() do { } while (0) +#endif /* CONFIG_PM */ + + +static int nmi_callback(struct pt_regs * regs, int cpu) +{ + return model->check_ctrs(regs, &cpu_msrs[cpu]); +} + + +static void nmi_cpu_save_registers(struct op_msrs * msrs) +{ + unsigned int const nr_ctrs = model->num_counters; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr * counters = msrs->counters; + struct op_msr * controls = msrs->controls; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + rdmsr(counters[i].addr, + counters[i].saved.low, + counters[i].saved.high); + } + + for (i = 0; i < nr_ctrls; ++i) { + rdmsr(controls[i].addr, + controls[i].saved.low, + controls[i].saved.high); + } +} + + +static void nmi_save_registers(void * dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs * msrs = &cpu_msrs[cpu]; + model->fill_in_addresses(msrs); + nmi_cpu_save_registers(msrs); +} + + +static void free_msrs(void) +{ + int i; + for (i = 0; i < NR_CPUS; ++i) { + kfree(cpu_msrs[i].counters); + cpu_msrs[i].counters = NULL; + kfree(cpu_msrs[i].controls); + cpu_msrs[i].controls = NULL; + } +} + + +static int allocate_msrs(void) +{ + int success = 1; + size_t controls_size = sizeof(struct op_msr) * model->num_controls; + size_t counters_size = sizeof(struct op_msr) * model->num_counters; + + int i; + for (i = 0; i < NR_CPUS; ++i) { + if (!cpu_online(i)) + continue; + + cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL); + if (!cpu_msrs[i].counters) { + success = 0; + break; + } + cpu_msrs[i].controls = kmalloc(controls_size, GFP_KERNEL); + if (!cpu_msrs[i].controls) { + success = 
0; + break; + } + } + + if (!success) + free_msrs(); + + return success; +} + + +static void nmi_cpu_setup(void * dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs * msrs = &cpu_msrs[cpu]; + spin_lock(&oprofilefs_lock); + model->setup_ctrs(msrs); + spin_unlock(&oprofilefs_lock); + saved_lvtpc[cpu] = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, APIC_DM_NMI); +} + + +static int nmi_setup(void) +{ + if (!allocate_msrs()) + return -ENOMEM; + + /* We walk a thin line between law and rape here. + * We need to be careful to install our NMI handler + * without actually triggering any NMIs as this will + * break the core code horrifically. + */ + if (reserve_lapic_nmi() < 0) { + free_msrs(); + return -EBUSY; + } + /* We need to serialize save and setup for HT because the subset + * of msrs are distinct for save and setup operations + */ + on_each_cpu(nmi_save_registers, NULL, 0, 1); + on_each_cpu(nmi_cpu_setup, NULL, 0, 1); + set_nmi_callback(nmi_callback); + nmi_enabled = 1; + return 0; +} + + +static void nmi_restore_registers(struct op_msrs * msrs) +{ + unsigned int const nr_ctrs = model->num_counters; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr * counters = msrs->counters; + struct op_msr * controls = msrs->controls; + unsigned int i; + + for (i = 0; i < nr_ctrls; ++i) { + wrmsr(controls[i].addr, + controls[i].saved.low, + controls[i].saved.high); + } + + for (i = 0; i < nr_ctrs; ++i) { + wrmsr(counters[i].addr, + counters[i].saved.low, + counters[i].saved.high); + } +} + + +static void nmi_cpu_shutdown(void * dummy) +{ + unsigned int v; + int cpu = smp_processor_id(); + struct op_msrs * msrs = &cpu_msrs[cpu]; + + /* restoring APIC_LVTPC can trigger an apic error because the delivery + * mode and vector nr combination can be illegal. That's by design: on + * power on apic lvt contain a zero vector nr which are legal only for + * NMI delivery mode. 
So inhibit apic err before restoring lvtpc + */ + v = apic_read(APIC_LVTERR); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, saved_lvtpc[cpu]); + apic_write(APIC_LVTERR, v); + nmi_restore_registers(msrs); +} + + +static void nmi_shutdown(void) +{ + nmi_enabled = 0; + on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); + unset_nmi_callback(); + release_lapic_nmi(); + free_msrs(); +} + + +static void nmi_cpu_start(void * dummy) +{ + struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; + model->start(msrs); +} + + +static int nmi_start(void) +{ + on_each_cpu(nmi_cpu_start, NULL, 0, 1); + return 0; +} + + +static void nmi_cpu_stop(void * dummy) +{ + struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; + model->stop(msrs); +} + + +static void nmi_stop(void) +{ + on_each_cpu(nmi_cpu_stop, NULL, 0, 1); +} + + +struct op_counter_config counter_config[OP_MAX_COUNTER]; + +static int nmi_create_files(struct super_block * sb, struct dentry * root) +{ + unsigned int i; + + for (i = 0; i < model->num_counters; ++i) { + struct dentry * dir; + char buf[2]; + + snprintf(buf, 2, "%d", i); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + } + + return 0; +} + + +static int __init p4_init(char ** cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + + if (cpu_model > 4) + return 0; + +#ifndef CONFIG_SMP + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; +#else + switch (smp_num_siblings) { + case 1: + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; + + case 2: + *cpu_type = "i386/p4-ht"; + model = 
&op_p4_ht2_spec; + return 1; + } +#endif + + printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n"); + printk(KERN_INFO "oprofile: Reverting to timer mode.\n"); + return 0; +} + + +static int __init ppro_init(char ** cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + + if (cpu_model > 0xd) + return 0; + + if (cpu_model == 9) { + *cpu_type = "i386/p6_mobile"; + } else if (cpu_model > 5) { + *cpu_type = "i386/piii"; + } else if (cpu_model > 2) { + *cpu_type = "i386/pii"; + } else { + *cpu_type = "i386/ppro"; + } + + model = &op_ppro_spec; + return 1; +} + +/* in order to get driverfs right */ +static int using_nmi; + +int __init nmi_init(struct oprofile_operations *ops) +{ + __u8 vendor = boot_cpu_data.x86_vendor; + __u8 family = boot_cpu_data.x86; + char *cpu_type; + + if (!cpu_has_apic) + return -ENODEV; + + switch (vendor) { + case X86_VENDOR_AMD: + /* Needs to be at least an Athlon (or hammer in 32bit mode) */ + + switch (family) { + default: + return -ENODEV; + case 6: + model = &op_athlon_spec; + cpu_type = "i386/athlon"; + break; + case 0xf: + model = &op_athlon_spec; + /* Actually it could be i386/hammer too, but give + user space a consistent name. 
*/ + cpu_type = "x86-64/hammer"; + break; + } + break; + + case X86_VENDOR_INTEL: + switch (family) { + /* Pentium IV */ + case 0xf: + if (!p4_init(&cpu_type)) + return -ENODEV; + break; + + /* A P6-class processor */ + case 6: + if (!ppro_init(&cpu_type)) + return -ENODEV; + break; + + default: + return -ENODEV; + } + break; + + default: + return -ENODEV; + } + + init_driverfs(); + using_nmi = 1; + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; + printk(KERN_INFO "oprofile: using NMI interrupt.\n"); + return 0; +} + + +void nmi_exit(void) +{ + if (using_nmi) + exit_driverfs(); +} diff --git a/arch/i386/oprofile/nmi_timer_int.c b/arch/i386/oprofile/nmi_timer_int.c new file mode 100644 index 000000000000..b2e462abf337 --- /dev/null +++ b/arch/i386/oprofile/nmi_timer_int.c @@ -0,0 +1,55 @@ +/** + * @file nmi_timer_int.c + * + * @remark Copyright 2003 OProfile authors + * @remark Read the file COPYING + * + * @author Zwane Mwaikambo <zwane@linuxpower.ca> + */ + +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/irq.h> +#include <linux/oprofile.h> +#include <linux/rcupdate.h> + + +#include <asm/nmi.h> +#include <asm/apic.h> +#include <asm/ptrace.h> + +static int nmi_timer_callback(struct pt_regs * regs, int cpu) +{ + oprofile_add_sample(regs, 0); + return 1; +} + +static int timer_start(void) +{ + disable_timer_nmi_watchdog(); + set_nmi_callback(nmi_timer_callback); + return 0; +} + + +static void timer_stop(void) +{ + enable_timer_nmi_watchdog(); + unset_nmi_callback(); + synchronize_kernel(); +} + + +int __init nmi_timer_init(struct oprofile_operations * ops) +{ + extern int nmi_active; + + if (nmi_active <= 0) + return -ENODEV; + + ops->start = timer_start; + ops->stop = timer_stop; + ops->cpu_type = "timer"; + printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); + return 0; +} diff --git 
a/arch/i386/oprofile/op_counter.h b/arch/i386/oprofile/op_counter.h new file mode 100644 index 000000000000..2880b15c4675 --- /dev/null +++ b/arch/i386/oprofile/op_counter.h @@ -0,0 +1,29 @@ +/** + * @file op_counter.h + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + */ + +#ifndef OP_COUNTER_H +#define OP_COUNTER_H + +#define OP_MAX_COUNTER 8 + +/* Per-perfctr configuration as set via + * oprofilefs. + */ +struct op_counter_config { + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long kernel; + unsigned long user; + unsigned long unit_mask; +}; + +extern struct op_counter_config counter_config[]; + +#endif /* OP_COUNTER_H */ diff --git a/arch/i386/oprofile/op_model_athlon.c b/arch/i386/oprofile/op_model_athlon.c new file mode 100644 index 000000000000..3ad9a72a5036 --- /dev/null +++ b/arch/i386/oprofile/op_model_athlon.c @@ -0,0 +1,149 @@ +/** + * @file op_model_athlon.h + * athlon / K7 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + */ + +#include <linux/oprofile.h> +#include <asm/ptrace.h> +#include <asm/msr.h> + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_COUNTERS 4 +#define NUM_CONTROLS 4 + +#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) +#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1);} while (0) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) + +#define CTRL_READ(l,h,msrs,c) do {rdmsr(msrs->controls[(c)].addr, (l), (h));} while (0) +#define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_CLEAR(x) (x &= (1<<21)) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_USR(val,u) (val 
|= ((u & 1) << 16)) +#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_EVENT(val, e) (val |= e) + +static unsigned long reset_value[NUM_COUNTERS]; + +static void athlon_fill_in_addresses(struct op_msrs * const msrs) +{ + msrs->counters[0].addr = MSR_K7_PERFCTR0; + msrs->counters[1].addr = MSR_K7_PERFCTR1; + msrs->counters[2].addr = MSR_K7_PERFCTR2; + msrs->counters[3].addr = MSR_K7_PERFCTR3; + + msrs->controls[0].addr = MSR_K7_EVNTSEL0; + msrs->controls[1].addr = MSR_K7_EVNTSEL1; + msrs->controls[2].addr = MSR_K7_EVNTSEL2; + msrs->controls[3].addr = MSR_K7_EVNTSEL3; +} + + +static void athlon_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* clear all counters */ + for (i = 0 ; i < NUM_CONTROLS; ++i) { + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR(low); + CTRL_WRITE(low, high, msrs, i); + } + + /* avoid a false detection of ctr overflows in NMI handler */ + for (i = 0; i < NUM_COUNTERS; ++i) { + CTR_WRITE(1, msrs, i); + } + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if (counter_config[i].enabled) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); + + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR(low); + CTRL_SET_ENABLE(low); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT(low, counter_config[i].event); + CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; + } + } +} + + +static int athlon_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); + } + } + + /* See op_model_ppro.c */ + return 1; +} + + +static void athlon_start(struct 
op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (reset_value[i]) { + CTRL_READ(low, high, msrs, i); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + } +} + + +static void athlon_stop(struct op_msrs const * const msrs) +{ + unsigned int low,high; + int i; + + /* Subtle: stop on all counters to avoid race with + * setting our pm callback */ + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + CTRL_READ(low, high, msrs, i); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } +} + + +struct op_x86_model_spec const op_athlon_spec = { + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &athlon_fill_in_addresses, + .setup_ctrs = &athlon_setup_ctrs, + .check_ctrs = &athlon_check_ctrs, + .start = &athlon_start, + .stop = &athlon_stop +}; diff --git a/arch/i386/oprofile/op_model_p4.c b/arch/i386/oprofile/op_model_p4.c new file mode 100644 index 000000000000..ac8a066035c2 --- /dev/null +++ b/arch/i386/oprofile/op_model_p4.c @@ -0,0 +1,725 @@ +/** + * @file op_model_p4.c + * P4 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + */ + +#include <linux/oprofile.h> +#include <linux/smp.h> +#include <asm/msr.h> +#include <asm/ptrace.h> +#include <asm/fixmap.h> +#include <asm/apic.h> + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_EVENTS 39 + +#define NUM_COUNTERS_NON_HT 8 +#define NUM_ESCRS_NON_HT 45 +#define NUM_CCCRS_NON_HT 18 +#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) + +#define NUM_COUNTERS_HT2 4 +#define NUM_ESCRS_HT2 23 +#define NUM_CCCRS_HT2 9 +#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) + +static unsigned int num_counters = NUM_COUNTERS_NON_HT; + + +/* this has to be checked dynamically since the + hyper-threadedness of a chip is discovered at + kernel boot-time. 
*/ +static inline void setup_num_counters(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) + num_counters = NUM_COUNTERS_HT2; +#endif +} + +static int inline addr_increment(void) +{ +#ifdef CONFIG_SMP + return smp_num_siblings == 2 ? 2 : 1; +#else + return 1; +#endif +} + + +/* tables to simulate simplified hardware view of p4 registers */ +struct p4_counter_binding { + int virt_counter; + int counter_address; + int cccr_address; +}; + +struct p4_event_binding { + int escr_select; /* value to put in CCCR */ + int event_select; /* value to put in ESCR */ + struct { + int virt_counter; /* for this counter... */ + int escr_address; /* use this ESCR */ + } bindings[2]; +}; + +/* nb: these CTR_* defines are a duplicate of defines in + event/i386.p4*events. */ + + +#define CTR_BPU_0 (1 << 0) +#define CTR_MS_0 (1 << 1) +#define CTR_FLAME_0 (1 << 2) +#define CTR_IQ_4 (1 << 3) +#define CTR_BPU_2 (1 << 4) +#define CTR_MS_2 (1 << 5) +#define CTR_FLAME_2 (1 << 6) +#define CTR_IQ_5 (1 << 7) + +static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { + { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, + { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, + { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, + { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, + { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, + { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, + { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, + { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } +}; + +#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT + +/* All cccr we don't use. */ +static int p4_unused_cccr[NUM_UNUSED_CCCRS] = { + MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3, + MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3, + MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3, + MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR1, + MSR_P4_IQ_CCCR2, MSR_P4_IQ_CCCR3 +}; + +/* p4 event codes in libop/op_event.h are indices into this table. 
*/ + +static struct p4_event_binding p4_events[NUM_EVENTS] = { + + { /* BRANCH_RETIRED */ + 0x05, 0x06, + { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, + {CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* MISPRED_BRANCH_RETIRED */ + 0x04, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* TC_DELIVER_MODE */ + 0x01, 0x01, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { CTR_MS_2, MSR_P4_TC_ESCR1} } + }, + + { /* BPU_FETCH_REQUEST */ + 0x00, 0x03, + { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, + { CTR_BPU_2, MSR_P4_BPU_ESCR1} } + }, + + { /* ITLB_REFERENCE */ + 0x03, 0x18, + { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, + { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } + }, + + { /* MEMORY_CANCEL */ + 0x05, 0x02, + { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, + { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } + }, + + { /* MEMORY_COMPLETE */ + 0x02, 0x08, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* LOAD_PORT_REPLAY */ + 0x02, 0x04, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* STORE_PORT_REPLAY */ + 0x02, 0x05, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* MOB_LOAD_REPLAY */ + 0x02, 0x03, + { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, + { CTR_BPU_2, MSR_P4_MOB_ESCR1} } + }, + + { /* PAGE_WALK_TYPE */ + 0x04, 0x01, + { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, + { CTR_BPU_2, MSR_P4_PMH_ESCR1} } + }, + + { /* BSQ_CACHE_REFERENCE */ + 0x07, 0x0c, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { CTR_BPU_2, MSR_P4_BSU_ESCR1} } + }, + + { /* IOQ_ALLOCATION */ + 0x06, 0x03, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { 0, 0 } } + }, + + { /* IOQ_ACTIVE_ENTRIES */ + 0x06, 0x1a, + { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, + { 0, 0 } } + }, + + { /* FSB_DATA_ACTIVITY */ + 0x06, 0x17, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* BSQ_ALLOCATION */ + 0x07, 0x05, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { 0, 0 } } + }, + + { /* BSQ_ACTIVE_ENTRIES */ + 0x07, 0x06, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* 
guess */}, + { 0, 0 } } + }, + + { /* X87_ASSIST */ + 0x05, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* SSE_INPUT_ASSIST */ + 0x01, 0x34, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_SP_UOP */ + 0x01, 0x08, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_DP_UOP */ + 0x01, 0x0c, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_SP_UOP */ + 0x01, 0x0a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_DP_UOP */ + 0x01, 0x0e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 64BIT_MMX_UOP */ + 0x01, 0x02, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 128BIT_MMX_UOP */ + 0x01, 0x1a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_FP_UOP */ + 0x01, 0x04, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_SIMD_MOVES_UOP */ + 0x01, 0x2e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* MACHINE_CLEAR */ + 0x05, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* GLOBAL_POWER_EVENTS */ + 0x06, 0x13 /* older manual says 0x05, newer 0x13 */, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* TC_MS_XFER */ + 0x00, 0x05, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* UOP_QUEUE_WRITES */ + 0x00, 0x09, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* FRONT_END_EVENT */ + 0x05, 0x08, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* EXECUTION_EVENT */ + 0x05, 0x0c, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* REPLAY_EVENT 
*/ + 0x05, 0x09, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* INSTR_RETIRED */ + 0x04, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOPS_RETIRED */ + 0x04, 0x01, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOP_TYPE */ + 0x02, 0x02, + { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, + { CTR_IQ_5, MSR_P4_RAT_ESCR1} } + }, + + { /* RETIRED_MISPRED_BRANCH_TYPE */ + 0x02, 0x05, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + }, + + { /* RETIRED_BRANCH_TYPE */ + 0x02, 0x04, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + } +}; + + +#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) + +#define ESCR_RESERVED_BITS 0x80000003 +#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) +#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) +#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) +#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) +#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) +#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) +#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) +#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) +#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) + +#define CCCR_RESERVED_BITS 0x38030FFF +#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) +#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) +#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) +#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) +#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) +#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) +#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) +#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 
+#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) +#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) +#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) + +#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) +#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) +#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) + + +/* this assigns a "stagger" to the current CPU, which is used throughout + the code in this module as an extra array offset, to select the "even" + or "odd" part of all the divided resources. */ +static unsigned int get_stagger(void) +{ +#ifdef CONFIG_SMP + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); +#endif + return 0; +} + + +/* finally, mediate access to a real hardware counter + by passing a "virtual" counter number to this macro, + along with your stagger setting. */ +#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) + +static unsigned long reset_value[NUM_COUNTERS_NON_HT]; + + +static void p4_fill_in_addresses(struct op_msrs * const msrs) +{ + unsigned int i; + unsigned int addr, stag; + + setup_num_counters(); + stag = get_stagger(); + + /* the counter registers we pay attention to */ + for (i = 0; i < num_counters; ++i) { + msrs->counters[i].addr = + p4_counters[VIRT_CTR(stag, i)].counter_address; + } + + /* FIXME: bad feeling, we don't save the 10 counters we don't use. 
*/ + + /* 18 CCCR registers */ + for (i = 0, addr = MSR_P4_BPU_CCCR0 + stag; + addr <= MSR_P4_IQ_CCCR5; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + /* 43 ESCR registers in three or four discontiguous groups */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + /* no IQ_ESCR0/1 on some models, we save a second time BSU_ESCR0/1 + * to avoid special case in nmi_{save|restore}_registers() */ + if (boot_cpu_data.x86_model >= 0x3) { + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + } else { + for (addr = MSR_P4_IQ_ESCR0 + stag; + addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + } + + for (addr = MSR_P4_RAT_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { + msrs->controls[i].addr = addr; + } + + /* there are 2 remaining non-contiguously located ESCRs */ + + if (num_counters == NUM_COUNTERS_NON_HT) { + /* standard non-HT CPUs handle both remaining ESCRs */ + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else if (stag == 0) { + /* HT CPUs give the first remainder to the even thread, as + the 32nd control register */ + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else { + /* and two copies of the second to the odd thread, + for the 22nd and 23rd control registers */ + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + } +} + + +static void pmc_setup_one_p4_counter(unsigned int ctr) +{ + int i; + int const maxbind = 2; + unsigned 
int cccr = 0; + unsigned int escr = 0; + unsigned int high = 0; + unsigned int counter_bit; + struct p4_event_binding *ev = NULL; + unsigned int stag; + + stag = get_stagger(); + + /* convert from counter *number* to counter *bit* */ + counter_bit = 1 << VIRT_CTR(stag, ctr); + + /* find our event binding structure. */ + if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", + counter_config[ctr].event); + return; + } + + ev = &(p4_events[counter_config[ctr].event - 1]); + + for (i = 0; i < maxbind; i++) { + if (ev->bindings[i].virt_counter & counter_bit) { + + /* modify ESCR */ + ESCR_READ(escr, high, ev, i); + ESCR_CLEAR(escr); + if (stag == 0) { + ESCR_SET_USR_0(escr, counter_config[ctr].user); + ESCR_SET_OS_0(escr, counter_config[ctr].kernel); + } else { + ESCR_SET_USR_1(escr, counter_config[ctr].user); + ESCR_SET_OS_1(escr, counter_config[ctr].kernel); + } + ESCR_SET_EVENT_SELECT(escr, ev->event_select); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + ESCR_WRITE(escr, high, ev, i); + + /* modify CCCR */ + CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + CCCR_CLEAR(cccr); + CCCR_SET_REQUIRED_BITS(cccr); + CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); + if (stag == 0) { + CCCR_SET_PMI_OVF_0(cccr); + } else { + CCCR_SET_PMI_OVF_1(cccr); + } + CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + return; + } + } + + printk(KERN_ERR + "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", + counter_config[ctr].event, stag, ctr); +} + + +static void p4_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int i; + unsigned int low, high; + unsigned int addr; + unsigned int stag; + + stag = get_stagger(); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (! 
MISC_PMC_ENABLED_P(low)) { + printk(KERN_ERR "oprofile: P4 PMC not available\n"); + return; + } + + /* clear the cccrs we will use */ + for (i = 0 ; i < num_counters ; i++) { + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } + + /* clear cccrs outside our concern */ + for (i = stag ; i < NUM_UNUSED_CCCRS ; i += addr_increment()) { + rdmsr(p4_unused_cccr[i], low, high); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + wrmsr(p4_unused_cccr[i], low, high); + } + + /* clear all escrs (including those outside our concern) */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr < MSR_P4_IQ_ESCR0; addr += addr_increment()) { + wrmsr(addr, 0, 0); + } + + /* On older models clear also MSR_P4_IQ_ESCR0/1 */ + if (boot_cpu_data.x86_model < 0x3) { + wrmsr(MSR_P4_IQ_ESCR0, 0, 0); + wrmsr(MSR_P4_IQ_ESCR1, 0, 0); + } + + for (addr = MSR_P4_RAT_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; addr += addr_increment()){ + wrmsr(addr, 0, 0); + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; addr += addr_increment()){ + wrmsr(addr, 0, 0); + } + + if (num_counters == NUM_COUNTERS_NON_HT) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } else if (stag == 0) { + wrmsr(MSR_P4_CRU_ESCR4, 0, 0); + } else { + wrmsr(MSR_P4_CRU_ESCR5, 0, 0); + } + + /* setup all counters */ + for (i = 0 ; i < num_counters ; ++i) { + if (counter_config[i].enabled) { + reset_value[i] = counter_config[i].count; + pmc_setup_one_p4_counter(i); + CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + } else { + reset_value[i] = 0; + } + } +} + + +static int p4_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned long ctr, low, high, stag, real; + int i; + + stag = get_stagger(); + + for (i 
= 0; i < num_counters; ++i) { + + if (!reset_value[i]) + continue; + + /* + * there is some eccentricity in the hardware which + * requires that we perform 2 extra corrections: + * + * - check both the CCCR:OVF flag for overflow and the + * counter high bit for un-flagged overflows. + * + * - write the counter back twice to ensure it gets + * updated properly. + * + * the former seems to be related to extra NMIs happening + * during the current NMI; the latter is reported as errata + * N15 in intel doc 249199-029, pentium 4 specification + * update, though their suggested work-around does not + * appear to solve the problem. + */ + + real = VIRT_CTR(stag, i); + + CCCR_READ(low, high, real); + CTR_READ(ctr, high, real); + if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], real); + CCCR_CLEAR_OVF(low); + CCCR_WRITE(low, high, real); + CTR_WRITE(reset_value[i], real); + } + } + + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* See op_model_ppro.c */ + return 1; +} + + +static void p4_start(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_ENABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +static void p4_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + CCCR_READ(low, high, VIRT_CTR(stag, i)); + CCCR_SET_DISABLE(low); + CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + } +} + + +#ifdef CONFIG_SMP +struct op_x86_model_spec const op_p4_ht2_spec = { + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = 
&p4_start, + .stop = &p4_stop +}; +#endif + +struct op_x86_model_spec const op_p4_spec = { + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop +}; diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c new file mode 100644 index 000000000000..d719015fc044 --- /dev/null +++ b/arch/i386/oprofile/op_model_ppro.c @@ -0,0 +1,143 @@ +/** + * @file op_model_ppro.h + * pentium pro / P6 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + */ + +#include <linux/oprofile.h> +#include <asm/ptrace.h> +#include <asm/msr.h> +#include <asm/apic.h> + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_COUNTERS 2 +#define NUM_CONTROLS 2 + +#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) +#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) + +#define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0) +#define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_CLEAR(x) (x &= (1<<21)) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16)) +#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_EVENT(val, e) (val |= e) + +static unsigned long reset_value[NUM_COUNTERS]; + +static void ppro_fill_in_addresses(struct op_msrs * const msrs) +{ + msrs->counters[0].addr = MSR_P6_PERFCTR0; + msrs->counters[1].addr = MSR_P6_PERFCTR1; + + 
msrs->controls[0].addr = MSR_P6_EVNTSEL0; + msrs->controls[1].addr = MSR_P6_EVNTSEL1; +} + + +static void ppro_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* clear all counters */ + for (i = 0 ; i < NUM_CONTROLS; ++i) { + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR(low); + CTRL_WRITE(low, high, msrs, i); + } + + /* avoid a false detection of ctr overflows in NMI handler */ + for (i = 0; i < NUM_COUNTERS; ++i) { + CTR_WRITE(1, msrs, i); + } + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if (counter_config[i].enabled) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); + + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR(low); + CTRL_SET_ENABLE(low); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT(low, counter_config[i].event); + CTRL_WRITE(low, high, msrs, i); + } + } +} + + +static int ppro_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); + } + } + + /* Only P6 based Pentium M need to re-unmask the apic vector but it + * doesn't hurt other P6 variant */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* We can't work out if we really handled an interrupt. We + * might have caught a *second* counter just after overflowing + * the interrupt for this counter then arrives + * and we don't find a counter that's overflowed, so we + * would return 0 and get dazed + confused. Instead we always + * assume we found an overflow. This sucks. 
+ */ + return 1; +} + + +static void ppro_start(struct op_msrs const * const msrs) +{ + unsigned int low,high; + CTRL_READ(low, high, msrs, 0); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, 0); +} + + +static void ppro_stop(struct op_msrs const * const msrs) +{ + unsigned int low,high; + CTRL_READ(low, high, msrs, 0); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, 0); +} + + +struct op_x86_model_spec const op_ppro_spec = { + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop +}; diff --git a/arch/i386/oprofile/op_x86_model.h b/arch/i386/oprofile/op_x86_model.h new file mode 100644 index 000000000000..123b7e90a9ee --- /dev/null +++ b/arch/i386/oprofile/op_x86_model.h @@ -0,0 +1,50 @@ +/** + * @file op_x86_model.h + * interface to x86 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + */ + +#ifndef OP_X86_MODEL_H +#define OP_X86_MODEL_H + +struct op_saved_msr { + unsigned int high; + unsigned int low; +}; + +struct op_msr { + unsigned long addr; + struct op_saved_msr saved; +}; + +struct op_msrs { + struct op_msr * counters; + struct op_msr * controls; +}; + +struct pt_regs; + +/* The model vtable abstracts the differences between + * various x86 CPU model's perfctr support. 
+ */ +struct op_x86_model_spec { + unsigned int const num_counters; + unsigned int const num_controls; + void (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); +}; + +extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec const op_p4_spec; +extern struct op_x86_model_spec const op_p4_ht2_spec; +extern struct op_x86_model_spec const op_athlon_spec; + +#endif /* OP_X86_MODEL_H */ diff --git a/arch/i386/pci/Makefile b/arch/i386/pci/Makefile new file mode 100644 index 000000000000..1bff03f36965 --- /dev/null +++ b/arch/i386/pci/Makefile @@ -0,0 +1,14 @@ +obj-y := i386.o + +obj-$(CONFIG_PCI_BIOS) += pcbios.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o +obj-$(CONFIG_PCI_DIRECT) += direct.o + +pci-y := fixup.o +pci-$(CONFIG_ACPI_PCI) += acpi.o +pci-y += legacy.o irq.o + +pci-$(CONFIG_X86_VISWS) := visws.o fixup.o +pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o + +obj-y += $(pci-y) common.o diff --git a/arch/i386/pci/acpi.c b/arch/i386/pci/acpi.c new file mode 100644 index 000000000000..2db65ec45dc3 --- /dev/null +++ b/arch/i386/pci/acpi.c @@ -0,0 +1,53 @@ +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/hw_irq.h> +#include "pci.h" + +struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) +{ + if (domain != 0) { + printk(KERN_WARNING "PCI: Multiple domains not supported\n"); + return NULL; + } + + return pcibios_scan_root(busnum); +} + +extern int pci_routeirq; +static int __init pci_acpi_init(void) +{ + struct pci_dev *dev = NULL; + + if (pcibios_scanned) + return 0; + + if (acpi_noirq) + return 0; + + printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); + acpi_irq_penalty_init(); + pcibios_scanned++; 
+ pcibios_enable_irq = acpi_pci_irq_enable; + + if (pci_routeirq) { + /* + * PCI IRQ routing is set up by pci_enable_device(), but we + * also do it here in case there are still broken drivers that + * don't use pci_enable_device(). + */ + printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n"); + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) + acpi_pci_irq_enable(dev); + } else + printk(KERN_INFO "PCI: If a device doesn't work, try \"pci=routeirq\". If it helps, post a report\n"); + +#ifdef CONFIG_X86_IO_APIC + if (acpi_ioapic) + print_IO_APIC(); +#endif + + return 0; +} +subsys_initcall(pci_acpi_init); diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c new file mode 100644 index 000000000000..720975e1af50 --- /dev/null +++ b/arch/i386/pci/common.c @@ -0,0 +1,251 @@ +/* + * Low-Level PCI Support for PC + * + * (c) 1999--2000 Martin Mares <mj@ucw.cz> + */ + +#include <linux/sched.h> +#include <linux/pci.h> +#include <linux/ioport.h> +#include <linux/init.h> + +#include <asm/acpi.h> +#include <asm/segment.h> +#include <asm/io.h> +#include <asm/smp.h> + +#include "pci.h" + +#ifdef CONFIG_PCI_BIOS +extern void pcibios_sort(void); +#endif + +unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | + PCI_PROBE_MMCONF; + +int pci_routeirq; +int pcibios_last_bus = -1; +struct pci_bus *pci_root_bus = NULL; +struct pci_raw_ops *raw_pci_ops; + +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value) +{ + return raw_pci_ops->read(0, bus->number, devfn, where, size, value); +} + +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value) +{ + return raw_pci_ops->write(0, bus->number, devfn, where, size, value); +} + +struct pci_ops pci_root_ops = { + .read = pci_read, + .write = pci_write, +}; + +/* + * legacy, numa, and acpi all want to call pcibios_scan_root + * from their initcalls. 
This flag prevents that.
+ */
+int pcibios_scanned;	/* tested and incremented by pci_acpi_init() */
+
+/*
+ * This interrupt-safe spinlock protects all accesses to PCI
+ * configuration space.
+ */
+DEFINE_SPINLOCK(pci_config_lock);
+
+/*
+ * Several buggy motherboards address only 16 devices and mirror
+ * them to next 16 IDs. We try to detect this `feature' on all
+ * primary buses (those containing host bridges as they are
+ * expected to be unique) and remove the ghost devices.
+ *
+ * How the code below goes about it:
+ *  1. For every device d on bus b, search the devices that follow it
+ *     for one whose devfn is d->devfn + mirror and whose vendor,
+ *     device and class match.  If any d has no such partner, return
+ *     without touching the bus.
+ *  2. Return as well if no host bridge was seen on the bus.
+ *  3. Otherwise unlink and free every device with devfn >= mirror.
+ *
+ * NOTE(review): two things in step 1 look unintended - please confirm:
+ *  - the `continue' under the resource comparison belongs to the
+ *    for(i...) loop it sits in, so a resource mismatch never rejects
+ *    the candidate; the `break' after that loop is always reached.
+ *  - for the last device on the list the search starts at ln->next,
+ *    which is already &b->devices, so the "no partner" return is taken.
+ *    Assuming list_for_each() is the usual <linux/list.h> iterator,
+ *    step 3 therefore appears to be unreachable.
+ */
+
+static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
+{
+	struct list_head *ln, *mn;	/* outer / inner list cursors */
+	struct pci_dev *d, *e;		/* device under test / mirror candidate */
+	int mirror = PCI_DEVFN(16,0);	/* devfn offset between a device and its suspected ghost */
+	int seen_host_bridge = 0;	/* number of host-bridge class devices found on b */
+	int i;
+
+	DBG("PCI: Scanning for ghost devices on bus %d\n", b->number);
+	list_for_each(ln, &b->devices) {
+		d = pci_dev_b(ln);
+		if ((d->class >> 8) == PCI_CLASS_BRIDGE_HOST)
+			seen_host_bridge++;
+		for (mn=ln->next; mn != &b->devices; mn=mn->next) {	/* only devices after d are considered */
+			e = pci_dev_b(mn);
+			if (e->devfn != d->devfn + mirror ||
+			    e->vendor != d->vendor ||
+			    e->device != d->device ||
+			    e->class != d->class)
+				continue;	/* not a mirror of d, try the next candidate */
+			for(i=0; i<PCI_NUM_RESOURCES; i++)
+				if (e->resource[i].start != d->resource[i].start ||
+				    e->resource[i].end != d->resource[i].end ||
+				    e->resource[i].flags != d->resource[i].flags)
+					continue;	/* NOTE(review): continues the for(i...) loop only */
+			break;	/* e is accepted as d's mirror */
+		}
+		if (mn == &b->devices)	/* search ran off the end: d has no mirror */
+			return;
+	}
+	if (!seen_host_bridge)
+		return;
+	printk(KERN_WARNING "PCI: Ignoring ghost devices on bus %02x\n", b->number);
+
+	ln = &b->devices;	/* ln stays one node behind the device being examined */
+	while (ln->next != &b->devices) {
+		d = pci_dev_b(ln->next);
+		if (d->devfn >= mirror) {
+			list_del(&d->global_list);
+			list_del(&d->bus_list);	/* unlinks ln->next, so ln itself is not advanced */
+			kfree(d);
+		} else
+			ln = ln->next;
+	}
+}
+
+/*
+ * Called after each bus is probed, but before its children
+ * are examined.
+ */ + +void __devinit pcibios_fixup_bus(struct pci_bus *b) +{ + pcibios_fixup_ghosts(b); + pci_read_bridge_bases(b); +} + + +struct pci_bus * __devinit pcibios_scan_root(int busnum) +{ + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus)) != NULL) { + if (bus->number == busnum) { + /* Already scanned */ + return bus; + } + } + + printk("PCI: Probing PCI hardware (bus %02x)\n", busnum); + + return pci_scan_bus(busnum, &pci_root_ops, NULL); +} + +extern u8 pci_cache_line_size; + +static int __init pcibios_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!raw_pci_ops) { + printk("PCI: System does not support PCI\n"); + return 0; + } + + /* + * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 + * and P4. It's also good for 386/486s (which actually have 16) + * as quite a few PCI devices do not support smaller values. + */ + pci_cache_line_size = 32 >> 2; + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) + pci_cache_line_size = 64 >> 2; /* K7 & K8 */ + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) + pci_cache_line_size = 128 >> 2; /* P4 */ + + pcibios_resource_survey(); + +#ifdef CONFIG_PCI_BIOS + if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) + pcibios_sort(); +#endif + return 0; +} + +subsys_initcall(pcibios_init); + +char * __devinit pcibios_setup(char *str) +{ + if (!strcmp(str, "off")) { + pci_probe = 0; + return NULL; + } +#ifdef CONFIG_PCI_BIOS + else if (!strcmp(str, "bios")) { + pci_probe = PCI_PROBE_BIOS; + return NULL; + } else if (!strcmp(str, "nobios")) { + pci_probe &= ~PCI_PROBE_BIOS; + return NULL; + } else if (!strcmp(str, "nosort")) { + pci_probe |= PCI_NO_SORT; + return NULL; + } else if (!strcmp(str, "biosirq")) { + pci_probe |= PCI_BIOS_IRQ_SCAN; + return NULL; + } +#endif +#ifdef CONFIG_PCI_DIRECT + else if (!strcmp(str, "conf1")) { + pci_probe = PCI_PROBE_CONF1 | PCI_NO_CHECKS; + return NULL; + } + else if (!strcmp(str, "conf2")) { + pci_probe = PCI_PROBE_CONF2 | 
PCI_NO_CHECKS; + return NULL; + } +#endif +#ifdef CONFIG_PCI_MMCONFIG + else if (!strcmp(str, "nommconf")) { + pci_probe &= ~PCI_PROBE_MMCONF; + return NULL; + } +#endif + else if (!strcmp(str, "noacpi")) { + acpi_noirq_set(); + return NULL; + } +#ifndef CONFIG_X86_VISWS + else if (!strcmp(str, "usepirqmask")) { + pci_probe |= PCI_USE_PIRQ_MASK; + return NULL; + } else if (!strncmp(str, "irqmask=", 8)) { + pcibios_irq_mask = simple_strtol(str+8, NULL, 0); + return NULL; + } else if (!strncmp(str, "lastbus=", 8)) { + pcibios_last_bus = simple_strtol(str+8, NULL, 0); + return NULL; + } +#endif + else if (!strcmp(str, "rom")) { + pci_probe |= PCI_ASSIGN_ROMS; + return NULL; + } else if (!strcmp(str, "assign-busses")) { + pci_probe |= PCI_ASSIGN_ALL_BUSSES; + return NULL; + } else if (!strcmp(str, "routeirq")) { + pci_routeirq = 1; + return NULL; + } + return str; +} + +unsigned int pcibios_assign_all_busses(void) +{ + return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0; +} + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + int err; + + if ((err = pcibios_enable_resources(dev, mask)) < 0) + return err; + + return pcibios_enable_irq(dev); +} diff --git a/arch/i386/pci/direct.c b/arch/i386/pci/direct.c new file mode 100644 index 000000000000..30b7e9b4f6a2 --- /dev/null +++ b/arch/i386/pci/direct.c @@ -0,0 +1,289 @@ +/* + * direct.c - Low-level direct PCI config space access + */ + +#include <linux/pci.h> +#include <linux/init.h> +#include "pci.h" + +/* + * Functions for accessing PCI configuration space with type 1 accesses + */ + +#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ + (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) + +static int pci_conf1_read(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + unsigned long flags; + + if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); + + 
switch (len) { + case 1: + *value = inb(0xCFC + (reg & 3)); + break; + case 2: + *value = inw(0xCFC + (reg & 2)); + break; + case 4: + *value = inl(0xCFC); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +static int pci_conf1_write(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + unsigned long flags; + + if ((bus > 255) || (devfn > 255) || (reg > 255)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); + + switch (len) { + case 1: + outb((u8)value, 0xCFC + (reg & 3)); + break; + case 2: + outw((u16)value, 0xCFC + (reg & 2)); + break; + case 4: + outl((u32)value, 0xCFC); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +#undef PCI_CONF1_ADDRESS + +struct pci_raw_ops pci_direct_conf1 = { + .read = pci_conf1_read, + .write = pci_conf1_write, +}; + + +/* + * Functions for accessing PCI configuration space with type 2 accesses + */ + +#define PCI_CONF2_ADDRESS(dev, reg) (u16)(0xC000 | (dev << 8) | reg) + +static int pci_conf2_read(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + unsigned long flags; + int dev, fn; + + if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) + return -EINVAL; + + dev = PCI_SLOT(devfn); + fn = PCI_FUNC(devfn); + + if (dev & 0x10) + return PCIBIOS_DEVICE_NOT_FOUND; + + spin_lock_irqsave(&pci_config_lock, flags); + + outb((u8)(0xF0 | (fn << 1)), 0xCF8); + outb((u8)bus, 0xCFA); + + switch (len) { + case 1: + *value = inb(PCI_CONF2_ADDRESS(dev, reg)); + break; + case 2: + *value = inw(PCI_CONF2_ADDRESS(dev, reg)); + break; + case 4: + *value = inl(PCI_CONF2_ADDRESS(dev, reg)); + break; + } + + outb(0, 0xCF8); + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +static int pci_conf2_write(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + unsigned long 
flags; + int dev, fn; + + if ((bus > 255) || (devfn > 255) || (reg > 255)) + return -EINVAL; + + dev = PCI_SLOT(devfn); + fn = PCI_FUNC(devfn); + + if (dev & 0x10) + return PCIBIOS_DEVICE_NOT_FOUND; + + spin_lock_irqsave(&pci_config_lock, flags); + + outb((u8)(0xF0 | (fn << 1)), 0xCF8); + outb((u8)bus, 0xCFA); + + switch (len) { + case 1: + outb((u8)value, PCI_CONF2_ADDRESS(dev, reg)); + break; + case 2: + outw((u16)value, PCI_CONF2_ADDRESS(dev, reg)); + break; + case 4: + outl((u32)value, PCI_CONF2_ADDRESS(dev, reg)); + break; + } + + outb(0, 0xCF8); + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +#undef PCI_CONF2_ADDRESS + +static struct pci_raw_ops pci_direct_conf2 = { + .read = pci_conf2_read, + .write = pci_conf2_write, +}; + + +/* + * Before we decide to use direct hardware access mechanisms, we try to do some + * trivial checks to ensure it at least _seems_ to be working -- we just test + * whether bus 00 contains a host bridge (this is similar to checking + * techniques used in XFree86, but ours should be more reliable since we + * attempt to make use of direct access hints provided by the PCI BIOS). + * + * This should be close to trivial, but it isn't, because there are buggy + * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID. 
+ */ +static int __init pci_sanity_check(struct pci_raw_ops *o) +{ + u32 x = 0; + int devfn; + + if (pci_probe & PCI_NO_CHECKS) + return 1; + + for (devfn = 0; devfn < 0x100; devfn++) { + if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x)) + continue; + if (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA) + return 1; + + if (o->read(0, 0, devfn, PCI_VENDOR_ID, 2, &x)) + continue; + if (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ) + return 1; + } + + DBG("PCI: Sanity check failed\n"); + return 0; +} + +static int __init pci_check_type1(void) +{ + unsigned long flags; + unsigned int tmp; + int works = 0; + + local_irq_save(flags); + + outb(0x01, 0xCFB); + tmp = inl(0xCF8); + outl(0x80000000, 0xCF8); + if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { + works = 1; + } + outl(tmp, 0xCF8); + local_irq_restore(flags); + + return works; +} + +static int __init pci_check_type2(void) +{ + unsigned long flags; + int works = 0; + + local_irq_save(flags); + + outb(0x00, 0xCFB); + outb(0x00, 0xCF8); + outb(0x00, 0xCFA); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && + pci_sanity_check(&pci_direct_conf2)) { + works = 1; + } + + local_irq_restore(flags); + + return works; +} + +static int __init pci_direct_init(void) +{ + struct resource *region, *region2; + + if ((pci_probe & PCI_PROBE_CONF1) == 0) + goto type2; + region = request_region(0xCF8, 8, "PCI conf1"); + if (!region) + goto type2; + + if (pci_check_type1()) { + printk(KERN_INFO "PCI: Using configuration type 1\n"); + raw_pci_ops = &pci_direct_conf1; + return 0; + } + release_resource(region); + + type2: + if ((pci_probe & PCI_PROBE_CONF2) == 0) + goto out; + region = request_region(0xCF8, 4, "PCI conf2"); + if (!region) + goto out; + region2 = request_region(0xC000, 0x1000, "PCI conf2"); + if (!region2) + goto fail2; + + if (pci_check_type2()) { + printk(KERN_INFO "PCI: Using configuration type 2\n"); + raw_pci_ops = &pci_direct_conf2; + return 0; + } + + 
release_resource(region2); + fail2: + release_resource(region); + + out: + return 0; +} + +arch_initcall(pci_direct_init); diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c new file mode 100644 index 000000000000..be52c5ac4e05 --- /dev/null +++ b/arch/i386/pci/fixup.c @@ -0,0 +1,386 @@ +/* + * Exceptions for specific devices. Usually work-arounds for fatal design flaws. + */ + +#include <linux/pci.h> +#include <linux/init.h> +#include "pci.h" + + +static void __devinit pci_fixup_i450nx(struct pci_dev *d) +{ + /* + * i450NX -- Find and scan all secondary buses on all PXB's. + */ + int pxb, reg; + u8 busno, suba, subb; + + printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); + reg = 0xd0; + for(pxb=0; pxb<2; pxb++) { + pci_read_config_byte(d, reg++, &busno); + pci_read_config_byte(d, reg++, &suba); + pci_read_config_byte(d, reg++, &subb); + DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); + if (busno) + pci_scan_bus(busno, &pci_root_ops, NULL); /* Bus A */ + if (suba < subb) + pci_scan_bus(suba+1, &pci_root_ops, NULL); /* Bus B */ + } + pcibios_last_bus = -1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); + +static void __devinit pci_fixup_i450gx(struct pci_dev *d) +{ + /* + * i450GX and i450KX -- Find and scan all secondary buses. + * (called separately for each PCI bridge found) + */ + u8 busno; + pci_read_config_byte(d, 0x4a, &busno); + printk(KERN_INFO "PCI: i440KX/GX host bridge %s: secondary bus %02x\n", pci_name(d), busno); + pci_scan_bus(busno, &pci_root_ops, NULL); + pcibios_last_bus = -1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx); + +static void __devinit pci_fixup_umc_ide(struct pci_dev *d) +{ + /* + * UM8886BF IDE controller sets region type bits incorrectly, + * therefore they look like memory despite of them being I/O. 
+ */ + int i; + + printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d)); + for(i=0; i<4; i++) + d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); + +static void __devinit pci_fixup_ncr53c810(struct pci_dev *d) +{ + /* + * NCR 53C810 returns class code 0 (at least on some systems). + * Fix class to be PCI_CLASS_STORAGE_SCSI + */ + if (!d->class) { + printk(KERN_WARNING "PCI: fixing NCR 53C810 class code for %s\n", pci_name(d)); + d->class = PCI_CLASS_STORAGE_SCSI << 8; + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810); + +static void __devinit pci_fixup_ide_bases(struct pci_dev *d) +{ + int i; + + /* + * PCI IDE controllers use non-standard I/O port decoding, respect it. + */ + if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE) + return; + DBG("PCI: IDE base address fixup for %s\n", pci_name(d)); + for(i=0; i<4; i++) { + struct resource *r = &d->resource[i]; + if ((r->start & ~0x80) == 0x374) { + r->start |= 2; + r->end = r->start; + } + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases); + +static void __devinit pci_fixup_ide_trash(struct pci_dev *d) +{ + int i; + + /* + * Runs the fixup only for the first IDE controller + * (Shai Fultheim - shai@ftcon.com) + */ + static int called = 0; + if (called) + return; + called = 1; + + /* + * There exist PCI IDE controllers which have utter garbage + * in first four base registers. Ignore that. 
+ */ + DBG("PCI: IDE base address trash cleared for %s\n", pci_name(d)); + for(i=0; i<4; i++) + d->resource[i].start = d->resource[i].end = d->resource[i].flags = 0; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5513, pci_fixup_ide_trash); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_10, pci_fixup_ide_trash); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_11, pci_fixup_ide_trash); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_9, pci_fixup_ide_trash); + +static void __devinit pci_fixup_latency(struct pci_dev *d) +{ + /* + * SiS 5597 and 5598 chipsets require latency timer set to + * at most 32 to avoid lockups. + */ + DBG("PCI: Setting max latency to 32\n"); + pcibios_max_latency = 32; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5597, pci_fixup_latency); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5598, pci_fixup_latency); + +static void __devinit pci_fixup_piix4_acpi(struct pci_dev *d) +{ + /* + * PIIX4 ACPI device: hardwired IRQ9 + */ + d->irq = 9; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, pci_fixup_piix4_acpi); + +/* + * Addresses issues with problems in the memory write queue timer in + * certain VIA Northbridges. This bugfix is per VIA's specifications, + * except for the KL133/KM133: clearing bit 5 on those Northbridges seems + * to trigger a bug in its integrated ProSavage video card, which + * causes screen corruption. We only clear bits 6 and 7 for that chipset, + * until VIA can provide us with definitive information on why screen + * corruption occurs, and what exactly those bits do. 
+ *
+ * VIA 8363,8622,8361 Northbridges:
+ *  - bits 5, 6, 7 at offset 0x55 need to be turned off
+ * VIA 8367 (KT266x) Northbridges:
+ *  - bits 5, 6, 7 at offset 0x95 need to be turned off
+ * VIA 8363 rev 0x81/0x84 (KL133/KM133) Northbridges:
+ *  - bits 6, 7 at offset 0x55 need to be turned off
+ *
+ * pci_fixup_via_northbridge_bug(d) picks the config-space offset
+ * (`where') and the AND-mask (`mask') from d->device and the PCI
+ * revision ID, reads the byte at that offset, and only when one of
+ * the bits to be cleared is set does it log a warning and write the
+ * masked value back.  For the 8367 it also writes 0 to
+ * PCI_LATENCY_TIMER before touching the queue register.
+ */
+
+#define VIA_8363_KL133_REVISION_ID 0x81
+#define VIA_8363_KM133_REVISION_ID 0x84
+
+static void __devinit pci_fixup_via_northbridge_bug(struct pci_dev *d)
+{
+	u8 v;			/* current value of the register at `where' */
+	u8 revision;		/* PCI_REVISION_ID of d, separates KL133/KM133 from other 8363s */
+	int where = 0x55;	/* register offset; switched to 0x95 for the 8367 below */
+	int mask = 0x1f; /* clear bits 5, 6, 7 by default */
+
+	pci_read_config_byte(d, PCI_REVISION_ID, &revision);
+
+	if (d->device == PCI_DEVICE_ID_VIA_8367_0) {
+		/* fix pci bus latency issues resulted by NB bios error
+		   it appears on bug free^Wreduced kt266x's bios forces
+		   NB latency to zero */
+		pci_write_config_byte(d, PCI_LATENCY_TIMER, 0);
+
+		where = 0x95; /* the memory write queue timer register is
+				different for the KT266x's: 0x95 not 0x55 */
+	} else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
+			(revision == VIA_8363_KL133_REVISION_ID ||
+			revision == VIA_8363_KM133_REVISION_ID)) {
+			mask = 0x3f; /* clear only bits 6 and 7; clearing bit 5
+					causes screen corruption on the KL133/KM133 */
+	}
+
+	pci_read_config_byte(d, where, &v);
+	if (v & ~mask) {	/* at least one bit that must be cleared is currently set */
+		printk(KERN_WARNING "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \
+			d->device, revision, where, v, mask, v & mask);
+		v &= mask;	/* keep only the bits the mask allows */
+		pci_write_config_byte(d, where, v);
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8363_0, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8622, pci_fixup_via_northbridge_bug);	/* default path: offset 0x55, mask 0x1f */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8361, pci_fixup_via_northbridge_bug);	/* default path: offset 0x55, mask 0x1f */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_via_northbridge_bug);
+
+/*
+ * For some reasons Intel decided that certain parts of their
+ * 815, 845 and some
other chipsets must look like PCI-to-PCI bridges + * while they are obviously not. The 82801 family (AA, AB, BAM/CAM, + * BA/CA/DB and E) PCI bridges are actually HUB-to-PCI ones, according + * to Intel terminology. These devices do forward all addresses from + * system to PCI bus no matter what are their window settings, so they are + * "transparent" (or subtractive decoding) from programmers point of view. + */ +static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && + (dev->device & 0xff00) == 0x2400) + dev->transparent = 1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge); + +/* + * Fixup for C1 Halt Disconnect problem on nForce2 systems. + * + * From information provided by "Allen Martin" <AMartin@nvidia.com>: + * + * A hang is caused when the CPU generates a very fast CONNECT/HALT cycle + * sequence. Workaround is to set the SYSTEM_IDLE_TIMEOUT to 80 ns. + * This allows the state-machine and timer to return to a proper state within + * 80 ns of the CONNECT and probe appearing together. Since the CPU will not + * issue another HALT within 80 ns of the initial HALT, the failure condition + * is avoided. + */ +static void __init pci_fixup_nforce2(struct pci_dev *dev) +{ + u32 val; + + /* + * Chip Old value New value + * C17 0x1F0FFF01 0x1F01FF01 + * C18D 0x9F0FFF01 0x9F01FF01 + * + * Northbridge chip version may be determined by + * reading the PCI revision ID (0xC1 or greater is C18D). 
+ */ + pci_read_config_dword(dev, 0x6c, &val); + + /* + * Apply fixup if needed, but don't touch disconnect state + */ + if ((val & 0x00FF0000) != 0x00010000) { + printk(KERN_WARNING "PCI: nForce2 C1 Halt Disconnect fixup\n"); + pci_write_config_dword(dev, 0x6c, (val & 0xFF00FFFF) | 0x00010000); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci_fixup_nforce2); + +/* Max PCI Express root ports */ +#define MAX_PCIEROOT 6 +static int quirk_aspm_offset[MAX_PCIEROOT << 3]; + +#define GET_INDEX(a, b) (((a - PCI_DEVICE_ID_INTEL_MCH_PA) << 3) + b) + +static int quirk_pcie_aspm_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value) +{ + return raw_pci_ops->read(0, bus->number, devfn, where, size, value); +} + +/* + * Replace the original pci bus ops for write with a new one that will filter + * the request to insure ASPM cannot be enabled. + */ +static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value) +{ + u8 offset; + + offset = quirk_aspm_offset[GET_INDEX(bus->self->device, devfn)]; + + if ((offset) && (where == offset)) + value = value & 0xfffffffc; + + return raw_pci_ops->write(0, bus->number, devfn, where, size, value); +} + +static struct pci_ops quirk_pcie_aspm_ops = { + .read = quirk_pcie_aspm_read, + .write = quirk_pcie_aspm_write, +}; + +/* + * Prevents PCI Express ASPM (Active State Power Management) being enabled. + * + * Save the register offset, where the ASPM control bits are located, + * for each PCI Express device that is in the device list of + * the root port in an array for fast indexing. Replace the bus ops + * with the modified one. + */ +static void pcie_rootport_aspm_quirk(struct pci_dev *pdev) +{ + int cap_base, i; + struct pci_bus *pbus; + struct pci_dev *dev; + + if ((pbus = pdev->subordinate) == NULL) + return; + + /* + * Check if the DID of pdev matches one of the six root ports. 
This + * check is needed in the case this function is called directly by the + * hot-plug driver. + */ + if ((pdev->device < PCI_DEVICE_ID_INTEL_MCH_PA) || + (pdev->device > PCI_DEVICE_ID_INTEL_MCH_PC1)) + return; + + if (list_empty(&pbus->devices)) { + /* + * If no device is attached to the root port at power-up or + * after hot-remove, the pbus->devices is empty and this code + * will set the offsets to zero and the bus ops to parent's bus + * ops, which is unmodified. + */ + for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i) + quirk_aspm_offset[i] = 0; + + pbus->ops = pbus->parent->ops; + } else { + /* + * If devices are attached to the root port at power-up or + * after hot-add, the code loops through the device list of + * each root port to save the register offsets and replace the + * bus ops. + */ + list_for_each_entry(dev, &pbus->devices, bus_list) { + /* There are 0 to 8 devices attached to this bus */ + cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP); + quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10; + } + pbus->ops = &quirk_pcie_aspm_ops; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk ); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk ); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk ); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk ); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk ); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk ); + +/* + * Fixup to mark boot BIOS video selected by BIOS before it changes + * + * From information provided by "Jon Smirl" <jonsmirl@gmail.com> + * + * The standard boot ROM sequence for an x86 machine uses the BIOS + * to select an initial video card for 
boot display. This boot video + * card will have it's BIOS copied to C0000 in system RAM. + * IORESOURCE_ROM_SHADOW is used to associate the boot video + * card with this copy. On laptops this copy has to be used since + * the main ROM may be compressed or combined with another image. + * See pci_map_rom() for use of this flag. IORESOURCE_ROM_SHADOW + * is marked here since the boot video device will be the only enabled + * video device at this point. + */ + +static void __devinit pci_fixup_video(struct pci_dev *pdev) +{ + struct pci_dev *bridge; + struct pci_bus *bus; + u16 config; + + if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + return; + + /* Is VGA routed to us? */ + bus = pdev->bus; + while (bus) { + bridge = bus->self; + if (bridge) { + pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, + &config); + if (!(config & PCI_BRIDGE_CTL_VGA)) + return; + } + bus = bus->parent; + } + pci_read_config_word(pdev, PCI_COMMAND, &config); + if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { + pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; + printk(KERN_DEBUG "Boot video device is %s\n", pci_name(pdev)); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); diff --git a/arch/i386/pci/i386.c b/arch/i386/pci/i386.c new file mode 100644 index 000000000000..c205ea7e233b --- /dev/null +++ b/arch/i386/pci/i386.c @@ -0,0 +1,304 @@ +/* + * Low-Level PCI Access for i386 machines + * + * Copyright 1993, 1994 Drew Eckhardt + * Visionary Computing + * (Unix and Linux consulting and custom programming) + * Drew@Colorado.EDU + * +1 (303) 786-7975 + * + * Drew's work was sponsored by: + * iX Multiuser Multitasking Magazine + * Hannover, Germany + * hm@ix.de + * + * Copyright 1997--2000 Martin Mares <mj@ucw.cz> + * + * For more information, please consult the following manuals (look at + * http://www.pcisig.com/ for how to get them): + * + * PCI BIOS Specification + * PCI Local Bus Specification + * PCI to PCI Bridge Specification + * PCI 
System Design Guide + * + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/errno.h> + +#include "pci.h" + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + * + * Why? Because some silly external IO cards only decode + * the low 10 bits of the IO address. The 0x00-0xff region + * is reserved for motherboard devices that decode all 16 + * bits, so it's ok to allocate at, say, 0x2800-0x28ff, + * but we want to try to avoid allocating at 0x2900-0x2bff + * which might have be mirrored at 0x0100-0x03ff.. + */ +void +pcibios_align_resource(void *data, struct resource *res, + unsigned long size, unsigned long align) +{ + if (res->flags & IORESOURCE_IO) { + unsigned long start = res->start; + + if (start & 0x300) { + start = (start + 0x3ff) & ~0x3ff; + res->start = start; + } + } +} + + +/* + * Handle resources of PCI devices. If the world were perfect, we could + * just allocate all the resource regions and do nothing more. It isn't. + * On the other hand, we cannot just re-allocate all devices, as it would + * require us to know lots of host bridge internals. So we attempt to + * keep as much of the original configuration as possible, but tweak it + * when it's found to be wrong. + * + * Known BIOS problems we have to work around: + * - I/O or memory regions not configured + * - regions configured, but not enabled in the command register + * - bogus I/O addresses above 64K used + * - expansion ROMs left enabled (this may sound harmless, but given + * the fact the PCI specs explicitly allow address decoders to be + * shared between expansion ROMs and other resource regions, it's + * at least dangerous) + * + * Our solution: + * (1) Allocate resources for all buses behind PCI-to-PCI bridges. 
+ * This gives us fixed barriers on where we can allocate. + * (2) Allocate resources for all enabled devices. If there is + * a collision, just mark the resource as unallocated. Also + * disable expansion ROMs during this step. + * (3) Try to allocate resources for disabled devices. If the + * resources were assigned correctly, everything goes well, + * if they weren't, they won't disturb allocation of other + * resources. + * (4) Assign new addresses to resources which were either + * not configured at all or misconfigured. If explicitly + * requested by the user, configure expansion ROM address + * as well. + */ + +static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) +{ + struct pci_bus *bus; + struct pci_dev *dev; + int idx; + struct resource *r, *pr; + + /* Depth-First Search on bus tree */ + list_for_each_entry(bus, bus_list, node) { + if ((dev = bus->self)) { + for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { + r = &dev->resource[idx]; + if (!r->start) + continue; + pr = pci_find_parent_resource(dev, r); + if (!pr || request_resource(pr, r) < 0) + printk(KERN_ERR "PCI: Cannot allocate resource region %d of bridge %s\n", idx, pci_name(dev)); + } + } + pcibios_allocate_bus_resources(&bus->children); + } +} + +static void __init pcibios_allocate_resources(int pass) +{ + struct pci_dev *dev = NULL; + int idx, disabled; + u16 command; + struct resource *r, *pr; + + for_each_pci_dev(dev) { + pci_read_config_word(dev, PCI_COMMAND, &command); + for(idx = 0; idx < 6; idx++) { + r = &dev->resource[idx]; + if (r->parent) /* Already allocated */ + continue; + if (!r->start) /* Address not assigned at all */ + continue; + if (r->flags & IORESOURCE_IO) + disabled = !(command & PCI_COMMAND_IO); + else + disabled = !(command & PCI_COMMAND_MEMORY); + if (pass == disabled) { + DBG("PCI: Resource %08lx-%08lx (f=%lx, d=%d, p=%d)\n", + r->start, r->end, r->flags, disabled, pass); + pr = pci_find_parent_resource(dev, r); + if (!pr || 
request_resource(pr, r) < 0) { + printk(KERN_ERR "PCI: Cannot allocate resource region %d of device %s\n", idx, pci_name(dev)); + /* We'll assign a new address later */ + r->end -= r->start; + r->start = 0; + } + } + } + if (!pass) { + r = &dev->resource[PCI_ROM_RESOURCE]; + if (r->flags & IORESOURCE_ROM_ENABLE) { + /* Turn the ROM off, leave the resource region, but keep it unregistered. */ + u32 reg; + DBG("PCI: Switching off ROM of %s\n", pci_name(dev)); + r->flags &= ~IORESOURCE_ROM_ENABLE; + pci_read_config_dword(dev, dev->rom_base_reg, ®); + pci_write_config_dword(dev, dev->rom_base_reg, reg & ~PCI_ROM_ADDRESS_ENABLE); + } + } + } +} + +static int __init pcibios_assign_resources(void) +{ + struct pci_dev *dev = NULL; + int idx; + struct resource *r; + + for_each_pci_dev(dev) { + int class = dev->class >> 8; + + /* Don't touch classless devices and host bridges */ + if (!class || class == PCI_CLASS_BRIDGE_HOST) + continue; + + for(idx=0; idx<6; idx++) { + r = &dev->resource[idx]; + + /* + * Don't touch IDE controllers and I/O ports of video cards! + */ + if ((class == PCI_CLASS_STORAGE_IDE && idx < 4) || + (class == PCI_CLASS_DISPLAY_VGA && (r->flags & IORESOURCE_IO))) + continue; + + /* + * We shall assign a new address to this resource, either because + * the BIOS forgot to do so or because we have decided the old + * address was unusable for some reason. 
+ */ + if (!r->start && r->end) + pci_assign_resource(dev, idx); + } + + if (pci_probe & PCI_ASSIGN_ROMS) { + r = &dev->resource[PCI_ROM_RESOURCE]; + r->end -= r->start; + r->start = 0; + if (r->end) + pci_assign_resource(dev, PCI_ROM_RESOURCE); + } + } + return 0; +} + +void __init pcibios_resource_survey(void) +{ + DBG("PCI: Allocating resources\n"); + pcibios_allocate_bus_resources(&pci_root_buses); + pcibios_allocate_resources(0); + pcibios_allocate_resources(1); +} + +/** + * called in fs_initcall (one below subsys_initcall), + * give a chance for motherboard reserve resources + */ +fs_initcall(pcibios_assign_resources); + +int pcibios_enable_resources(struct pci_dev *dev, int mask) +{ + u16 cmd, old_cmd; + int idx; + struct resource *r; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + old_cmd = cmd; + for(idx=0; idx<6; idx++) { + /* Only set up the requested stuff */ + if (!(mask & (1<<idx))) + continue; + + r = &dev->resource[idx]; + if (!r->start && r->end) { + printk(KERN_ERR "PCI: Device %s not available because of resource collisions\n", pci_name(dev)); + return -EINVAL; + } + if (r->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (r->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + if (dev->resource[PCI_ROM_RESOURCE].start) + cmd |= PCI_COMMAND_MEMORY; + if (cmd != old_cmd) { + printk("PCI: Enabling device %s (%04x -> %04x)\n", pci_name(dev), old_cmd, cmd); + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + return 0; +} + +/* + * If we set up a device for bus mastering, we need to check the latency + * timer as certain crappy BIOSes forget to set it properly. + */ +unsigned int pcibios_max_latency = 255; + +void pcibios_set_master(struct pci_dev *dev) +{ + u8 lat; + pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); + if (lat < 16) + lat = (64 <= pcibios_max_latency) ? 
64 : pcibios_max_latency; + else if (lat > pcibios_max_latency) + lat = pcibios_max_latency; + else + return; + printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", pci_name(dev), lat); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); +} + +int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) +{ + unsigned long prot; + + /* I/O space cannot be accessed via normal processor loads and + * stores on this platform. + */ + if (mmap_state == pci_mmap_io) + return -EINVAL; + + /* Leave vm_pgoff as-is, the PCI space address is the physical + * address on this platform. + */ + vma->vm_flags |= (VM_SHM | VM_LOCKED | VM_IO); + + prot = pgprot_val(vma->vm_page_prot); + if (boot_cpu_data.x86 > 3) + prot |= _PAGE_PCD | _PAGE_PWT; + vma->vm_page_prot = __pgprot(prot); + + /* Write-combine setting is ignored, it is changed via the mtrr + * interfaces on this platform. + */ + if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c new file mode 100644 index 000000000000..1128451b5d74 --- /dev/null +++ b/arch/i386/pci/irq.c @@ -0,0 +1,1119 @@ +/* + * Low-Level PCI Support for PC -- Routing of Interrupts + * + * (c) 1999--2000 Martin Mares <mj@ucw.cz> + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/dmi.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/io_apic.h> +#include <asm/hw_irq.h> +#include <linux/acpi.h> + +#include "pci.h" + +#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) +#define PIRQ_VERSION 0x0100 + +static int broken_hp_bios_irq9; +static int acer_tm360_irqrouting; + +static struct irq_routing_table *pirq_table; + 
+static int pirq_enable_irq(struct pci_dev *dev); + +/* + * Never use: 0, 1, 2 (timer, keyboard, and cascade) + * Avoid using: 13, 14 and 15 (FP error and IDE). + * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse) + */ +unsigned int pcibios_irq_mask = 0xfff8; + +static int pirq_penalty[16] = { + 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000, + 0, 0, 0, 0, 1000, 100000, 100000, 100000 +}; + +struct irq_router { + char *name; + u16 vendor, device; + int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); +}; + +struct irq_router_handler { + u16 vendor; + int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); +}; + +int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; + +/* + * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. + */ + +static struct irq_routing_table * __init pirq_find_routing_table(void) +{ + u8 *addr; + struct irq_routing_table *rt; + int i; + u8 sum; + + for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { + rt = (struct irq_routing_table *) addr; + if (rt->signature != PIRQ_SIGNATURE || + rt->version != PIRQ_VERSION || + rt->size % 16 || + rt->size < sizeof(struct irq_routing_table)) + continue; + sum = 0; + for(i=0; i<rt->size; i++) + sum += addr[i]; + if (!sum) { + DBG("PCI: Interrupt Routing Table found at 0x%p\n", rt); + return rt; + } + } + return NULL; +} + +/* + * If we have a IRQ routing table, use it to search for peer host + * bridges. It's a gross hack, but since there are no other known + * ways how to get a list of buses, we have to go this way. 
+ */ + +static void __init pirq_peer_trick(void) +{ + struct irq_routing_table *rt = pirq_table; + u8 busmap[256]; + int i; + struct irq_info *e; + + memset(busmap, 0, sizeof(busmap)); + for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { + e = &rt->slots[i]; +#ifdef DEBUG + { + int j; + DBG("%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); + for(j=0; j<4; j++) + DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); + DBG("\n"); + } +#endif + busmap[e->bus] = 1; + } + for(i = 1; i < 256; i++) { + if (!busmap[i] || pci_find_bus(0, i)) + continue; + if (pci_scan_bus(i, &pci_root_ops, NULL)) + printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i); + } + pcibios_last_bus = -1; +} + +/* + * Code for querying and setting of IRQ routes on various interrupt routers. + */ + +void eisa_set_level_irq(unsigned int irq) +{ + unsigned char mask = 1 << (irq & 7); + unsigned int port = 0x4d0 + (irq >> 3); + unsigned char val; + static u16 eisa_irq_mask; + + if (irq >= 16 || (1 << irq) & eisa_irq_mask) + return; + + eisa_irq_mask |= (1 << irq); + printk("PCI: setting IRQ %u as level-triggered\n", irq); + val = inb(port); + if (!(val & mask)) { + DBG(" -> edge"); + outb(val | mask, port); + } +} + +/* + * Common IRQ routing practice: nybbles in config space, + * offset by some magic constant. + */ +static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) +{ + u8 x; + unsigned reg = offset + (nr >> 1); + + pci_read_config_byte(router, reg, &x); + return (nr & 1) ? (x >> 4) : (x & 0xf); +} + +static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) +{ + u8 x; + unsigned reg = offset + (nr >> 1); + + pci_read_config_byte(router, reg, &x); + x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val); + pci_write_config_byte(router, reg, x); +} + +/* + * ALI pirq entries are damn ugly, and completely undocumented. 
+ * This has been figured out from pirq tables, and it's not a pretty + * picture. + */ +static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; + + return irqmap[read_config_nybble(router, 0x48, pirq-1)]; +} + +static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; + unsigned int val = irqmap[irq]; + + if (val) { + write_config_nybble(router, 0x48, pirq-1, val); + return 1; + } + return 0; +} + +/* + * The Intel PIIX4 pirq rules are fairly simple: "pirq" is + * just a pointer to the config space. + */ +static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 x; + + pci_read_config_byte(router, pirq, &x); + return (x < 16) ? x : 0; +} + +static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + pci_write_config_byte(router, pirq, irq); + return 1; +} + +/* + * The VIA pirq rules are nibble-based, like ALI, + * but without the ugly irq number munging. + * However, PIRQD is in the upper instead of lower 4 bits. + */ +static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq); +} + +static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0x55, pirq == 4 ? 
5 : pirq, irq); + return 1; +} + +/* + * ITE 8330G pirq rules are nibble-based + * FIXME: pirqmap may be { 1, 0, 3, 2 }, + * 2+3 are both mapped to irq 9 on my system + */ +static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + static unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + return read_config_nybble(router,0x43, pirqmap[pirq-1]); +} + +static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + static unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); + return 1; +} + +/* + * OPTI: high four bits are nibble pointer.. + * I wonder what the low bits do? + */ +static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0xb8, pirq >> 4); +} + +static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0xb8, pirq >> 4, irq); + return 1; +} + +/* + * Cyrix: nibble offset 0x5C + * 0x5C bits 7:4 is INTB bits 3:0 is INTA + * 0x5D bits 7:4 is INTD bits 3:0 is INTC + */ +static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + return read_config_nybble(router, 0x5C, (pirq-1)^1); +} + +static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + write_config_nybble(router, 0x5C, (pirq-1)^1, irq); + return 1; +} + +/* + * PIRQ routing for SiS 85C503 router used in several SiS chipsets. + * We have to deal with the following issues here: + * - vendors have different ideas about the meaning of link values + * - some onboard devices (integrated in the chipset) have special + * links and are thus routed differently (i.e. 
not via PCI INTA-INTD) + * - different revision of the router have a different layout for + * the routing registers, particularly for the onchip devices + * + * For all routing registers the common thing is we have one byte + * per routeable link which is defined as: + * bit 7 IRQ mapping enabled (0) or disabled (1) + * bits [6:4] reserved (sometimes used for onchip devices) + * bits [3:0] IRQ to map to + * allowed: 3-7, 9-12, 14-15 + * reserved: 0, 1, 2, 8, 13 + * + * The config-space registers located at 0x41/0x42/0x43/0x44 are + * always used to route the normal PCI INT A/B/C/D respectively. + * Apparently there are systems implementing PCI routing table using + * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. + * We try our best to handle both link mappings. + * + * Currently (2003-05-21) it appears most SiS chipsets follow the + * definition of routing registers from the SiS-5595 southbridge. + * According to the SiS 5595 datasheets the revision id's of the + * router (ISA-bridge) should be 0x01 or 0xb0. + * + * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1. + * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets. + * They seem to work with the current routing code. However there is + * some concern because of the two USB-OHCI HCs (original SiS 5595 + * had only one). YMMV. + * + * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1: + * + * 0x61: IDEIRQ: + * bits [6:5] must be written 01 + * bit 4 channel-select primary (0), secondary (1) + * + * 0x62: USBIRQ: + * bit 6 OHCI function disabled (0), enabled (1) + * + * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved + * + * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved + * + * We support USBIRQ (in addition to INTA-INTD) and keep the + * IDE, ACPI and DAQ routing untouched as set by the BIOS. + * + * Currently the only reported exception is the new SiS 65x chipset + * which includes the SiS 69x southbridge. 
Here we have the 85C503 + * router revision 0x04 and there are changes in the register layout + * mostly related to the different USB HCs with USB 2.0 support. + * + * Onchip routing for router rev-id 0x04 (try-and-error observation) + * + * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs + * bit 6-4 are probably unused, not like 5595 + */ + +#define PIRQ_SIS_IRQ_MASK 0x0f +#define PIRQ_SIS_IRQ_DISABLE 0x80 +#define PIRQ_SIS_USB_ENABLE 0x40 + +static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + u8 x; + int reg; + + reg = pirq; + if (reg >= 0x01 && reg <= 0x04) + reg += 0x40; + pci_read_config_byte(router, reg, &x); + return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK); +} + +static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + u8 x; + int reg; + + reg = pirq; + if (reg >= 0x01 && reg <= 0x04) + reg += 0x40; + pci_read_config_byte(router, reg, &x); + x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE); + x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE; + pci_write_config_byte(router, reg, x); + return 1; +} + + +/* + * VLSI: nibble offset 0x74 - educated guess due to routing table and + * config space of VLSI 82C534 PCI-bridge/router (1004:0102) + * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard + * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6 + * for the busbridge to the docking station. 
+ */ + +static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + if (pirq > 8) { + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + return 0; + } + return read_config_nybble(router, 0x74, pirq-1); +} + +static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + if (pirq > 8) { + printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + return 0; + } + write_config_nybble(router, 0x74, pirq-1, irq); + return 1; +} + +/* + * ServerWorks: PCI interrupts mapped to system IRQ lines through Index + * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register + * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect + * register is a straight binary coding of desired PIC IRQ (low nibble). + * + * The 'link' value in the PIRQ table is already in the correct format + * for the Index register. There are some special index values: + * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1, + * and 0x03 for SMBus. + */ +static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + outb_p(pirq, 0xc00); + return inb(0xc01) & 0xf; +} + +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +{ + outb_p(pirq, 0xc00); + outb_p(irq, 0xc01); + return 1; +} + +/* Support for AMD756 PCI IRQ Routing + * Jhon H. Caicedo <jhcaiced@osso.org.co> + * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... 
(jhcaiced)
+ * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
+ * The AMD756 pirq rules are nibble-based
+ * offset 0x56 0-3 PIRQA 4-7 PIRQB
+ * offset 0x57 0-3 PIRQC 4-7 PIRQD
+ *
+ * pirq_amd756_get() calls read_config_nybble(router, 0x56, pirq - 1)
+ * whenever pirq <= 4; for larger link values nothing is read and IRQ 0
+ * is returned. The result is logged in either case.
+ */
+static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+	u8 irq;
+	irq = 0;	/* value returned when pirq > 4 */
+	if (pirq <= 4)
+	{
+		irq = read_config_nybble(router, 0x56, pirq - 1);
+	}
+	printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
+		dev->vendor, dev->device, pirq, irq);
+	return irq;
+}
+/*
+ * Counterpart of pirq_amd756_get(): logs the request, writes the nybble
+ * only when pirq <= 4, and returns 1 in every case.
+ */
+static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+	printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
+		dev->vendor, dev->device, pirq, irq);
+	if (pirq <= 4)
+	{
+		write_config_nybble(router, 0x56, pirq - 1, irq);
+	}
+	return 1;	/* returned even when pirq > 4 and nothing was written */
+}
+
+#ifdef CONFIG_PCI_BIOS
+/*
+ * "set" hook that goes through the PCI BIOS: the interrupt pin and bridge
+ * come from pci_get_interrupt_pin() and are handed, together with irq, to
+ * pcibios_set_irq_routing(). The router and pirq arguments are not used.
+ */
+static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+	struct pci_dev *bridge;
+	int pin = pci_get_interrupt_pin(dev, &bridge);
+	return pcibios_set_irq_routing(bridge, pin, irq);
+}
+
+#endif
+/*
+ * Per-vendor probe helpers. Each one compares the device ID it is given
+ * against the IDs it knows; on a match it fills in r->name, r->get and
+ * r->set and returns 1, otherwise it returns 0 without touching *r.
+ */
+static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	static struct pci_device_id pirq_440gx[] = {
+		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
+		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
+		{ },
+	};
+
+	/* 440GX has a proprietary PIRQ router -- don't use it */
+	if (pci_dev_present(pirq_440gx))
+		return 0;
+
+	switch(device)
+	{
+		case PCI_DEVICE_ID_INTEL_82371FB_0:
+		case PCI_DEVICE_ID_INTEL_82371SB_0:
+		case PCI_DEVICE_ID_INTEL_82371AB_0:
+		case PCI_DEVICE_ID_INTEL_82371MX:
+		case PCI_DEVICE_ID_INTEL_82443MX_0:
+		case PCI_DEVICE_ID_INTEL_82801AA_0:
+		case PCI_DEVICE_ID_INTEL_82801AB_0:
+		case PCI_DEVICE_ID_INTEL_82801BA_0:
+		case PCI_DEVICE_ID_INTEL_82801BA_10:
+		case PCI_DEVICE_ID_INTEL_82801CA_0:
+		case PCI_DEVICE_ID_INTEL_82801CA_12:
+		case PCI_DEVICE_ID_INTEL_82801DB_0:
+		case PCI_DEVICE_ID_INTEL_82801E_0:
+		case PCI_DEVICE_ID_INTEL_82801EB_0:
+		case PCI_DEVICE_ID_INTEL_ESB_1:
+		case PCI_DEVICE_ID_INTEL_ICH6_0:
+		case PCI_DEVICE_ID_INTEL_ICH6_1:
+		case PCI_DEVICE_ID_INTEL_ICH7_0:
+		case PCI_DEVICE_ID_INTEL_ICH7_1:
+			r->name = "PIIX/ICH";
+			r->get = pirq_piix_get;
+			r->set = pirq_piix_set;
+			return 1;
+	}
+	return 0;
+}
+
+static __init int via_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	/* FIXME: We should move some of the quirk fixup stuff here */
+	switch(device)
+	{
+		case PCI_DEVICE_ID_VIA_82C586_0:
+		case PCI_DEVICE_ID_VIA_82C596:
+		case PCI_DEVICE_ID_VIA_82C686:
+		case PCI_DEVICE_ID_VIA_8231:
+		/* FIXME: add new ones for 8233/5 */
+			r->name = "VIA";
+			r->get = pirq_via_get;
+			r->set = pirq_via_set;
+			return 1;
+	}
+	return 0;
+}
+
+static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	switch(device)
+	{
+		case PCI_DEVICE_ID_VLSI_82C534:
+			r->name = "VLSI 82C534";
+			r->get = pirq_vlsi_get;
+			r->set = pirq_vlsi_set;
+			return 1;
+	}
+	return 0;
+}
+
+
+static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	switch(device)
+	{
+		case PCI_DEVICE_ID_SERVERWORKS_OSB4:
+		case PCI_DEVICE_ID_SERVERWORKS_CSB5:
+			r->name = "ServerWorks";
+			r->get = pirq_serverworks_get;
+			r->set = pirq_serverworks_set;
+			return 1;
+	}
+	return 0;
+}
+
+static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	if (device != PCI_DEVICE_ID_SI_503)
+		return 0;
+
+	r->name = "SIS";
+	r->get = pirq_sis_get;
+	r->set = pirq_sis_set;
+	return 1;
+}
+
+static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	switch(device)
+	{
+		case PCI_DEVICE_ID_CYRIX_5520:
+			r->name = "NatSemi";
+			r->get = pirq_cyrix_get;
+			r->set = pirq_cyrix_set;
+			return 1;
+	}
+	return 0;
+}
+
+static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	switch(device)
+	{
+		case PCI_DEVICE_ID_OPTI_82C700:
+			r->name = "OPTI";
+			r->get = pirq_opti_get;
+			r->set = pirq_opti_set;
+			return 1;
+	}
+	return 0;
+}
+
+static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	switch(device)
+	{
+		case PCI_DEVICE_ID_ITE_IT8330G_0:
+			r->name = "ITE";
+			r->get = pirq_ite_get;
+			r->set = pirq_ite_set;
+			return 1;
+	}
+	return 0;
+}
+
+static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	switch(device)
+	{
+	case PCI_DEVICE_ID_AL_M1533:
+	case PCI_DEVICE_ID_AL_M1563:
+		printk("PCI: Using ALI IRQ Router\n");
+		r->name = "ALI";
+		r->get = pirq_ali_get;
+		r->set = pirq_ali_set;
+		return 1;
+	}
+	return 0;
+}
+
+static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+	switch(device)
+	{
+		case PCI_DEVICE_ID_AMD_VIPER_740B:
+			r->name = "AMD756";
+			break;
+		case PCI_DEVICE_ID_AMD_VIPER_7413:
+			r->name = "AMD766";
+			break;
+		case PCI_DEVICE_ID_AMD_VIPER_7443:
+			r->name = "AMD768";
+			break;
+		default:
+			return 0;
+	}
+	/* all three AMD parts share the AMD756 accessors */
+	r->get = pirq_amd756_get;
+	r->set = pirq_amd756_set;
+	return 1;
+}
+
+static __initdata struct irq_router_handler pirq_routers[] = {
+	{ PCI_VENDOR_ID_INTEL, intel_router_probe },
+	{ PCI_VENDOR_ID_AL, ali_router_probe },
+	{ PCI_VENDOR_ID_ITE, ite_router_probe },
+	{ PCI_VENDOR_ID_VIA, via_router_probe },
+	{ PCI_VENDOR_ID_OPTI, opti_router_probe },
+	{ PCI_VENDOR_ID_SI, sis_router_probe },
+	{ PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
+	{ PCI_VENDOR_ID_VLSI, vlsi_router_probe },
+	{ PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
+	{ PCI_VENDOR_ID_AMD, amd_router_probe },
+	/* Someone with docs needs to add the ATI Radeon IGP */
+	{ 0, NULL }	/* terminator: pirq_find_router() stops at vendor == 0 */
+};
+static struct irq_router pirq_router;
+static struct pci_dev *pirq_router_dev;
+
+
+/*
+ * FIXME: should we have an option to say "generic for
+ * chipset" ?
+ */ + +static void __init pirq_find_router(struct irq_router *r) +{ + struct irq_routing_table *rt = pirq_table; + struct irq_router_handler *h; + +#ifdef CONFIG_PCI_BIOS + if (!rt->signature) { + printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n"); + r->set = pirq_bios_set; + r->name = "BIOS"; + return; + } +#endif + + /* Default unless a driver reloads it */ + r->name = "default"; + r->get = NULL; + r->set = NULL; + + DBG("PCI: Attempting to find IRQ router for %04x:%04x\n", + rt->rtr_vendor, rt->rtr_device); + + pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); + if (!pirq_router_dev) { + DBG("PCI: Interrupt router not found at %02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); + return; + } + + for( h = pirq_routers; h->vendor; h++) { + /* First look for a router match */ + if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) + break; + /* Fall back to a device match */ + if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) + break; + } + printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", + pirq_router.name, + pirq_router_dev->vendor, + pirq_router_dev->device, + pci_name(pirq_router_dev)); +} + +static struct irq_info *pirq_get_info(struct pci_dev *dev) +{ + struct irq_routing_table *rt = pirq_table; + int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); + struct irq_info *info; + + for (info = rt->slots; entries--; info++) + if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) + return info; + return NULL; +} + +static int pcibios_lookup_irq(struct pci_dev *dev, int assign) +{ + u8 pin; + struct irq_info *info; + int i, pirq, newirq; + int irq = 0; + u32 mask; + struct irq_router *r = &pirq_router; + struct pci_dev *dev2 = NULL; + char *msg = NULL; + + /* Find IRQ pin */ + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (!pin) { + DBG(" -> no interrupt pin\n"); + return 0; + } + pin = pin - 1; 
+ + /* Find IRQ routing entry */ + + if (!pirq_table) + return 0; + + DBG("IRQ for %s[%c]", pci_name(dev), 'A' + pin); + info = pirq_get_info(dev); + if (!info) { + DBG(" -> not found in routing table\n"); + return 0; + } + pirq = info->irq[pin].link; + mask = info->irq[pin].bitmap; + if (!pirq) { + DBG(" -> not routed\n"); + return 0; + } + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); + mask &= pcibios_irq_mask; + + /* Work around broken HP Pavilion Notebooks which assign USB to + IRQ 9 even though it is actually wired to IRQ 11 */ + + if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) { + dev->irq = 11; + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11); + r->set(pirq_router_dev, dev, pirq, 11); + } + + /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ + if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { + pirq = 0x68; + mask = 0x400; + dev->irq = r->get(pirq_router_dev, dev, pirq); + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); + } + + /* + * Find the best IRQ to assign: use the one + * reported by the device if possible. 
+ */ + newirq = dev->irq; + if (!((1 << newirq) & mask)) { + if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; + else printk(KERN_WARNING "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n", newirq, pci_name(dev)); + } + if (!newirq && assign) { + for (i = 0; i < 16; i++) { + if (!(mask & (1 << i))) + continue; + if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ)) + newirq = i; + } + } + DBG(" -> newirq=%d", newirq); + + /* Check if it is hardcoded */ + if ((pirq & 0xf0) == 0xf0) { + irq = pirq & 0xf; + DBG(" -> hardcoded IRQ %d\n", irq); + msg = "Hardcoded"; + } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { + DBG(" -> got IRQ %d\n", irq); + msg = "Found"; + } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { + DBG(" -> assigning IRQ %d", newirq); + if (r->set(pirq_router_dev, dev, pirq, newirq)) { + eisa_set_level_irq(newirq); + DBG(" ... OK\n"); + msg = "Assigned"; + irq = newirq; + } + } + + if (!irq) { + DBG(" ... failed\n"); + if (newirq && mask == (1 << newirq)) { + msg = "Guessed"; + irq = newirq; + } else + return 0; + } + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); + + /* Update IRQ for all devices with the same pirq value */ + while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { + pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); + if (!pin) + continue; + pin--; + info = pirq_get_info(dev2); + if (!info) + continue; + if (info->irq[pin].link == pirq) { + /* We refuse to override the dev->irq information. Give a warning! 
*/ + if ( dev2->irq && dev2->irq != irq && \ + (!(pci_probe & PCI_USE_PIRQ_MASK) || \ + ((1 << dev2->irq) & mask)) ) { +#ifndef CONFIG_PCI_MSI + printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", + pci_name(dev2), dev2->irq, irq); +#endif + continue; + } + dev2->irq = irq; + pirq_penalty[irq]++; + if (dev != dev2) + printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); + } + } + return 1; +} + +static void __init pcibios_fixup_irqs(void) +{ + struct pci_dev *dev = NULL; + u8 pin; + + DBG("PCI: IRQ fixup\n"); + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + /* + * If the BIOS has set an out of range IRQ number, just ignore it. + * Also keep track of which IRQ's are already in use. + */ + if (dev->irq >= 16) { + DBG("%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); + dev->irq = 0; + } + /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ + if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) + pirq_penalty[dev->irq] = 0; + pirq_penalty[dev->irq]++; + } + + dev = NULL; + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); +#ifdef CONFIG_X86_IO_APIC + /* + * Recalculate IRQ numbers if we use the I/O APIC. + */ + if (io_apic_assign_pci_irqs) + { + int irq; + + if (pin) { + pin--; /* interrupt pins are numbered starting from 1 */ + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the MP-table. + * In this case we have to look up the IRQ based on the parent bus, + * parent slot, and pin number. The SMP code detects such bridged + * busses itself so we should get into this branch reliably. 
+ */ + if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ + struct pci_dev * bridge = dev->bus->self; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), 'A' + pin, irq); + } + if (irq >= 0) { + if (use_pci_vector() && + !platform_legacy_irq(irq)) + irq = IO_APIC_VECTOR(irq); + + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + pci_name(dev), 'A' + pin, irq); + dev->irq = irq; + } + } + } +#endif + /* + * Still no IRQ? Try to lookup one... + */ + if (pin && !dev->irq) + pcibios_lookup_irq(dev, 0); + } +} + +/* + * Work around broken HP Pavilion Notebooks which assign USB to + * IRQ 9 even though it is actually wired to IRQ 11 + */ +static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d) +{ + if (!broken_hp_bios_irq9) { + broken_hp_bios_irq9 = 1; + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + } + return 0; +} + +/* + * Work around broken Acer TravelMate 360 Notebooks which assign + * Cardbus to IRQ 11 even though it is actually wired to IRQ 10 + */ +static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d) +{ + if (!acer_tm360_irqrouting) { + acer_tm360_irqrouting = 1; + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata pciirq_dmi_table[] = { + { + .callback = fix_broken_hp_bios_irq9, + .ident = "HP Pavilion N5400 Series Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), + DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), + DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), + }, + }, + { + .callback = fix_acer_tm360_irqrouting, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, 
"TravelMate 360"), + }, + }, + { } +}; + +static int __init pcibios_irq_init(void) +{ + DBG("PCI: IRQ init\n"); + + if (pcibios_enable_irq || raw_pci_ops == NULL) + return 0; + + dmi_check_system(pciirq_dmi_table); + + pirq_table = pirq_find_routing_table(); + +#ifdef CONFIG_PCI_BIOS + if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) + pirq_table = pcibios_get_irq_routing_table(); +#endif + if (pirq_table) { + pirq_peer_trick(); + pirq_find_router(&pirq_router); + if (pirq_table->exclusive_irqs) { + int i; + for (i=0; i<16; i++) + if (!(pirq_table->exclusive_irqs & (1 << i))) + pirq_penalty[i] += 100; + } + /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ + if (io_apic_assign_pci_irqs) + pirq_table = NULL; + } + + pcibios_enable_irq = pirq_enable_irq; + + pcibios_fixup_irqs(); + return 0; +} + +subsys_initcall(pcibios_irq_init); + + +static void pirq_penalize_isa_irq(int irq) +{ + /* + * If any ISAPnP device reports an IRQ in its list of possible + * IRQ's, we try to avoid assigning it to PCI devices. + */ + if (irq < 16) + pirq_penalty[irq] += 100; +} + +void pcibios_penalize_isa_irq(int irq) +{ +#ifdef CONFIG_ACPI_PCI + if (!acpi_noirq) + acpi_penalize_isa_irq(irq); + else +#endif + pirq_penalize_isa_irq(irq); +} + +static int pirq_enable_irq(struct pci_dev *dev) +{ + u8 pin; + extern int via_interrupt_line_quirk; + struct pci_dev *temp_dev; + + pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { + char *msg = ""; + + pin--; /* interrupt pins are numbered starting from 1 */ + + if (io_apic_assign_pci_irqs) { + int irq; + + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the MP-table. + * In this case we have to look up the IRQ based on the parent bus, + * parent slot, and pin number. The SMP code detects such bridged + * busses itself so we should get into this branch reliably. 
+ */ + temp_dev = dev; + while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ + struct pci_dev * bridge = dev->bus->self; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), 'A' + pin, irq); + dev = bridge; + } + dev = temp_dev; + if (irq >= 0) { +#ifdef CONFIG_PCI_MSI + if (!platform_legacy_irq(irq)) + irq = IO_APIC_VECTOR(irq); +#endif + printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + pci_name(dev), 'A' + pin, irq); + dev->irq = irq; + return 0; + } else + msg = " Probably buggy MP table."; + } else if (pci_probe & PCI_BIOS_IRQ_SCAN) + msg = ""; + else + msg = " Please try using pci=biosirq."; + + /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) + return 0; + + printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", + 'A' + pin, pci_name(dev), msg); + } + /* VIA bridges use interrupt line for apic/pci steering across + the V-Link */ + else if (via_interrupt_line_quirk) + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq & 15); + return 0; +} + +int pci_vector_resources(int last, int nr_released) +{ + int count = nr_released; + + int next = last; + int offset = (last % 8); + + while (next < FIRST_SYSTEM_VECTOR) { + next += 8; +#ifdef CONFIG_X86_64 + if (next == IA32_SYSCALL_VECTOR) + continue; +#else + if (next == SYSCALL_VECTOR) + continue; +#endif + count++; + if (next >= FIRST_SYSTEM_VECTOR) { + if (offset%8) { + next = FIRST_DEVICE_VECTOR + offset; + offset++; + continue; + } + count--; + } + } + + return count; +} diff --git a/arch/i386/pci/legacy.c b/arch/i386/pci/legacy.c new file mode 100644 index 000000000000..1492e3753869 --- /dev/null +++ b/arch/i386/pci/legacy.c @@ -0,0 +1,54 @@ +/* + * legacy.c - traditional, old school 
PCI bus probing
+ */
+#include <linux/init.h>
+#include <linux/pci.h>
+#include "pci.h"
+
+/*
+ * Discover remaining PCI buses in case there are peer host bridges.
+ * We use the number of last PCI bus provided by the PCI BIOS.
+ *
+ * Nothing is done unless pcibios_last_bus lies in 1..0xfe. Every bus
+ * number up to pcibios_last_bus that pci_find_bus() does not know yet is
+ * probed by reading the vendor ID at each devfn that is a multiple of 8;
+ * the first value that is neither 0x0000 nor 0xffff makes the bus a peer
+ * root bus and gets it scanned.
+ */
+static void __devinit pcibios_fixup_peer_bridges(void)
+{
+	int n, devfn;
+
+	if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff)
+		return;
+	DBG("PCI: Peer bridge fixup\n");
+
+	for (n=0; n <= pcibios_last_bus; n++) {
+		u32 l;
+		if (pci_find_bus(0, n))
+			continue;	/* bus already known */
+		for (devfn = 0; devfn < 256; devfn += 8) {
+			if (!raw_pci_ops->read(0, n, devfn, PCI_VENDOR_ID, 2, &l) &&
+			    l != 0x0000 && l != 0xffff) {
+				DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l);
+				printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n);
+				pci_scan_bus(n, &pci_root_ops, NULL);
+				break;	/* one hit is enough for this bus */
+			}
+		}
+	}
+}
+/*
+ * subsys initcall: scan root bus 0 and any peer buses, unless there is no
+ * config access method or another probe already bumped pcibios_scanned.
+ * Always returns 0.
+ */
+static int __init pci_legacy_init(void)
+{
+	if (!raw_pci_ops) {
+		printk("PCI: System does not support PCI\n");
+		return 0;
+	}
+
+	if (pcibios_scanned++)	/* non-zero before the increment: already scanned */
+		return 0;
+
+	printk("PCI: Probing PCI hardware\n");
+	pci_root_bus = pcibios_scan_root(0);
+
+	pcibios_fixup_peer_bridges();
+
+	return 0;
+}
+
+subsys_initcall(pci_legacy_init);
diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c
new file mode 100644
index 000000000000..021a50aa51f4
--- /dev/null
+++ b/arch/i386/pci/mmconfig.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2004 Matthew Wilcox <matthew@wil.cx>
+ * Copyright (C) 2004 Intel Corp.
+ *
+ * This code is released under the GNU General Public License version 2.
+ */
+
+/*
+ * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include "pci.h"
+
+/* The physical address of the MMCONFIG aperture. Set from ACPI tables.
*/
+u32 pci_mmcfg_base_addr;
+
+#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
+
+/* The base address of the last MMCONFIG device accessed */
+static u32 mmcfg_last_accessed_device;
+
+/*
+ * Functions for accessing PCI configuration space with MMCONFIG accesses
+ *
+ * pci_exp_set_dev_base() points the single FIX_PCIE_MCFG fixmap slot at
+ * the config window of (bus, devfn): aperture base | bus << 20 |
+ * devfn << 12. The remap is skipped when that window is the one mapped
+ * last. Both callers below invoke it with pci_config_lock held.
+ */
+
+static inline void pci_exp_set_dev_base(int bus, int devfn)
+{
+	u32 dev_base = pci_mmcfg_base_addr | (bus << 20) | (devfn << 12);
+	if (dev_base != mmcfg_last_accessed_device) {
+		mmcfg_last_accessed_device = dev_base;
+		set_fixmap_nocache(FIX_PCIE_MCFG, dev_base);
+	}
+}
+/*
+ * Read len (1, 2 or 4) bytes at config offset reg into *value.
+ * Returns -EINVAL for a NULL value or bus/devfn > 255 or reg > 4095,
+ * otherwise 0. The seg argument is not used.
+ * NOTE(review): any other len falls through the switch, leaving *value
+ * unwritten while 0 is still returned.
+ */
+static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
+			  unsigned int devfn, int reg, int len, u32 *value)
+{
+	unsigned long flags;
+
+	if (!value || (bus > 255) || (devfn > 255) || (reg > 4095))
+		return -EINVAL;
+
+	spin_lock_irqsave(&pci_config_lock, flags);
+
+	pci_exp_set_dev_base(bus, devfn);
+
+	switch (len) {
+	case 1:
+		*value = readb(mmcfg_virt_addr + reg);
+		break;
+	case 2:
+		*value = readw(mmcfg_virt_addr + reg);
+		break;
+	case 4:
+		*value = readl(mmcfg_virt_addr + reg);
+		break;
+	}
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
+
+	return 0;
+}
+/*
+ * Write the low len (1, 2 or 4) bytes of value at config offset reg.
+ * Same argument checks and return values as pci_mmcfg_read(); a len
+ * outside 1/2/4 writes nothing and still returns 0.
+ */
+static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
+			   unsigned int devfn, int reg, int len, u32 value)
+{
+	unsigned long flags;
+
+	if ((bus > 255) || (devfn > 255) || (reg > 4095))
+		return -EINVAL;
+
+	spin_lock_irqsave(&pci_config_lock, flags);
+
+	pci_exp_set_dev_base(bus, devfn);
+
+	switch (len) {
+	case 1:
+		writeb(value, mmcfg_virt_addr + reg);
+		break;
+	case 2:
+		writew(value, mmcfg_virt_addr + reg);
+		break;
+	case 4:
+		writel(value, mmcfg_virt_addr + reg);
+		break;
+	}
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
+
+	return 0;
+}
+
+static struct pci_raw_ops pci_mmcfg = {
+	.read = pci_mmcfg_read,
+	.write = pci_mmcfg_write,
+};
+/*
+ * arch initcall: switch raw_pci_ops to MMCONFIG when PCI_PROBE_MMCONF is
+ * set in pci_probe, an aperture address is known and the boot CPU is not
+ * from AMD. Always returns 0.
+ */
+static int __init pci_mmcfg_init(void)
+{
+	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+		goto out;
+	if (!pci_mmcfg_base_addr)
+		goto out;
+
+	/* Kludge for now. Don't use mmconfig on AMD systems because
+	   those have some busses where mmconfig doesn't work,
+	   and we don't parse ACPI MCFG well enough to handle that.
+	   Remove when proper handling is added. */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		goto out;
+
+	printk(KERN_INFO "PCI: Using MMCONFIG\n");
+	raw_pci_ops = &pci_mmcfg;
+	/* drop the other probe-method bits, keep the remaining flags */
+	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+
+ out:
+	return 0;
+}
+
+arch_initcall(pci_mmcfg_init);
diff --git a/arch/i386/pci/numa.c b/arch/i386/pci/numa.c
new file mode 100644
index 000000000000..9e3695461899
--- /dev/null
+++ b/arch/i386/pci/numa.c
@@ -0,0 +1,130 @@
+/*
+ * numa.c - Low-level PCI access for NUMA-Q machines
+ *
+ * Bus numbers seen by the PCI core are global; the three lookup macros
+ * below translate between a global bus number and the (quad, local bus)
+ * pair through the mp_bus_id_to_node, mp_bus_id_to_local and
+ * quad_local_to_mp_bus_id tables.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/nodemask.h>
+#include "pci.h"
+
+#define BUS2QUAD(global) (mp_bus_id_to_node[global])
+#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
+#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
+
+#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
+	(0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
+/*
+ * Read len (1, 2 or 4) bytes of config space: the address word, built
+ * with the quad-local bus number, goes to port 0xCF8 of the quad that
+ * owns the bus and the data comes back from 0xCFC on the same quad.
+ * Returns -EINVAL for a NULL value, bus >= MAX_MP_BUSSES, devfn > 255 or
+ * reg > 255, otherwise 0 (also for a len outside 1/2/4, in which case
+ * *value is left unwritten).
+ */
+static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
+			     unsigned int devfn, int reg, int len, u32 *value)
+{
+	unsigned long flags;
+
+	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
+		return -EINVAL;
+
+	spin_lock_irqsave(&pci_config_lock, flags);
+
+	outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
+
+	switch (len) {
+	case 1:
+		*value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus));
+		break;
+	case 2:
+		*value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus));
+		break;
+	case 4:
+		*value = inl_quad(0xCFC, BUS2QUAD(bus));
+		break;
+	}
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
+
+	return 0;
+}
+/*
+ * Write counterpart of pci_conf1_mq_read(); same argument checks (minus
+ * the value pointer) and same return values.
+ */
+static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
+			      unsigned int devfn, int reg, int len, u32 value)
+{
+	unsigned long flags;
+
+	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
+		return -EINVAL;
+
+	spin_lock_irqsave(&pci_config_lock, flags);
+
+	outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus));
+
+	switch (len) {
+	case 1:
+		outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus));
+		break;
+	case 2:
+		outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus));
+		break;
+	case 4:
+		outl_quad((u32)value, 0xCFC, BUS2QUAD(bus));
+		break;
+	}
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
+
+	return 0;
+}
+
+#undef PCI_CONF1_MQ_ADDRESS
+
+static struct pci_raw_ops pci_direct_conf1_mq = {
+	.read = pci_conf1_mq_read,
+	.write = pci_conf1_mq_write
+};
+
+
+static void __devinit pci_fixup_i450nx(struct pci_dev *d)
+{
+	/*
+	 * i450NX -- Find and scan all secondary buses on all PXB's.
+	 *
+	 * Three consecutive config bytes are read per PXB, starting at
+	 * 0xd0; the bus numbers found there are local to the quad of d
+	 * and are translated with QUADLOCAL2BUS() before scanning.
+	 */
+	int pxb, reg;
+	u8 busno, suba, subb;
+	int quad = BUS2QUAD(d->bus->number);
+
+	printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d));
+	reg = 0xd0;
+	for(pxb=0; pxb<2; pxb++) {
+		pci_read_config_byte(d, reg++, &busno);
+		pci_read_config_byte(d, reg++, &suba);
+		pci_read_config_byte(d, reg++, &subb);
+		DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb);
+		if (busno)
+			pci_scan_bus(QUADLOCAL2BUS(quad,busno), &pci_root_ops, NULL); /* Bus A */
+		if (suba < subb)
+			pci_scan_bus(QUADLOCAL2BUS(quad,suba+1), &pci_root_ops, NULL); /* Bus B */
+	}
+	pcibios_last_bus = -1;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
+/*
+ * subsys initcall: install the quad-aware accessors, scan root bus 0 and,
+ * when more than one node is online, local bus 0 of every online quad
+ * other than quad 0. Always returns 0.
+ */
+static int __init pci_numa_init(void)
+{
+	int quad;
+
+	raw_pci_ops = &pci_direct_conf1_mq;
+
+	if (pcibios_scanned++)
+		return 0;
+
+	pci_root_bus = pcibios_scan_root(0);
+	if (num_online_nodes() > 1)
+		for_each_online_node(quad) {
+			if (quad == 0)
+				continue;	/* covered by pcibios_scan_root(0) above */
+			printk("Scanning PCI bus %d for quad %d\n",
+				QUADLOCAL2BUS(quad,0), quad);
+			pci_scan_bus(QUADLOCAL2BUS(quad,0),
+				&pci_root_ops, NULL);
+		}
+	return 0;
+}
+
+subsys_initcall(pci_numa_init);
diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c
new file mode 100644
index 000000000000..141421b673b0
---
/dev/null
+++ b/arch/i386/pci/pcbios.c
@@ -0,0 +1,487 @@
+/*
+ * BIOS32 and PCI BIOS handling.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include "pci.h"
+#include "pci-functions.h"
+
+
+/* BIOS32 signature: "_32_" (first character in bits 0-7) */
+#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
+
+/* PCI signature: "PCI " (compared with the value check_pcibios() gets back in EDX) */
+#define PCI_SIGNATURE (('P' << 0) + ('C' << 8) + ('I' << 16) + (' ' << 24))
+
+/* PCI service signature: "$PCI" (service id passed to bios32_service()) */
+#define PCI_SERVICE (('$' << 0) + ('P' << 8) + ('C' << 16) + ('I' << 24))
+
+/* PCI BIOS hardware mechanism flags (tested against the low byte of EAX in check_pcibios()) */
+#define PCIBIOS_HW_TYPE1 0x01
+#define PCIBIOS_HW_TYPE2 0x02
+#define PCIBIOS_HW_TYPE1_SPEC 0x10
+#define PCIBIOS_HW_TYPE2_SPEC 0x20
+
+/*
+ * This is the standard structure used to identify the entry point
+ * to the BIOS32 Service Directory, as documented in
+ * Standard BIOS 32-bit Service Directory Proposal
+ * Revision 0.4 May 24, 1993
+ * Phoenix Technologies Ltd.
+ * Norwood, MA
+ * and the PCI BIOS specification.
+ */
+
+union bios32 {
+	struct {
+		unsigned long signature; /* _32_ */
+		unsigned long entry; /* 32 bit physical address */
+		unsigned char revision; /* Revision level, 0 */
+		unsigned char length; /* Length in paragraphs should be 01 */
+		unsigned char checksum; /* All bytes must add up to zero */
+		unsigned char reserved[5]; /* Must be zero */
+	} fields;
+	char chars[16];	/* byte view, summed by pci_find_bios() for the checksum */
+};
+
+/*
+ * Physical address of the service directory. I don't know if we're
+ * allowed to have more than one of these or not, so just in case
+ * we'll make pcibios_present() take a memory start parameter and store
+ * the array there.
+ */
+
+static struct {
+	unsigned long address;
+	unsigned short segment;
+} bios32_indirect = { 0, __KERNEL_CS };	/* far-call target for "lcall *(%edi)"; address is filled in by pci_find_bios() */
+
+/*
+ * Returns the entry point for the given service, 0 on error
+ * (the return type is unsigned long, so "no entry" is the value 0).
+ *
+ * The directory is called with interrupts disabled; on success the
+ * result is the sum of the two values it hands back in EBX and EDX.
+ */
+
+static unsigned long bios32_service(unsigned long service)
+{
+	unsigned char return_code; /* %al */
+	unsigned long address; /* %ebx */
+	unsigned long length; /* %ecx */
+	unsigned long entry; /* %edx */
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__asm__("lcall *(%%edi); cld"
+		: "=a" (return_code),
+		  "=b" (address),
+		  "=c" (length),
+		  "=d" (entry)
+		: "0" (service),
+		  "1" (0),
+		  "D" (&bios32_indirect));
+	local_irq_restore(flags);
+
+	switch (return_code) {
+		case 0:
+			return address + entry;
+		case 0x80: /* Not present */
+			printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
+			return 0;
+		default: /* Shouldn't happen */
+			printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
+				service, return_code);
+			return 0;
+	}
+}
+
+static struct {
+	unsigned long address;
+	unsigned short segment;
+} pci_indirect = { 0, __KERNEL_CS };	/* far-call target for the PCI BIOS; address is set in check_pcibios() */
+
+static int pci_bios_present;
+/*
+ * Look up the PCI BIOS through the BIOS32 directory and issue its
+ * "present" call. Returns 1 when the call reports status 0 with the
+ * "PCI " signature, 0 otherwise.
+ *
+ * Side effects visible here: pci_indirect.address is set, an unset
+ * (negative) pcibios_last_bus takes the low byte of ECX, and with
+ * CONFIG_PCI_DIRECT the CONF1/CONF2 probe bits are cleared for
+ * mechanisms the BIOS does not report.
+ */
+static int __devinit check_pcibios(void)
+{
+	u32 signature, eax, ebx, ecx;
+	u8 status, major_ver, minor_ver, hw_mech;
+	unsigned long flags, pcibios_entry;
+
+	if ((pcibios_entry = bios32_service(PCI_SERVICE))) {
+		pci_indirect.address = pcibios_entry + PAGE_OFFSET;
+
+		local_irq_save(flags);
+		__asm__(
+			"lcall *(%%edi); cld\n\t"
+			"jc 1f\n\t"
+			"xor %%ah, %%ah\n"
+			"1:"
+			: "=d" (signature),
+			  "=a" (eax),
+			  "=b" (ebx),
+			  "=c" (ecx)
+			: "1" (PCIBIOS_PCI_BIOS_PRESENT),
+			  "D" (&pci_indirect)
+			: "memory");
+		local_irq_restore(flags);
+
+		status = (eax >> 8) & 0xff;	/* AH; the asm zeroes it when carry is clear */
+		hw_mech = eax & 0xff;
+		major_ver = (ebx >> 8) & 0xff;
+		minor_ver = ebx & 0xff;
+		if (pcibios_last_bus < 0)
+			pcibios_last_bus = ecx & 0xff;
+		DBG("PCI: BIOS probe returned s=%02x hw=%02x ver=%02x.%02x l=%02x\n",
+			status, hw_mech, major_ver, minor_ver, pcibios_last_bus);
+		if (status || signature != PCI_SIGNATURE) {
+			printk (KERN_ERR "PCI: BIOS BUG #%x[%08x] found\n",
+				status, signature);
+			return 0;
+		}
+		printk(KERN_INFO "PCI: PCI BIOS revision %x.%02x entry at 0x%lx, last bus=%d\n",
+			major_ver, minor_ver, pcibios_entry, pcibios_last_bus);
+#ifdef CONFIG_PCI_DIRECT
+		if (!(hw_mech & PCIBIOS_HW_TYPE1))
+			pci_probe &= ~PCI_PROBE_CONF1;
+		if (!(hw_mech & PCIBIOS_HW_TYPE2))
+			pci_probe &= ~PCI_PROBE_CONF2;
+#endif
+		return 1;
+	}
+	return 0;
+}
+/*
+ * Ask the BIOS for the index-th device with the given vendor/device ID.
+ * *bus and *device_fn receive the high and low byte of BX; the return
+ * value is the status byte from AH (pcibios_sort() compares it with
+ * PCIBIOS_SUCCESSFUL).
+ */
+static int __devinit pci_bios_find_device (unsigned short vendor, unsigned short device_id,
+					unsigned short index, unsigned char *bus, unsigned char *device_fn)
+{
+	unsigned short bx;
+	unsigned short ret;
+
+	__asm__("lcall *(%%edi); cld\n\t"
+		"jc 1f\n\t"
+		"xor %%ah, %%ah\n"
+		"1:"
+		: "=b" (bx),
+		  "=a" (ret)
+		: "1" (PCIBIOS_FIND_PCI_DEVICE),
+		  "c" (device_id),
+		  "d" (vendor),
+		  "S" ((int) index),
+		  "D" (&pci_indirect));
+	*bus = (bx >> 8) & 0xff;
+	*device_fn = bx & 0xff;
+	return (int) (ret & 0xff00) >> 8;
+}
+/*
+ * Config space read through the PCI BIOS, under pci_config_lock.
+ * Returns -EINVAL for a NULL value or bus/devfn/reg > 255; otherwise
+ * the BIOS status byte (AH). For a len other than 1, 2 or 4 no BIOS
+ * call is made, *value is left unwritten and 0 is returned.
+ */
+static int pci_bios_read(unsigned int seg, unsigned int bus,
+			 unsigned int devfn, int reg, int len, u32 *value)
+{
+	unsigned long result = 0;
+	unsigned long flags;
+	unsigned long bx = (bus << 8) | devfn;
+
+	if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
+		return -EINVAL;
+
+	spin_lock_irqsave(&pci_config_lock, flags);
+
+	switch (len) {
+	case 1:
+		__asm__("lcall *(%%esi); cld\n\t"
+			"jc 1f\n\t"
+			"xor %%ah, %%ah\n"
+			"1:"
+			: "=c" (*value),
+			  "=a" (result)
+			: "1" (PCIBIOS_READ_CONFIG_BYTE),
+			  "b" (bx),
+			  "D" ((long)reg),
+			  "S" (&pci_indirect));
+		break;
+	case 2:
+		__asm__("lcall *(%%esi); cld\n\t"
+			"jc 1f\n\t"
+			"xor %%ah, %%ah\n"
+			"1:"
+			: "=c" (*value),
+			  "=a" (result)
+			: "1" (PCIBIOS_READ_CONFIG_WORD),
+			  "b" (bx),
+			  "D" ((long)reg),
+			  "S" (&pci_indirect));
+		break;
+	case 4:
+		__asm__("lcall *(%%esi); cld\n\t"
+			"jc 1f\n\t"
+			"xor %%ah, %%ah\n"
+			"1:"
+			: "=c" (*value),
+			  "=a" (result)
+			: "1" (PCIBIOS_READ_CONFIG_DWORD),
+			  "b" (bx),
+			  "D" ((long)reg),
+			  "S" (&pci_indirect));
+		break;
+	}
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
+
+	return (int)((result & 0xff00) >> 8);
+}
+/*
+ * Config space write through the PCI BIOS; argument checks (minus the
+ * value pointer), locking and return value mirror pci_bios_read().
+ */
+static int pci_bios_write(unsigned int seg, unsigned int bus,
+			  unsigned int devfn, int reg, int len, u32 value)
+{
+	unsigned long result = 0;
+	unsigned long flags;
+	unsigned long bx = (bus << 8) | devfn;
+
+	if ((bus > 255) || (devfn > 255) || (reg > 255))
+		return -EINVAL;
+
+	spin_lock_irqsave(&pci_config_lock, flags);
+
+	switch (len) {
+	case 1:
+		__asm__("lcall *(%%esi); cld\n\t"
+			"jc 1f\n\t"
+			"xor %%ah, %%ah\n"
+			"1:"
+			: "=a" (result)
+			: "0" (PCIBIOS_WRITE_CONFIG_BYTE),
+			  "c" (value),
+			  "b" (bx),
+			  "D" ((long)reg),
+			  "S" (&pci_indirect));
+		break;
+	case 2:
+		__asm__("lcall *(%%esi); cld\n\t"
+			"jc 1f\n\t"
+			"xor %%ah, %%ah\n"
+			"1:"
+			: "=a" (result)
+			: "0" (PCIBIOS_WRITE_CONFIG_WORD),
+			  "c" (value),
+			  "b" (bx),
+			  "D" ((long)reg),
+			  "S" (&pci_indirect));
+		break;
+	case 4:
+		__asm__("lcall *(%%esi); cld\n\t"
+			"jc 1f\n\t"
+			"xor %%ah, %%ah\n"
+			"1:"
+			: "=a" (result)
+			: "0" (PCIBIOS_WRITE_CONFIG_DWORD),
+			  "c" (value),
+			  "b" (bx),
+			  "D" ((long)reg),
+			  "S" (&pci_indirect));
+		break;
+	}
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
+
+	return (int)((result & 0xff00) >> 8);
+}
+
+
+/*
+ * Function table for BIOS32 access
+ */
+
+static struct pci_raw_ops pci_bios_access = {
+	.read = pci_bios_read,
+	.write = pci_bios_write
+};
+
+/*
+ * Try to find PCI BIOS.
+ *
+ * Returns &pci_bios_access when a valid BIOS32 directory is found and
+ * check_pcibios() succeeds through it, NULL otherwise. Only the first
+ * directory that passes the signature, length, checksum and revision
+ * tests is tried.
+ */
+
+static struct pci_raw_ops * __devinit pci_find_bios(void)
+{
+	union bios32 *check;
+	unsigned char sum;
+	int i, length;
+
+	/*
+	 * Follow the standard procedure for locating the BIOS32 Service
+	 * directory by scanning the permissible address range from
+	 * 0xe0000 through 0xfffff for a valid BIOS32 structure.
+	 */
+
+	for (check = (union bios32 *) __va(0xe0000);
+	     check <= (union bios32 *) __va(0xffff0);
+	     ++check) {
+		if (check->fields.signature != BIOS32_SIGNATURE)
+			continue;
+		length = check->fields.length * 16;	/* length field counts 16-byte paragraphs */
+		if (!length)
+			continue;
+		sum = 0;
+		for (i = 0; i < length ; ++i)
+			sum += check->chars[i];
+		if (sum != 0)
+			continue;
+		if (check->fields.revision != 0) {
+			printk("PCI: unsupported BIOS32 revision %d at 0x%p\n",
+				check->fields.revision, check);
+			continue;
+		}
+		DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
+		if (check->fields.entry >= 0x100000) {
+			printk("PCI: BIOS32 entry (0x%p) in high memory, cannot use.\n", check);
+			return NULL;
+		} else {
+			unsigned long bios32_entry = check->fields.entry;
+			DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry);
+			bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+			if (check_pcibios())
+				return &pci_bios_access;
+		}
+		break; /* Hopefully more than one BIOS32 cannot happen... */
+	}
+
+	return NULL;
+}
+
+/*
+ * Sort the device list according to PCI BIOS. Nasty hack, but since some
+ * fool forgot to define the `correct' device order in the PCI BIOS specs
+ * and we want to be (possibly bug-to-bug ;-]) compatible with older kernels
+ * which used BIOS ordering, we are bound to do this...
+ */
+
+void __devinit pcibios_sort(void)
+{
+	LIST_HEAD(sorted_devices);
+	struct list_head *ln;
+	struct pci_dev *dev, *d;
+	int idx, found;
+	unsigned char bus, devfn;
+
+	DBG("PCI: Sorting device list...\n");
+	while (!list_empty(&pci_devices)) {
+		ln = pci_devices.next;
+		dev = pci_dev_g(ln);
+		idx = found = 0;
+		/* walk every BIOS instance of dev's vendor/device pair, in BIOS order */
+		while (pci_bios_find_device(dev->vendor, dev->device, idx, &bus, &devfn) == PCIBIOS_SUCCESSFUL) {
+			idx++;
+			list_for_each(ln, &pci_devices) {
+				d = pci_dev_g(ln);
+				if (d->bus->number == bus && d->devfn == devfn) {
+					list_del(&d->global_list);
+					list_add_tail(&d->global_list, &sorted_devices);
+					if (d == dev)
+						found = 1;
+					break;
+				}
+			}
+			if (ln == &pci_devices) {	/* list walked to the end without a match */
+				printk(KERN_WARNING "PCI: BIOS reporting unknown device %02x:%02x\n", bus, devfn);
+				/*
+				 * We must not continue scanning as several buggy BIOSes
+				 * return garbage after the last device. Grr.
+				 */
+				break;
+			}
+		}
+		if (!found) {
+			/* move dev anyway so the outer loop always makes progress */
+			printk(KERN_WARNING "PCI: Device %s not found by BIOS\n",
+				pci_name(dev));
+			list_del(&dev->global_list);
+			list_add_tail(&dev->global_list, &sorted_devices);
+		}
+	}
+	list_splice(&sorted_devices, &pci_devices);
+}
+
+/*
+ * BIOS Functions for IRQ Routing
+ *
+ * struct irq_routing_options is the buffer descriptor handed to the
+ * BIOS by pcibios_get_irq_routing_table(): size, pointer and segment
+ * of the area the table is written to.
+ */
+
+struct irq_routing_options {
+	u16 size;
+	struct irq_info *table;
+	u16 segment;
+} __attribute__((packed));
+/*
+ * Fetch the IRQ routing table through the PCI BIOS into a scratch page
+ * and copy it into a freshly kmalloc'ed struct irq_routing_table.
+ * Returns NULL when no PCI BIOS is present, when an allocation fails,
+ * when the BIOS reports an error, or when it reports size 0. The
+ * scratch page is freed on every path that allocated it.
+ */
+struct irq_routing_table * __devinit pcibios_get_irq_routing_table(void)
+{
+	struct irq_routing_options opt;
+	struct irq_routing_table *rt = NULL;
+	int ret, map;
+	unsigned long page;
+
+	if (!pci_bios_present)
+		return NULL;
+	page = __get_free_page(GFP_KERNEL);
+	if (!page)
+		return NULL;
+	opt.table = (struct irq_info *) page;
+	opt.size = PAGE_SIZE;
+	opt.segment = __KERNEL_DS;
+
+	DBG("PCI: Fetching IRQ routing table... ");
+	__asm__("push %%es\n\t"
+		"push %%ds\n\t"
+		"pop %%es\n\t"
+		"lcall *(%%esi); cld\n\t"
+		"pop %%es\n\t"
+		"jc 1f\n\t"
+		"xor %%ah, %%ah\n"
+		"1:"
+		: "=a" (ret),
+		  "=b" (map),
+		  "=m" (opt)
+		: "0" (PCIBIOS_GET_ROUTING_OPTIONS),
+		  "1" (0),
+		  "D" ((long) &opt),
+		  "S" (&pci_indirect),
+		  "m" (opt)
+		: "memory");
+	DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map);
+	if (ret & 0xff00)
+		printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", (ret >> 8) & 0xff);
+	else if (opt.size) {
+		rt = kmalloc(sizeof(struct irq_routing_table) + opt.size, GFP_KERNEL);
+		if (rt) {
+			/* header is zeroed, so rt->signature stays 0 for BIOS-supplied tables */
+			memset(rt, 0, sizeof(struct irq_routing_table));
+			rt->size = opt.size + sizeof(struct irq_routing_table);
+			rt->exclusive_irqs = map;
+			memcpy(rt->slots, (void *) page, opt.size);
+			printk(KERN_INFO "PCI: Using BIOS Interrupt Routing Table\n");
+		}
+	}
+	free_page(page);
+	return rt;
+}
+
+/*
+ * Ask the PCI BIOS to route pin of dev to irq.
+ * Returns non-zero when the status byte handed back in AH is 0.
+ */
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq)
+{
+	int ret;
+
+	__asm__("lcall *(%%esi); cld\n\t"
+		"jc 1f\n\t"
+		"xor %%ah, %%ah\n"
+		"1:"
+		: "=a" (ret)
+		: "0" (PCIBIOS_SET_PCI_HW_INT),
+		  "b" ((dev->bus->number << 8) | dev->devfn),
+		  "c" ((irq << 8) | (pin + 10)),
+		  "S" (&pci_indirect));
+	return !(ret & 0xff00);
+}
+/*
+ * arch initcall: when PCI_PROBE_BIOS is set and a usable PCI BIOS is
+ * found, install its accessors as raw_pci_ops, request BIOS device
+ * ordering and record that the BIOS is present. Always returns 0.
+ */
+static int __init pci_pcbios_init(void)
+{
+	if ((pci_probe & PCI_PROBE_BIOS)
+		&& ((raw_pci_ops = pci_find_bios()))) {
+		pci_probe |= PCI_BIOS_SORT;
+		pci_bios_present = 1;
+	}
+	return 0;
+}
+
+arch_initcall(pci_pcbios_init);
diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h
new file mode 100644
index 000000000000..a8fc80ca69f3
--- /dev/null
+++ b/arch/i386/pci/pci.h
@@ -0,0 +1,74 @@
+/*
+ * Low-Level PCI Access for i386 machines.
+ *
+ * (c) 1999 Martin Mares <mj@ucw.cz>
+ *
+ * DBG() below expands to printk() only when DEBUG is defined; this
+ * header undefines DEBUG first, so DBG() compiles to nothing unless the
+ * #undef is edited.
+ */
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(x...) printk(x)
+#else
+#define DBG(x...)
+#endif + +#define PCI_PROBE_BIOS 0x0001 +#define PCI_PROBE_CONF1 0x0002 +#define PCI_PROBE_CONF2 0x0004 +#define PCI_PROBE_MMCONF 0x0008 +#define PCI_PROBE_MASK 0x000f + +#define PCI_NO_SORT 0x0100 +#define PCI_BIOS_SORT 0x0200 +#define PCI_NO_CHECKS 0x0400 +#define PCI_USE_PIRQ_MASK 0x0800 +#define PCI_ASSIGN_ROMS 0x1000 +#define PCI_BIOS_IRQ_SCAN 0x2000 +#define PCI_ASSIGN_ALL_BUSSES 0x4000 + +extern unsigned int pci_probe; + +/* pci-i386.c */ + +extern unsigned int pcibios_max_latency; + +void pcibios_resource_survey(void); +int pcibios_enable_resources(struct pci_dev *, int); + +/* pci-pc.c */ + +extern int pcibios_last_bus; +extern struct pci_bus *pci_root_bus; +extern struct pci_ops pci_root_ops; + +/* pci-irq.c */ + +struct irq_info { + u8 bus, devfn; /* Bus, device and function */ + struct { + u8 link; /* IRQ line ID, chipset dependent, 0=not routed */ + u16 bitmap; /* Available IRQs */ + } __attribute__((packed)) irq[4]; + u8 slot; /* Slot number, 0=onboard */ + u8 rfu; +} __attribute__((packed)); + +struct irq_routing_table { + u32 signature; /* PIRQ_SIGNATURE should be here */ + u16 version; /* PIRQ_VERSION */ + u16 size; /* Table size in bytes */ + u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */ + u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */ + u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */ + u32 miniport_data; /* Crap */ + u8 rfu[11]; + u8 checksum; /* Modulo 256 checksum must give zero */ + struct irq_info slots[0]; +} __attribute__((packed)); + +extern unsigned int pcibios_irq_mask; + +extern int pcibios_scanned; +extern spinlock_t pci_config_lock; + +extern int (*pcibios_enable_irq)(struct pci_dev *dev); diff --git a/arch/i386/pci/visws.c b/arch/i386/pci/visws.c new file mode 100644 index 000000000000..6a9248784439 --- /dev/null +++ b/arch/i386/pci/visws.c @@ -0,0 +1,110 @@ +/* + * Low-Level PCI Support for SGI Visual Workstation + * + * (c) 1999--2000 Martin Mares <mj@ucw.cz> + */ + 
+#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/init.h> + +#include "cobalt.h" +#include "lithium.h" + +#include "pci.h" + + +extern struct pci_raw_ops pci_direct_conf1; + +static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } + +int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq; + +void __init pcibios_penalize_isa_irq(int irq) {} + + +unsigned int pci_bus0, pci_bus1; + +static inline u8 bridge_swizzle(u8 pin, u8 slot) +{ + return (((pin - 1) + slot) % 4) + 1; +} + +static u8 __init visws_swizzle(struct pci_dev *dev, u8 *pinp) +{ + u8 pin = *pinp; + + while (dev->bus->self) { /* Move up the chain of bridges. */ + pin = bridge_swizzle(pin, PCI_SLOT(dev->devfn)); + dev = dev->bus->self; + } + *pinp = pin; + + return PCI_SLOT(dev->devfn); +} + +static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin) +{ + int irq, bus = dev->bus->number; + + pin--; + + /* Nothing useful at PIIX4 pin 1 */ + if (bus == pci_bus0 && slot == 4 && pin == 0) + return -1; + + /* PIIX4 USB is on Bus 0, Slot 4, Line 3 */ + if (bus == pci_bus0 && slot == 4 && pin == 3) { + irq = CO_IRQ(CO_APIC_PIIX4_USB); + goto out; + } + + /* First pin spread down 1 APIC entry per slot */ + if (pin == 0) { + irq = CO_IRQ((bus == pci_bus0 ? 
CO_APIC_PCIB_BASE0 : + CO_APIC_PCIA_BASE0) + slot); + goto out; + } + + /* lines 1,2,3 from any slot is shared in this twirly pattern */ + if (bus == pci_bus1) { + /* lines 1-3 from devices 0 1 rotate over 2 apic entries */ + irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((slot + (pin - 1)) % 2)); + } else { /* bus == pci_bus0 */ + /* lines 1-3 from devices 0-3 rotate over 3 apic entries */ + if (slot == 0) + slot = 3; /* same pattern */ + irq = CO_IRQ(CO_APIC_PCIA_BASE123 + ((3 - slot) + (pin - 1) % 3)); + } +out: + printk(KERN_DEBUG "PCI: Bus %d Slot %d Line %d -> IRQ %d\n", bus, slot, pin, irq); + return irq; +} + +void __init pcibios_update_irq(struct pci_dev *dev, int irq) +{ + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); +} + +static int __init pcibios_init(void) +{ + /* The VISWS supports configuration access type 1 only */ + pci_probe = (pci_probe | PCI_PROBE_CONF1) & + ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); + + pci_bus0 = li_pcib_read16(LI_PCI_BUSNUM) & 0xff; + pci_bus1 = li_pcia_read16(LI_PCI_BUSNUM) & 0xff; + + printk(KERN_INFO "PCI: Lithium bridge A bus: %u, " + "bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0); + + raw_pci_ops = &pci_direct_conf1; + pci_scan_bus(pci_bus0, &pci_root_ops, NULL); + pci_scan_bus(pci_bus1, &pci_root_ops, NULL); + pci_fixup_irqs(visws_swizzle, visws_map_irq); + pcibios_resource_survey(); + return 0; +} + +subsys_initcall(pcibios_init); diff --git a/arch/i386/power/Makefile b/arch/i386/power/Makefile new file mode 100644 index 000000000000..8cfa4e8a719d --- /dev/null +++ b/arch/i386/power/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_PM) += cpu.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c new file mode 100644 index 000000000000..cf337c673d92 --- /dev/null +++ b/arch/i386/power/cpu.c @@ -0,0 +1,152 @@ +/* + * Suspend support specific for i386. 
+ * + * Distribute under GPLv2 + * + * Copyright (c) 2002 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/poll.h> +#include <linux/delay.h> +#include <linux/sysrq.h> +#include <linux/proc_fs.h> +#include <linux/irq.h> +#include <linux/pm.h> +#include <linux/device.h> +#include <linux/suspend.h> +#include <linux/acpi.h> +#include <asm/uaccess.h> +#include <asm/acpi.h> +#include <asm/tlbflush.h> + +static struct saved_context saved_context; + +unsigned long saved_context_ebx; +unsigned long saved_context_esp, saved_context_ebp; +unsigned long saved_context_esi, saved_context_edi; +unsigned long saved_context_eflags; + +extern void enable_sep_cpu(void *); + +void __save_processor_state(struct saved_context *ctxt) +{ + kernel_fpu_begin(); + + /* + * descriptor tables + */ + asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); + asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); + asm volatile ("sldt %0" : "=m" (ctxt->ldt)); + asm volatile ("str %0" : "=m" (ctxt->tr)); + + /* + * segment registers + */ + asm volatile ("movw %%es, %0" : "=m" (ctxt->es)); + asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs)); + asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs)); + asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss)); + + /* + * control registers + */ + asm volatile ("movl %%cr0, %0" : "=r" (ctxt->cr0)); + asm volatile ("movl %%cr2, %0" : "=r" (ctxt->cr2)); + asm volatile ("movl %%cr3, %0" : "=r" (ctxt->cr3)); + asm volatile ("movl %%cr4, %0" : "=r" (ctxt->cr4)); +} + +void save_processor_state(void) +{ + __save_processor_state(&saved_context); +} + +static void +do_fpu_end(void) +{ + /* restore FPU regs if necessary */ + /* Do it out of line so that gcc does not move cr0 load to some stupid place */ + kernel_fpu_end(); + mxcsr_feature_mask_init(); +} + + +static 
void fix_processor_context(void) +{ + int cpu = smp_processor_id(); + struct tss_struct * t = &per_cpu(init_tss, cpu); + + set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ + per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS].b &= 0xfffffdff; + + load_TR_desc(); /* This does ltr */ + load_LDT(&current->active_mm->context); /* This does lldt */ + + /* + * Now maybe reload the debug registers + */ + if (current->thread.debugreg[7]){ + loaddebug(&current->thread, 0); + loaddebug(&current->thread, 1); + loaddebug(&current->thread, 2); + loaddebug(&current->thread, 3); + /* no 4 and 5 */ + loaddebug(&current->thread, 6); + loaddebug(&current->thread, 7); + } + +} + +void __restore_processor_state(struct saved_context *ctxt) +{ + + /* + * control registers + */ + asm volatile ("movl %0, %%cr4" :: "r" (ctxt->cr4)); + asm volatile ("movl %0, %%cr3" :: "r" (ctxt->cr3)); + asm volatile ("movl %0, %%cr2" :: "r" (ctxt->cr2)); + asm volatile ("movl %0, %%cr0" :: "r" (ctxt->cr0)); + + /* + * segment registers + */ + asm volatile ("movw %0, %%es" :: "r" (ctxt->es)); + asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs)); + asm volatile ("movw %0, %%gs" :: "r" (ctxt->gs)); + asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss)); + + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). 
+ */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + asm volatile ("lldt %0" :: "m" (ctxt->ldt)); + + /* + * sysenter MSRs + */ + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(NULL); + + fix_processor_context(); + do_fpu_end(); +} + +void restore_processor_state(void) +{ + __restore_processor_state(&saved_context); +} + +/* Needed by apm.c */ +EXPORT_SYMBOL(save_processor_state); +EXPORT_SYMBOL(restore_processor_state); diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S new file mode 100644 index 000000000000..c4105286ff26 --- /dev/null +++ b/arch/i386/power/swsusp.S @@ -0,0 +1,73 @@ +.text + +/* Originally gcc generated, modified by hand + * + * This may not use any stack, nor any variable that is not "NoSave": + * + * Its rewriting one kernel image with another. What is stack in "old" + * image could very well be data page in "new" image, and overwriting + * your own stack under you is bad idea. + */ + +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/asm_offsets.h> + + .text + +ENTRY(swsusp_arch_suspend) + + movl %esp, saved_context_esp + movl %ebx, saved_context_ebx + movl %ebp, saved_context_ebp + movl %esi, saved_context_esi + movl %edi, saved_context_edi + pushfl ; popl saved_context_eflags + + call swsusp_save + ret + +ENTRY(swsusp_arch_resume) + movl $swsusp_pg_dir-__PAGE_OFFSET, %ecx + movl %ecx, %cr3 + + movl pagedir_nosave, %edx + .p2align 4,,7 + +copy_loop: + testl %edx, %edx + jz done + + movl pbe_address(%edx), %esi + movl pbe_orig_address(%edx), %edi + + movl $1024, %ecx + rep + movsl + + movl pbe_next(%edx), %edx + jmp copy_loop + .p2align 4,,7 + +done: + /* Flush TLB, including "global" things (vmalloc) */ + movl mmu_cr4_features, %eax + movl %eax, %edx + andl $~(1<<7), %edx; # PGE + movl %edx, %cr4; # turn off PGE + movl %cr3, %ecx; # flush TLB + movl %ecx, %cr3 + movl %eax, %cr4; # turn PGE back on + + movl saved_context_esp, 
%esp + movl saved_context_ebp, %ebp + movl saved_context_ebx, %ebx + movl saved_context_esi, %esi + movl saved_context_edi, %edi + + pushl saved_context_eflags ; popfl + + xorl %eax, %eax + + ret |