From f630e43a215a3129d0c1173cae0bce6ea4855cf7 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 19 Jun 2008 16:33:57 -0700 Subject: ipv6: Drop packets for loopback address from outside of the box. [ Based upon original report and patch by Karsten Keil. Karsten has verified that this fixes the TAHI test case "ICMPv6 test v6LC.5.1.2 Part F". -DaveM ] Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/net/ipv6.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index e0a612bc9c4e..f422f7218e1c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -367,6 +367,12 @@ static inline int ipv6_addr_any(const struct in6_addr *a) a->s6_addr32[2] | a->s6_addr32[3] ) == 0); } +static inline int ipv6_addr_loopback(const struct in6_addr *a) +{ + return ((a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0); +} + static inline int ipv6_addr_v4mapped(const struct in6_addr *a) { return ((a->s6_addr32[0] | a->s6_addr32[1] | -- cgit v1.2.3 From 9267b4b3880d00dc2dab90f1d817c856939114f7 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sat, 21 Jun 2008 03:25:39 +0400 Subject: alpha: fix module load failures on smp (bug #10926) To calculate addresses of locally defined variables, GCC uses 32-bit displacement from the GP. Which doesn't work for per cpu variables in modules, as an offset to the kernel per cpu area is way above 4G. The workaround is to force allocation of a GOT entry for per cpu variable using ldq instruction with a 'literal' relocation. I had to use custom asm/percpu.h, as a required argument magic doesn't work with asm-generic/percpu.h macros. Signed-off-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- include/asm-alpha/percpu.h | 72 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 48348fe34c19..82e8a94b4b2f 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -1,6 +1,76 @@ #ifndef __ALPHA_PERCPU_H #define __ALPHA_PERCPU_H +#include +#include -#include +/* + * Determine the real variable name from the name visible in the + * kernel sources. + */ +#define per_cpu_var(var) per_cpu__##var + +#ifdef CONFIG_SMP + +/* + * per_cpu_offset() is the offset that has to be added to a + * percpu variable to get to the instance for a certain processor. + */ +extern unsigned long __per_cpu_offset[NR_CPUS]; + +#define per_cpu_offset(x) (__per_cpu_offset[x]) + +#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id()) +#ifdef CONFIG_DEBUG_PREEMPT +#define my_cpu_offset per_cpu_offset(smp_processor_id()) +#else +#define my_cpu_offset __my_cpu_offset +#endif + +#ifndef MODULE +#define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset)) +#define PER_CPU_ATTRIBUTES +#else +/* + * To calculate addresses of locally defined variables, GCC uses 32-bit + * displacement from the GP. Which doesn't work for per cpu variables in + * modules, as an offset to the kernel per cpu area is way above 4G. + * + * This forces allocation of a GOT entry for per cpu variable using + * ldq instruction with a 'literal' relocation. + */ +#define SHIFT_PERCPU_PTR(var, offset) ({ \ + extern int simple_identifier_##var(void); \ + unsigned long __ptr, tmp_gp; \ + asm ( "br %1, 1f \n\ + 1: ldgp %1, 0(%1) \n\ + ldq %0, per_cpu__" #var"(%1)\t!literal" \ + : "=&r"(__ptr), "=&r"(tmp_gp)); \ + (typeof(&per_cpu_var(var)))(__ptr + (offset)); }) + +#define PER_CPU_ATTRIBUTES __used + +#endif /* MODULE */ + +/* + * A percpu variable may point to a discarded regions. The following are + * established ways to produce a usable pointer from the percpu variable + * offset. + */ +#define per_cpu(var, cpu) \ + (*SHIFT_PERCPU_PTR(var, per_cpu_offset(cpu))) +#define __get_cpu_var(var) \ + (*SHIFT_PERCPU_PTR(var, my_cpu_offset)) +#define __raw_get_cpu_var(var) \ + (*SHIFT_PERCPU_PTR(var, __my_cpu_offset)) + +#else /* ! SMP */ + +#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) +#define __get_cpu_var(var) per_cpu_var(var) +#define __raw_get_cpu_var(var) per_cpu_var(var) + +#endif /* SMP */ + +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) #endif /* __ALPHA_PERCPU_H */ -- cgit v1.2.3 From d559d4a24a3fed75bd890abcc1f95cd8d8dad6e1 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sat, 21 Jun 2008 03:28:31 +0400 Subject: alpha: fix compile failures with gcc-4.3 (bug #10438) Vast majority of these build failures are gcc-4.3 warnings about static functions and objects being referenced from non-static (read: "extern inline") functions, in conjunction with our -Werror. We cannot just convert "extern inline" to "static inline", as people keep suggesting all the time, because "extern inline" logic is crucial for generic kernel build. So - just make sure that all callees of critical "extern inline" functions are also "extern inline"; - use "static inline", wherever it's possible. traps.c: work around gcc-4.3 being too smart about array bounds-checking. TODO: add "gnu_inline" attribute to all our "extern inline" functions to ensure desired behaviour with future compilers. Signed-off-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- arch/alpha/kernel/core_t2.c | 2 ++ arch/alpha/kernel/traps.c | 3 ++- include/asm-alpha/core_mcpcia.h | 2 +- include/asm-alpha/core_t2.h | 14 +++++++------- include/asm-alpha/io.h | 6 +++--- include/asm-alpha/mmu_context.h | 6 +++--- include/asm-alpha/system.h | 10 +++++----- include/asm-alpha/vga.h | 6 +++--- 8 files changed, 26 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/core_t2.c b/arch/alpha/kernel/core_t2.c index c0750291b44a..d9980d47ab81 100644 --- a/arch/alpha/kernel/core_t2.c +++ b/arch/alpha/kernel/core_t2.c @@ -74,6 +74,8 @@ # define DBG(args) #endif +DEFINE_SPINLOCK(t2_hae_lock); + static volatile unsigned int t2_mcheck_any_expected; static volatile unsigned int t2_mcheck_last_taken; diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index dc57790250d2..c778779007fc 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -447,7 +447,7 @@ struct unaligned_stat { /* Macro for exception fixup code to access integer registers. */ -#define una_reg(r) (regs->regs[(r) >= 16 && (r) <= 18 ? (r)+19 : (r)]) +#define una_reg(r) (_regs[(r) >= 16 && (r) <= 18 ? (r)+19 : (r)]) asmlinkage void @@ -456,6 +456,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg, { long error, tmp1, tmp2, tmp3, tmp4; unsigned long pc = regs->pc - 4; + unsigned long *_regs = regs->regs; const struct exception_table_entry *fixup; unaligned[0].count++; diff --git a/include/asm-alpha/core_mcpcia.h b/include/asm-alpha/core_mcpcia.h index 525b4f6a7ace..acf55b483472 100644 --- a/include/asm-alpha/core_mcpcia.h +++ b/include/asm-alpha/core_mcpcia.h @@ -261,7 +261,7 @@ struct el_MCPCIA_uncorrected_frame_mcheck { } #endif -static inline int __mcpcia_is_mmio(unsigned long addr) +extern inline int __mcpcia_is_mmio(unsigned long addr) { return (addr & 0x80000000UL) == 0; } diff --git a/include/asm-alpha/core_t2.h b/include/asm-alpha/core_t2.h index 90e6b5d6c214..46bfff58f670 100644 --- a/include/asm-alpha/core_t2.h +++ b/include/asm-alpha/core_t2.h @@ -356,13 +356,13 @@ struct el_t2_frame_corrected { #define vip volatile int * #define vuip volatile unsigned int * -static inline u8 t2_inb(unsigned long addr) +extern inline u8 t2_inb(unsigned long addr) { long result = *(vip) ((addr << 5) + T2_IO + 0x00); return __kernel_extbl(result, addr & 3); } -static inline void t2_outb(u8 b, unsigned long addr) +extern inline void t2_outb(u8 b, unsigned long addr) { unsigned long w; @@ -371,13 +371,13 @@ static inline void t2_outb(u8 b, unsigned long addr) mb(); } -static inline u16 t2_inw(unsigned long addr) +extern inline u16 t2_inw(unsigned long addr) { long result = *(vip) ((addr << 5) + T2_IO + 0x08); return __kernel_extwl(result, addr & 3); } -static inline void t2_outw(u16 b, unsigned long addr) +extern inline void t2_outw(u16 b, unsigned long addr) { unsigned long w; @@ -386,12 +386,12 @@ static inline void t2_outw(u16 b, unsigned long addr) mb(); } -static inline u32 t2_inl(unsigned long addr) +extern inline u32 t2_inl(unsigned long addr) { return *(vuip) ((addr << 5) + T2_IO + 0x18); } -static inline void t2_outl(u32 b, unsigned long addr) +extern inline void t2_outl(u32 b, unsigned long addr) { *(vuip) ((addr << 5) + T2_IO + 0x18) = b; mb(); @@ -435,7 +435,7 @@ static inline void t2_outl(u32 b, unsigned long addr) set_hae(msb); \ } -static DEFINE_SPINLOCK(t2_hae_lock); +extern spinlock_t t2_hae_lock; /* * NOTE: take T2_DENSE_MEM off in each readX/writeX routine, since diff --git a/include/asm-alpha/io.h b/include/asm-alpha/io.h index 38f18cf18c9d..e971ab000f95 100644 --- a/include/asm-alpha/io.h +++ b/include/asm-alpha/io.h @@ -35,7 +35,7 @@ * register not being up-to-date with respect to the hardware * value. */ -static inline void __set_hae(unsigned long new_hae) +extern inline void __set_hae(unsigned long new_hae) { unsigned long flags; local_irq_save(flags); @@ -49,7 +49,7 @@ static inline void __set_hae(unsigned long new_hae) local_irq_restore(flags); } -static inline void set_hae(unsigned long new_hae) +extern inline void set_hae(unsigned long new_hae) { if (new_hae != alpha_mv.hae_cache) __set_hae(new_hae); @@ -176,7 +176,7 @@ REMAP2(u64, writeq, volatile) #undef REMAP1 #undef REMAP2 -static inline void __iomem *generic_ioportmap(unsigned long a) +extern inline void __iomem *generic_ioportmap(unsigned long a) { return alpha_mv.mv_ioportmap(a); } diff --git a/include/asm-alpha/mmu_context.h b/include/asm-alpha/mmu_context.h index 6a5be1f7debf..86c08a02d239 100644 --- a/include/asm-alpha/mmu_context.h +++ b/include/asm-alpha/mmu_context.h @@ -23,7 +23,7 @@ #endif -extern inline unsigned long +static inline unsigned long __reload_thread(struct pcb_struct *pcb) { register unsigned long a0 __asm__("$16"); @@ -114,7 +114,7 @@ extern unsigned long last_asn; #define __MMU_EXTERN_INLINE #endif -static inline unsigned long +extern inline unsigned long __get_new_mm_context(struct mm_struct *mm, long cpu) { unsigned long asn = cpu_last_asn(cpu); @@ -226,7 +226,7 @@ ev4_activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm) # endif #endif -extern inline int +static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { int i; diff --git a/include/asm-alpha/system.h b/include/asm-alpha/system.h index ed221d6408fc..afe20fa58c99 100644 --- a/include/asm-alpha/system.h +++ b/include/asm-alpha/system.h @@ -184,7 +184,7 @@ enum amask_enum { __amask; }) #define __CALL_PAL_R0(NAME, TYPE) \ -static inline TYPE NAME(void) \ +extern inline TYPE NAME(void) \ { \ register TYPE __r0 __asm__("$0"); \ __asm__ __volatile__( \ @@ -196,7 +196,7 @@ static inline TYPE NAME(void) \ } #define __CALL_PAL_W1(NAME, TYPE0) \ -static inline void NAME(TYPE0 arg0) \ +extern inline void NAME(TYPE0 arg0) \ { \ register TYPE0 __r16 __asm__("$16") = arg0; \ __asm__ __volatile__( \ @@ -207,7 +207,7 @@ static inline void NAME(TYPE0 arg0) \ } #define __CALL_PAL_W2(NAME, TYPE0, TYPE1) \ -static inline void NAME(TYPE0 arg0, TYPE1 arg1) \ +extern inline void NAME(TYPE0 arg0, TYPE1 arg1) \ { \ register TYPE0 __r16 __asm__("$16") = arg0; \ register TYPE1 __r17 __asm__("$17") = arg1; \ @@ -219,7 +219,7 @@ static inline void NAME(TYPE0 arg0, TYPE1 arg1) \ } #define __CALL_PAL_RW1(NAME, RTYPE, TYPE0) \ -static inline RTYPE NAME(TYPE0 arg0) \ +extern inline RTYPE NAME(TYPE0 arg0) \ { \ register RTYPE __r0 __asm__("$0"); \ register TYPE0 __r16 __asm__("$16") = arg0; \ @@ -232,7 +232,7 @@ static inline RTYPE NAME(TYPE0 arg0) \ } #define __CALL_PAL_RW2(NAME, RTYPE, TYPE0, TYPE1) \ -static inline RTYPE NAME(TYPE0 arg0, TYPE1 arg1) \ +extern inline RTYPE NAME(TYPE0 arg0, TYPE1 arg1) \ { \ register RTYPE __r0 __asm__("$0"); \ register TYPE0 __r16 __asm__("$16") = arg0; \ diff --git a/include/asm-alpha/vga.h b/include/asm-alpha/vga.h index e8df1e7aae6b..c00106bac521 100644 --- a/include/asm-alpha/vga.h +++ b/include/asm-alpha/vga.h @@ -13,7 +13,7 @@ #define VT_BUF_HAVE_MEMSETW #define VT_BUF_HAVE_MEMCPYW -extern inline void scr_writew(u16 val, volatile u16 *addr) +static inline void scr_writew(u16 val, volatile u16 *addr) { if (__is_ioaddr(addr)) __raw_writew(val, (volatile u16 __iomem *) addr); @@ -21,7 +21,7 @@ extern inline void scr_writew(u16 val, volatile u16 *addr) *addr = val; } -extern inline u16 scr_readw(volatile const u16 *addr) +static inline u16 scr_readw(volatile const u16 *addr) { if (__is_ioaddr(addr)) return __raw_readw((volatile const u16 __iomem *) addr); @@ -29,7 +29,7 @@ extern inline u16 scr_readw(volatile const u16 *addr) return *addr; } -extern inline void scr_memsetw(u16 *s, u16 c, unsigned int count) +static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) { if (__is_ioaddr(s)) memsetw_io((u16 __iomem *) s, c, count); -- cgit v1.2.3 From b9f75f45a6b46a0ab4eb0857d437a0845871f314 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jun 2008 22:16:51 -0700 Subject: netns: Don't receive new packets in a dead network namespace. Alexey Dobriyan writes: > Subject: ICMP sockets destruction vs ICMP packets oops > After icmp_sk_exit() nuked ICMP sockets, we get an interrupt. > icmp_reply() wants ICMP socket. > > Steps to reproduce: > > launch shell in new netns > move real NIC to netns > setup routing > ping -i 0 > exit from shell > > BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 > IP: [] icmp_sk+0x17/0x30 > PGD 17f3cd067 PUD 17f3ce067 PMD 0 > Oops: 0000 [1] PREEMPT SMP DEBUG_PAGEALLOC > CPU 0 > Modules linked in: usblp usbcore > Pid: 0, comm: swapper Not tainted 2.6.26-rc6-netns-ct #4 > RIP: 0010:[] [] icmp_sk+0x17/0x30 > RSP: 0018:ffffffff8057fc30 EFLAGS: 00010286 > RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff81017c7db900 > RDX: 0000000000000034 RSI: ffff81017c7db900 RDI: ffff81017dc41800 > RBP: ffffffff8057fc40 R08: 0000000000000001 R09: 000000000000a815 > R10: 0000000000000000 R11: 0000000000000001 R12: ffffffff8057fd28 > R13: ffffffff8057fd00 R14: ffff81017c7db938 R15: ffff81017dc41800 > FS: 0000000000000000(0000) GS:ffffffff80525000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > CR2: 0000000000000000 CR3: 000000017fcda000 CR4: 00000000000006e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process swapper (pid: 0, threadinfo ffffffff8053a000, task ffffffff804fa4a0) > Stack: 0000000000000000 ffff81017c7db900 ffffffff8057fcf0 ffffffff803fcfe4 > ffffffff804faa38 0000000000000246 0000000000005a40 0000000000000246 > 000000000001ffff ffff81017dd68dc0 0000000000005a40 0000000055342436 > Call Trace: > [] icmp_reply+0x44/0x1e0 > [] ? ip_route_input+0x23a/0x1360 > [] icmp_echo+0x65/0x70 > [] icmp_rcv+0x180/0x1b0 > [] ip_local_deliver+0xf4/0x1f0 > [] ip_rcv+0x33b/0x650 > [] netif_receive_skb+0x27a/0x340 > [] process_backlog+0x9d/0x100 > [] net_rx_action+0x18d/0x250 > [] __do_softirq+0x75/0x100 > [] call_softirq+0x1c/0x30 > [] do_softirq+0x65/0xa0 > [] irq_exit+0x97/0xa0 > [] do_IRQ+0xa8/0x130 > [] ? mwait_idle+0x0/0x60 > [] ret_from_intr+0x0/0xf > [] ? mwait_idle+0x4c/0x60 > [] ? mwait_idle+0x43/0x60 > [] ? cpu_idle+0x57/0xa0 > [] ? rest_init+0x70/0x80 > Code: 10 5b 41 5c 41 5d 41 5e c9 c3 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 53 > 48 83 ec 08 48 8b 9f 78 01 00 00 e8 2b c7 f1 ff 89 c0 <48> 8b 04 c3 48 83 c4 08 > 5b c9 c3 66 66 66 66 66 2e 0f 1f 84 00 > RIP [] icmp_sk+0x17/0x30 > RSP > CR2: 0000000000000000 > ---[ end trace ea161157b76b33e8 ]--- > Kernel panic - not syncing: Aiee, killing interrupt handler! Receiving packets while we are cleaning up a network namespace is a racy proposition. It is possible when the packet arrives that we have removed some but not all of the state we need to fully process it. We have the choice of either playing wack-a-mole with the cleanup routines or simply dropping packets when we don't have a network namespace to handle them. Since the check looks inexpensive in netif_receive_skb let's just drop the incoming packets. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 11 +++++++++++ net/core/dev.c | 4 ++++ net/core/net_namespace.c | 3 +++ 3 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index aa540e6be502..d9dd0f707296 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -95,6 +95,11 @@ extern struct list_head net_namespace_list; #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); +static inline int net_alive(struct net *net) +{ + return net && atomic_read(&net->count); +} + static inline struct net *get_net(struct net *net) { atomic_inc(&net->count); @@ -125,6 +130,12 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } #else + +static inline int net_alive(struct net *net) +{ + return 1; +} + static inline struct net *get_net(struct net *net) { return net; diff --git a/net/core/dev.c b/net/core/dev.c index 68d8df0992ab..c421a1f8f0b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2077,6 +2077,10 @@ int netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); + /* Don't receive packets in an exiting network namespace */ + if (!net_alive(dev_net(skb->dev))) + goto out; + #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 72b4c184dd84..7c52fe277b62 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -140,6 +140,9 @@ static void cleanup_net(struct work_struct *work) struct pernet_operations *ops; struct net *net; + /* Be very certain incoming network packets will not find us */ + rcu_barrier(); + net = container_of(work, struct net, work); mutex_lock(&net_mutex); -- cgit v1.2.3 From 71c2742f5e6348d76ee62085cf0a13e5eff0f00e Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Sat, 21 Jun 2008 19:01:02 +0200 Subject: Add return value to reserve_bootmem_node() This patch changes the function reserve_bootmem_node() from void to int, returning -ENOMEM if the allocation fails. This fixes a build problem on x86 with CONFIG_KEXEC=y and CONFIG_NEED_MULTIPLE_NODES=y Signed-off-by: Bernhard Walle Reported-by: Adrian Bunk Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 2 +- mm/bootmem.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 6a5dbdc8a7dc..686895bacd9d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -94,7 +94,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn); -extern void reserve_bootmem_node(pg_data_t *pgdat, +extern int reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags); diff --git a/mm/bootmem.c b/mm/bootmem.c index e8fb927392b9..8d9f60e06f62 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -442,15 +442,17 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, +int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { int ret; ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); if (ret < 0) - return; + return -ENOMEM; reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + + return 0; } void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, -- cgit v1.2.3 From 36c7343b4ecac2432430f5393314f1bdc2c219a5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 23 Jun 2008 12:06:52 +0100 Subject: tty_driver: Update required method documentation Some of the requirement rules are now more relaxed. Also correct a contradiction in the previous update Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/tty_driver.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 59f1c0bd8f9c..d2a003586761 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -27,8 +27,7 @@ * This routine is called by the kernel to write a series of * characters to the tty device. The characters may come from * user space or kernel space. This routine will return the - * number of characters actually accepted for writing. This - * routine is mandatory. + * number of characters actually accepted for writing. * * Optional: Required for writable devices. * @@ -134,7 +133,7 @@ * This routine notifies the tty driver that it should hangup the * tty device. * - * Required: + * Optional: * * void (*break_ctl)(struct tty_stuct *tty, int state); * -- cgit v1.2.3 From 72c6e251ed84b3a9cdfde6711191155c47bb2b9c Mon Sep 17 00:00:00 2001 From: Thorsten Kranzkowski Date: Mon, 23 Jun 2008 20:57:22 +0000 Subject: alpha: fix compile error in arch/alpha/mm/init.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9267b4b3880d00dc2dab90f1d817c856939114f7 ("alpha: fix module load failures on smp (bug #10926)") causes a regression for my ev4 uniprocessor build: CC arch/alpha/mm/init.o /export/data/repositories/linux-2.6/arch/alpha/mm/init.c:34: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘typeof’ make[2]: *** [arch/alpha/mm/init.o] Error 1 make[1]: *** [arch/alpha/mm] Error 2 make: *** [sub-make] Error 2 This fixes it for me (compile and boot tested): Signed-off-by: Thorsten Kranzkowski Acked-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- include/asm-alpha/percpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 82e8a94b4b2f..3495e8e00d70 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) +#define PER_CPU_ATTRIBUTES + #endif /* SMP */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) -- cgit v1.2.3 From 06e05645661211b9eaadaf6344c335d2e80f0ba2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 6 Jun 2008 16:37:36 -0300 Subject: KVM: close timer injection race window in __vcpu_run If a timer fires after kvm_inject_pending_timer_irqs() but before local_irq_disable() the code will enter guest mode and only inject such timer interrupt the next time an unrelated event causes an exit. It would be simpler if the timer->pending irq conversion could be done with IRQ's disabled, so that the above problem cannot happen. For now introduce a new vcpu requests bit to cancel guest entry. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 9 ++++++--- arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f2f5d260874e..3829aa7b663f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; - wake_up_interruptible(&vcpu0->wq); + if (vcpu0) { + set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (waitqueue_active(&vcpu0->wq)) { + vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + wake_up_interruptible(&vcpu0->wq); + } } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c297c50eba63..ebc03f5ae162 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); + set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); if (waitqueue_active(q)) { apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(q); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90744a1dc3a..b08812d6b34c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2774,6 +2774,7 @@ again: } } + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); kvm_inject_pending_timer_irqs(vcpu); preempt_disable(); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 092b1b25291d..de9d1df4bba2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -33,6 +33,7 @@ #define KVM_REQ_REPORT_TPR_ACCESS 2 #define KVM_REQ_MMU_RELOAD 3 #define KVM_REQ_TRIPLE_FAULT 4 +#define KVM_REQ_PENDING_TIMER 5 struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; -- cgit v1.2.3 From 28499143933f19b28008a556ed59255d6009391a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 9 May 2008 12:05:57 +0100 Subject: xen: remove support for non-PAE 32-bit Non-PAE operation has been deprecated in Xen for a while, and is rarely tested or used. xen-unstable has now officially dropped non-PAE support. Since Xen/pvops' non-PAE support has also been broken for a while, we may as well completely drop it altogether. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/enlighten.c | 51 ++++++++++++++++------------------------------ arch/x86/xen/mmu.c | 19 ++--------------- arch/x86/xen/mmu.h | 24 ++++++---------------- arch/x86/xen/xen-head.S | 4 ---- include/asm-x86/xen/page.h | 4 ---- 6 files changed, 27 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737e..525b108411bd 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c048de34d6a1..f09c1c69c37a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int i; /* special set_pte for pagetable initialization */ pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. + * copy top-level of Xen-supplied pagetable into place. This + * is a stand-in while we copy the pmd pages. */ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); - make_lowmem_page_readonly(pmd); + make_lowmem_page_readonly(pmd); - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); } /* make sure zero_page is mapped RO so we can use it in pagetables */ @@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ - { - unsigned level; - -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); - } + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } /* This is called once we have the cpu_possible_map */ @@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, -#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, .set_pud = xen_set_pud, @@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, -#endif /* PAE */ .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 265601d5a6ae..df40bf74ea75 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -228,7 +228,7 @@ pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } -#ifdef CONFIG_X86_PAE + void xen_set_pud(pud_t *ptr, pud_t val) { struct multicall_space mcs; @@ -276,12 +276,6 @@ pmd_t xen_make_pmd(pmdval_t pmd) pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} -#endif /* CONFIG_X86_PAE */ /* (Yet another) pagetable walker. This one is intended for pinning a @@ -434,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - unsigned level; - xen_mc_batch(); if (pgd_walk(pgd, pin_page, TASK_SIZE)) { @@ -445,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } -#ifdef CONFIG_X86_PAE - level = MMUEXT_PIN_L3_TABLE; -#else - level = MMUEXT_PIN_L2_TABLE; -#endif - - xen_do_pin(level, PFN_DOWN(__pa(pgd))); - + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index b5e189b1519d..5fe961caffd4 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm); void xen_pgd_pin(pgd_t *pgd); //void xen_pgd_unpin(pgd_t *pgd); -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); +pteval_t xen_pte_val(pte_t); +pmdval_t xen_pmd_val(pmd_t); +pgdval_t xen_pgd_val(pgd_t); -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); +pte_t xen_make_pte(pteval_t); +pmd_t xen_make_pmd(pmdval_t); +pgd_t xen_make_pgd(pgdval_t); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); @@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val); void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_pmd_clear(pmd_t *pmdp); - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif - #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 3175e973fd0d..6ec3b4f7719b 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -30,11 +30,7 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") -#endif ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") #endif /*CONFIG_XEN */ diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index baf3a4dce28c..e11f24038b1d 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -150,13 +150,9 @@ static inline pte_t __pte_ma(pteval_t x) return (pte_t) { .pte = x }; } -#ifdef CONFIG_X86_PAE #define pmd_val_ma(v) ((v).pmd) #define pud_val_ma(v) ((v).pgd.pgd) #define __pmd_ma(x) ((pmd_t) { (x) } ) -#else /* !X86_PAE */ -#define pmd_val_ma(v) ((v).pud.pgd.pgd) -#endif /* CONFIG_X86_PAE */ #define pgd_val_ma(x) ((x).pgd) -- cgit v1.2.3 From 7af192c954017499ec163bc9dbaaee2e593d7ef2 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:29 +0200 Subject: x86: Add structs and functions for paravirt clocksource This patch adds structs for the paravirt clocksource ABI used by both xen and kvm (pvclock-abi.h). It also adds some helper functions to read system time and wall clock time from a paravirtual clocksource (pvclock.[ch]). They are based on the xen code. They are enabled using CONFIG_PARAVIRT_CLOCK. Subsequent patches of this series will put the code in use. Signed-off-by: Gerd Hoffmann Acked-by: Jeremy Fitzhardinge Signed-off-by: Avi Kivity --- arch/x86/Kconfig | 4 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/pvclock.c | 141 ++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/pvclock-abi.h | 42 +++++++++++++ include/asm-x86/pvclock.h | 13 ++++ 5 files changed, 201 insertions(+) create mode 100644 arch/x86/kernel/pvclock.c create mode 100644 include/asm-x86/pvclock-abi.h create mode 100644 include/asm-x86/pvclock.h (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52e18e6d2ba0..f94bca6ff47f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -410,6 +410,10 @@ config PARAVIRT over full virtualization. However, when run without a hypervisor the kernel is theoretically slower and slightly larger. +config PARAVIRT_CLOCK + bool + default n + endif config MEMTEST_BOOTPARAM diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..77807d4769c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000000..05fbe9a0325a --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,141 @@ +/* paravirtual clock -- common code used by kvm/xen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include + +/* + * These are perodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = native_read_tsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct pvclock_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) || (dst->version != src->version)); + + return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + struct pvclock_shadow_time shadow; + unsigned version; + cycle_t ret, offset; + + do { + version = pvclock_get_time_values(&shadow, src); + barrier(); + offset = pvclock_get_nsec_offset(&shadow); + ret = shadow.system_timestamp + offset; + barrier(); + } while (version != src->version); + + return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} diff --git a/include/asm-x86/pvclock-abi.h b/include/asm-x86/pvclock-abi.h new file mode 100644 index 000000000000..6857f840b243 --- /dev/null +++ b/include/asm-x86/pvclock-abi.h @@ -0,0 +1,42 @@ +#ifndef _ASM_X86_PVCLOCK_ABI_H_ +#define _ASM_X86_PVCLOCK_ABI_H_ +#ifndef __ASSEMBLY__ + +/* + * These structs MUST NOT be changed. + * They are the ABI between hypervisor and guest OS. + * Both Xen and KVM are using this. + * + * pvclock_vcpu_time_info holds the system time and the tsc timestamp + * of the last update. So the guest can use the tsc delta to get a + * more precise system time. There is one per virtual cpu. + * + * pvclock_wall_clock references the point in time when the system + * time was zero (usually boot time), thus the guest calculates the + * current wall clock by adding the system time. + * + * Protocol for the "version" fields is: hypervisor raises it (making + * it uneven) before it starts updating the fields and raises it again + * (making it even) when it is done. Thus the guest can make sure the + * time values it got are consistent by checking the version before + * and after reading them. + */ + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 pad[3]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; +} __attribute__((__packed__)); + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PVCLOCK_ABI_H_ */ diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h new file mode 100644 index 000000000000..85b1bba8e0a3 --- /dev/null +++ b/include/asm-x86/pvclock.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_PVCLOCK_H_ +#define _ASM_X86_PVCLOCK_H_ + +#include +#include + +/* some helper functions for xen and kvm pv clock sources */ +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +void pvclock_read_wallclock(struct pvclock_wall_clock *wall, + struct pvclock_vcpu_time_info *vcpu, + struct timespec *ts); + +#endif /* _ASM_X86_PVCLOCK_H_ */ -- cgit v1.2.3 From 1c7b67f7576c4ca2a344379a4a29eec8fe8e7935 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:30 +0200 Subject: x86: Make xen use the paravirt clocksource structs and functions This patch updates the xen guest to use the pvclock structs and helper functions. Signed-off-by: Gerd Hoffmann Acked-by: Jeremy Fitzhardinge Signed-off-by: Avi Kivity --- arch/x86/xen/Kconfig | 1 + arch/x86/xen/time.c | 132 ++++---------------------------------------- include/xen/interface/xen.h | 7 +-- 3 files changed, 16 insertions(+), 124 deletions(-) (limited to 'include') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737e..3a4f16aea4bf 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,6 +5,7 @@ config XEN bool "Xen guest support" select PARAVIRT + select PARAVIRT_CLOCK depends on X86_32 depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) help diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 52b2e3856980..41e217503c96 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -31,17 +32,6 @@ static cycle_t xen_clocksource_read(void); -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -211,7 +201,7 @@ unsigned long long xen_sched_clock(void) unsigned long xen_cpu_khz(void) { u64 xen_khz = 1000000ULL << 32; - const struct vcpu_time_info *info = + const struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; do_div(xen_khz, info->tsc_to_system_mul); @@ -223,121 +213,26 @@ unsigned long xen_cpu_khz(void) return xen_khz; } -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ - struct vcpu_time_info *src; - struct shadow_time_info *dst; - - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ - src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} - static cycle_t xen_clocksource_read(void) { - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + struct pvclock_vcpu_time_info *src; cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); + src = &get_cpu_var(xen_vcpu)->time; + ret = pvclock_clocksource_read(src); + put_cpu_var(xen_vcpu); return ret; } static void xen_read_wallclock(struct timespec *ts) { - const struct shared_info *s = HYPERVISOR_shared_info; - u32 version; - u64 delta; - struct timespec now; - - /* get wallclock at system boot */ - do { - version = s->wc_version; - rmb(); /* fetch version before time */ - now.tv_sec = s->wc_sec; - now.tv_nsec = s->wc_nsec; - rmb(); /* fetch time before checking version */ - } while ((s->wc_version & 1) | (version ^ s->wc_version)); + struct shared_info *s = HYPERVISOR_shared_info; + struct pvclock_wall_clock *wall_clock = &(s->wc); + struct pvclock_vcpu_time_info *vcpu_time; - delta = xen_clocksource_read(); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; - - now.tv_nsec = do_div(delta, NSEC_PER_SEC); - now.tv_sec = delta; - - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + vcpu_time = &get_cpu_var(xen_vcpu)->time; + pvclock_read_wallclock(wall_clock, vcpu_time, ts); + put_cpu_var(xen_vcpu); } unsigned long xen_get_wallclock(void) @@ -345,7 +240,6 @@ unsigned long xen_get_wallclock(void) struct timespec ts; xen_read_wallclock(&ts); - return ts.tv_sec; } @@ -569,8 +463,6 @@ __init void xen_time_init(void) { int cpu = smp_processor_id(); - get_time_values_from_xen(); - clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 9b018da48cf3..819a0331cda9 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -10,6 +10,7 @@ #define __XEN_PUBLIC_XEN_H__ #include +#include /* * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS). @@ -336,7 +337,7 @@ struct vcpu_info { uint8_t evtchn_upcall_mask; unsigned long evtchn_pending_sel; struct arch_vcpu_info arch; - struct vcpu_time_info time; + struct pvclock_vcpu_time_info time; }; /* 64 bytes (x86) */ /* @@ -384,9 +385,7 @@ struct shared_info { * Wallclock time: updated only by control software. Guests should base * their gettimeofday() syscall on this wallclock-base value. */ - uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ - uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ - uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ + struct pvclock_wall_clock wc; struct arch_shared_info arch; -- cgit v1.2.3 From 50d0a0f987b83a8dadb1134d834e35ec410392b5 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:31 +0200 Subject: KVM: Make kvm host use the paravirt clocksource structs This patch updates the kvm host code to use the pvclock structs. It also makes the paravirt clock compatible with Xen. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 75 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/kvm_host.h | 4 ++- 2 files changed, 65 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b08812d6b34c..63a77caa59f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { static int version; - struct kvm_wall_clock wc; - struct timespec wc_ts; + struct pvclock_wall_clock wc; + struct timespec now, sys, boot; if (!wall_clock) return; @@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); - wc_ts = current_kernel_time(); - wc.wc_sec = wc_ts.tv_sec; - wc.wc_nsec = wc_ts.tv_nsec; - wc.wc_version = version; + /* + * The guest calculates current wall clock time by adding + * system time (updated by kvm_write_guest_time below) to the + * wall clock specified here. guest system time equals host + * system time for us, thus we must fill in host boot time here. + */ + now = current_kernel_time(); + ktime_get_ts(&sys); + boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + + wc.sec = boot.tv_sec; + wc.nsec = boot.tv_nsec; + wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static uint32_t div_frac(uint32_t dividend, uint32_t divisor) +{ + uint32_t quotient, remainder; + + /* Don't try to replace with do_div(), this one calculates + * "(dividend << 32) / divisor" */ + __asm__ ( "divl %4" + : "=a" (quotient), "=d" (remainder) + : "0" (0), "1" (dividend), "r" (divisor) ); + return quotient; +} + +static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +{ + uint64_t nsecs = 1000000000LL; + int32_t shift = 0; + uint64_t tps64; + uint32_t tps32; + + tps64 = tsc_khz * 1000LL; + while (tps64 > nsecs*2) { + tps64 >>= 1; + shift--; + } + + tps32 = (uint32_t)tps64; + while (tps32 <= (uint32_t)nsecs) { + tps32 <<= 1; + shift++; + } + + hv_clock->tsc_shift = shift; + hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + + pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", + __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + hv_clock->tsc_to_system_mul); +} + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) if ((!vcpu->time_page)) return; + if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) { + kvm_set_time_scale(tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = tsc_khz; + } + /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, @@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate - * state, we just write "2" at the end + * state, we just increase by 2 at the end. */ - vcpu->hv_clock.version = 2; + vcpu->hv_clock.version += 2; shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + sizeof(vcpu->hv_clock)); kunmap_atomic(shared_kaddr, KM_USER0); @@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - vcpu->arch.hv_clock.tsc_to_system_mul = - clocksource_khz2mult(tsc_khz, 22); - vcpu->arch.hv_clock.tsc_shift = 22; - down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 1d8cd01fa514..844f2a89afbc 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -18,6 +18,7 @@ #include #include +#include #include #define KVM_MAX_VCPUS 16 @@ -282,7 +283,8 @@ struct kvm_vcpu_arch { struct x86_emulate_ctxt emulate_ctxt; gpa_t time; - struct kvm_vcpu_time_info hv_clock; + struct pvclock_vcpu_time_info hv_clock; + unsigned int hv_clock_tsc_khz; unsigned int time_offset; struct page *time_page; }; -- cgit v1.2.3 From 6b1ed9086592fd4b066daae222751bb6757ca5eb Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:33 +0200 Subject: KVM: Remove now unused structs from kvm_para.h The kvm_* structs are obsoleted by the pvclock_* ones. Now all users have been switched over and the old structs can be dropped. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- include/asm-x86/kvm_para.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h index 509845942070..bfd9900742bf 100644 --- a/include/asm-x86/kvm_para.h +++ b/include/asm-x86/kvm_para.h @@ -48,24 +48,6 @@ struct kvm_mmu_op_release_pt { #ifdef __KERNEL__ #include -/* xen binary-compatible interface. See xen headers for details */ -struct kvm_vcpu_time_info { - uint32_t version; - uint32_t pad0; - uint64_t tsc_timestamp; - uint64_t system_time; - uint32_t tsc_to_system_mul; - int8_t tsc_shift; - int8_t pad[3]; -} __attribute__((__packed__)); /* 32 bytes */ - -struct kvm_wall_clock { - uint32_t wc_version; - uint32_t wc_sec; - uint32_t wc_nsec; -} __attribute__((__packed__)); - - extern void kvmclock_init(void); -- cgit v1.2.3